1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
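//
// Editorial illustration of the TODO above (not part of the original file).
// Assume the outstanding operations, in issue order, are L1 (LDS), S1 (SMEM),
// L2 (LDS), and the next instruction needs L1's result:
//
//   * LDS operations complete in order, so lgkmcnt(1) already guarantees L1
//     is done: the single operation still allowed to be outstanding can only
//     be L2 or S1, never L1 itself.
//   * Because SMEM and LDS events share one lgkmcnt timeline, the pass treats
//     the counter as out-of-order and conservatively emits
//     s_waitcnt lgkmcnt(0) instead.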
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39#include "llvm/IR/Dominators.h"
43
44using namespace llvm;
45
46#define DEBUG_TYPE "si-insert-waitcnts"
47
48DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
49 "Force emit s_waitcnt expcnt(0) instrs");
50DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
51 "Force emit s_waitcnt lgkmcnt(0) instrs");
52DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
53 "Force emit s_waitcnt vmcnt(0) instrs");
54
55static cl::opt<bool>
56 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
57 cl::desc("Force all waitcnt instrs to be emitted as "
58 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
59 cl::init(false), cl::Hidden);
60
62 "amdgpu-waitcnt-load-forcezero",
63 cl::desc("Force all waitcnt load counters to wait until 0"),
64 cl::init(false), cl::Hidden);
65
67 "amdgpu-expert-scheduling-mode",
68 cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
69 cl::init(false), cl::Hidden);
70
71namespace {
72// Class of object that encapsulates the latest instruction counter score
73// associated with an operand. Used for determining whether an
74// s_waitcnt instruction needs to be emitted.
75
76enum InstCounterType {
77 LOAD_CNT = 0, // VMcnt prior to gfx12.
78 DS_CNT, // LGKMcnt prior to gfx12.
79 EXP_CNT, //
80 STORE_CNT, // VScnt in gfx10/gfx11.
81 NUM_NORMAL_INST_CNTS,
82 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
83 BVH_CNT, // gfx12+ only.
84 KM_CNT, // gfx12+ only.
85 X_CNT, // gfx1250.
86 NUM_EXTENDED_INST_CNTS,
87 VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only.
88 VM_VSRC, // gfx12+ expert mode only.
89 NUM_EXPERT_INST_CNTS,
90 NUM_INST_CNTS = NUM_EXPERT_INST_CNTS
91};
92} // namespace
93
94namespace llvm {
95template <> struct enum_iteration_traits<InstCounterType> {
96 static constexpr bool is_iterable = true;
97};
98} // namespace llvm
99
100namespace {
101// Return an iterator over all counters between LOAD_CNT (the first counter)
102// and \c MaxCounter (exclusive, default value yields an enumeration over
103// all counters).
104auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
105 return enum_seq(LOAD_CNT, MaxCounter);
106}
107
108// Get the maximum wait count value for a given counter type.
109static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
110 InstCounterType T) {
111 switch (T) {
112 case LOAD_CNT:
113 return Limits.LoadcntMax;
114 case DS_CNT:
115 return Limits.DscntMax;
116 case EXP_CNT:
117 return Limits.ExpcntMax;
118 case STORE_CNT:
119 return Limits.StorecntMax;
120 case SAMPLE_CNT:
121 return Limits.SamplecntMax;
122 case BVH_CNT:
123 return Limits.BvhcntMax;
124 case KM_CNT:
125 return Limits.KmcntMax;
126 case X_CNT:
127 return Limits.XcntMax;
128 case VA_VDST:
129 return Limits.VaVdstMax;
130 case VM_VSRC:
131 return Limits.VmVsrcMax;
132 default:
133 return 0;
134 }
135}
136
137/// Integer IDs used to track vector memory locations we may have to wait on.
138/// Encoded as u16 chunks:
139///
140/// [0, REGUNITS_END ): MCRegUnit
141/// [LDSDMA_BEGIN, LDSDMA_END ) : LDS DMA IDs
142///
143/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
144/// It gives (1 << 16) entries per category, which is more than enough
145/// for all register units. MCPhysReg is u16 so we don't even support >u16
146/// physical register numbers at this time, let alone >u16 register units.
147/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
148/// is enough for all register units.
149using VMEMID = uint32_t;
150
151enum : VMEMID {
152 TRACKINGID_RANGE_LEN = (1 << 16),
153
154 // Important: MCRegUnits must always be tracked starting from 0, as we
155 // need to be able to convert between a MCRegUnit and a VMEMID freely.
156 REGUNITS_BEGIN = 0,
157 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
158
159 // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
160 // entry, which is updated for all LDS DMA operations encountered.
161 // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
162 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
163 LDSDMA_BEGIN = REGUNITS_END,
164 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
165};
166
167/// Convert a MCRegUnit to a VMEMID.
168static constexpr VMEMID toVMEMID(MCRegUnit RU) {
169 return static_cast<unsigned>(RU);
170}
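// Illustrative helpers (editorial sketch, not part of the original file):
// classifying a VMEMID according to the ranges documented above. Only the
// constants defined in this file are used; the helper names are hypothetical.
static constexpr bool isRegUnitVMEMID(VMEMID ID) {
  // Register units occupy the first u16 chunk, starting at 0.
  return ID < REGUNITS_END;
}
static constexpr bool isLDSDMAVMEMID(VMEMID ID) {
  // LDS DMA IDs occupy the second u16 chunk; LDSDMA_BEGIN is the common entry.
  return ID >= LDSDMA_BEGIN && ID < LDSDMA_END;
}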
171
172#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
173 DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \
174 DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
175 DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
176 DECL(GLOBAL_INV_ACCESS) /* GLOBAL_INV (gfx12+ only) */ \
177 DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
178 DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
179 DECL(VMEM_GROUP) /* vmem group */ \
180 DECL(LDS_ACCESS) /* lds read & write */ \
181 DECL(GDS_ACCESS) /* gds read & write */ \
182 DECL(SQ_MESSAGE) /* send message */ \
183 DECL(SCC_WRITE) /* write to SCC from barrier */ \
184 DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
185 DECL(SMEM_GROUP) /* scalar-memory group */ \
186 DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
187 DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
188 DECL(EXP_POS_ACCESS) /* write to export position */ \
189 DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \
190 DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \
191 DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */ \
192 DECL(VGPR_CSMACC_WRITE) /* write VGPR dest in Core/Side-MACC VALU */ \
193 DECL(VGPR_DPMACC_WRITE) /* write VGPR dest in DPMACC VALU */ \
194 DECL(VGPR_TRANS_WRITE) /* write VGPR dest in TRANS VALU */ \
195 DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */ \
196 DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \
197 DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \
198 DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */
199
200// clang-format off
201#define AMDGPU_EVENT_ENUM(Name) Name,
202enum WaitEventType {
203 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
204 NUM_WAIT_EVENTS
205};
206#undef AMDGPU_EVENT_ENUM
207
208#define AMDGPU_EVENT_NAME(Name) #Name,
209static constexpr StringLiteral WaitEventTypeName[] = {
210 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
211};
212#undef AMDGPU_EVENT_NAME
213// clang-format on
214
215// Enumerate different types of result-returning VMEM operations. Although
216// s_waitcnt orders them all with a single vmcnt counter, in the absence of
217// s_waitcnt only instructions of the same VmemType are guaranteed to write
218// their results in order -- so there is no need to insert an s_waitcnt between
219// two instructions of the same type that write the same vgpr.
220enum VmemType {
221 // BUF instructions and MIMG instructions without a sampler.
222 VMEM_NOSAMPLER,
223 // MIMG instructions with a sampler.
224 VMEM_SAMPLER,
225 // BVH instructions
226 VMEM_BVH,
227 NUM_VMEM_TYPES
228};
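// Editorial example (not part of the original file) of the guarantee described
// above: two VMEM_SAMPLER loads that write the same VGPR need no s_waitcnt
// between them, because same-type results return in order. A VMEM_SAMPLER load
// followed by a VMEM_NOSAMPLER load writing the same VGPR does need one,
// because results of different VmemTypes may land out of order.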
229
230// Maps values of InstCounterType to the instruction that waits on that
231// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
232// returns true, and does not cover VA_VDST or VM_VSRC.
233static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
234 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
235 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
236 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
237
238static bool updateVMCntOnly(const MachineInstr &Inst) {
239 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
241}
242
243#ifndef NDEBUG
244static bool isNormalMode(InstCounterType MaxCounter) {
245 return MaxCounter == NUM_NORMAL_INST_CNTS;
246}
247#endif // NDEBUG
248
249VmemType getVmemType(const MachineInstr &Inst) {
250 assert(updateVMCntOnly(Inst));
251 if (!SIInstrInfo::isImage(Inst))
252 return VMEM_NOSAMPLER;
254 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
256
257 if (BaseInfo->BVH)
258 return VMEM_BVH;
259
260 // We have to make an additional check for isVSAMPLE here since some
261 // instructions don't have a sampler, but are still classified as sampler
262 // instructions for the purposes of e.g. waitcnt.
263 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
264 return VMEM_SAMPLER;
265
266 return VMEM_NOSAMPLER;
267}
268
269unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
270 switch (T) {
271 case LOAD_CNT:
272 return Wait.LoadCnt;
273 case EXP_CNT:
274 return Wait.ExpCnt;
275 case DS_CNT:
276 return Wait.DsCnt;
277 case STORE_CNT:
278 return Wait.StoreCnt;
279 case SAMPLE_CNT:
280 return Wait.SampleCnt;
281 case BVH_CNT:
282 return Wait.BvhCnt;
283 case KM_CNT:
284 return Wait.KmCnt;
285 case X_CNT:
286 return Wait.XCnt;
287 case VA_VDST:
288 return Wait.VaVdst;
289 case VM_VSRC:
290 return Wait.VmVsrc;
291 default:
292 llvm_unreachable("bad InstCounterType");
293 }
294}
295
296void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
297 unsigned &WC = getCounterRef(Wait, T);
298 WC = std::min(WC, Count);
299}
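// Editorial note (not part of the original file): addWait keeps the stricter
// requirement. Combining an existing loadcnt(3) with a new request of
// loadcnt(1) leaves Wait.LoadCnt == 1, since a smaller count waits for more
// outstanding operations to retire; ~0u means "no wait needed" for a counter.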
300
301void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
302 getCounterRef(Wait, T) = ~0u;
303}
304
305unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
306 return getCounterRef(Wait, T);
307}
308
309// Mapping from event to counter according to the table masks.
310InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
311 for (auto T : inst_counter_types()) {
312 if (masks[T] & (1 << E))
313 return T;
314 }
315 llvm_unreachable("event type has no associated counter");
316}
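// Editorial example (not part of the original file): with the pre-gfx12 mask
// table defined in WaitcntGeneratorPreGFX12::getWaitEventMask below,
// masks[DS_CNT] has the SMEM_ACCESS bit set, so
// eventCounter(masks, SMEM_ACCESS) returns DS_CNT, i.e. scalar memory
// accesses are tracked on the legacy lgkmcnt timeline.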
317
318class WaitcntBrackets;
319
320// This abstracts the logic for generating and updating S_WAIT* instructions
321// away from the analysis that determines where they are needed. This was
322// done because the set of counters and instructions for waiting on them
323// underwent a major shift with gfx12, sufficiently so that having this
324// abstraction keeps the main analysis logic simpler than it would
325// otherwise have to be.
326class WaitcntGenerator {
327protected:
328 const GCNSubtarget *ST = nullptr;
329 const SIInstrInfo *TII = nullptr;
330 AMDGPU::IsaVersion IV;
331 InstCounterType MaxCounter;
332 bool OptNone;
333 bool ExpandWaitcntProfiling = false;
334 const AMDGPU::HardwareLimits *Limits = nullptr;
335
336public:
337 WaitcntGenerator() = default;
338 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
339 const AMDGPU::HardwareLimits *Limits)
340 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
341 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
342 OptNone(MF.getFunction().hasOptNone() ||
343 MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
344 ExpandWaitcntProfiling(
345 MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
346 Limits(Limits) {}
347
348 // Return true if the current function should be compiled with no
349 // optimization.
350 bool isOptNone() const { return OptNone; }
351
352 const AMDGPU::HardwareLimits &getLimits() const { return *Limits; }
353
354 // Edits an existing sequence of wait count instructions according
355 // to an incoming Waitcnt value, which is itself updated to reflect
356 // any new wait count instructions which may need to be generated by
357 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
358 // were made.
359 //
360 // This editing will usually merely update operands, but it may also
361 // delete instructions if the incoming Wait value indicates they are not
362 // needed. It may also remove existing instructions for which a wait
363 // is needed if it can be determined that it is better to generate new
364 // instructions later, as can happen on gfx12.
365 virtual bool
366 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
367 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
368 MachineBasicBlock::instr_iterator It) const = 0;
369
370 // Transform a soft waitcnt into a normal one.
371 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
372
373 // Generates new wait count instructions according to the value of
374 // Wait, returning true if any new instructions were created.
375 // If ScoreBrackets is provided, it can be used for profiling expansion.
376 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
377 MachineBasicBlock::instr_iterator It,
378 AMDGPU::Waitcnt Wait,
379 WaitcntBrackets *ScoreBrackets = nullptr) = 0;
380
381 // Returns an array of bit masks which can be used to map values in
382 // WaitEventType to corresponding counter values in InstCounterType.
383 virtual const unsigned *getWaitEventMask() const = 0;
384
385 // Returns a new waitcnt with all counters except VScnt set to 0. If
386 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
387 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
388
389 virtual ~WaitcntGenerator() = default;
390
391 // Create a mask value from the initializer list of wait event types.
392 static constexpr unsigned
393 eventMask(std::initializer_list<WaitEventType> Events) {
394 unsigned Mask = 0;
395 for (auto &E : Events)
396 Mask |= 1 << E;
397
398 return Mask;
399 }
400};
401
402class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
403public:
404 WaitcntGeneratorPreGFX12() = default;
405 WaitcntGeneratorPreGFX12(const MachineFunction &MF,
406 const AMDGPU::HardwareLimits *Limits)
407 : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS, Limits) {}
408
409 bool
410 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
411 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
412 MachineBasicBlock::instr_iterator It) const override;
413
414 bool createNewWaitcnt(MachineBasicBlock &Block,
415 MachineBasicBlock::instr_iterator It,
416 AMDGPU::Waitcnt Wait,
417 WaitcntBrackets *ScoreBrackets = nullptr) override;
418
419 const unsigned *getWaitEventMask() const override {
420 assert(ST);
421
422 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
423 eventMask(
424 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
425 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
426 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
427 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
428 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
429 0,
430 0,
431 0,
432 0,
433 0,
434 0};
435
436 return WaitEventMaskForInstPreGFX12;
437 }
438
439 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
440};
441
442class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
443protected:
444 bool IsExpertMode;
445
446public:
447 WaitcntGeneratorGFX12Plus() = default;
448 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
449 InstCounterType MaxCounter,
450 const AMDGPU::HardwareLimits *Limits,
451 bool IsExpertMode)
452 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
453
454 bool
455 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
456 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
457 MachineBasicBlock::instr_iterator It) const override;
458
459 bool createNewWaitcnt(MachineBasicBlock &Block,
460 MachineBasicBlock::instr_iterator It,
461 AMDGPU::Waitcnt Wait,
462 WaitcntBrackets *ScoreBrackets = nullptr) override;
463
464 const unsigned *getWaitEventMask() const override {
465 assert(ST);
466
467 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
468 eventMask({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
469 eventMask({LDS_ACCESS, GDS_ACCESS}),
470 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
471 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
472 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
473 eventMask({VMEM_SAMPLER_READ_ACCESS}),
474 eventMask({VMEM_BVH_READ_ACCESS}),
475 eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
476 eventMask({VMEM_GROUP, SMEM_GROUP}),
477 eventMask({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
478 VGPR_XDL_WRITE}),
479 eventMask({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
480
481 return WaitEventMaskForInstGFX12Plus;
482 }
483
484 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
485};
486
487// Flags indicating which counters should be flushed in a loop preheader.
488struct PreheaderFlushFlags {
489 bool FlushVmCnt = false;
490 bool FlushDsCnt = false;
491};
492
493class SIInsertWaitcnts {
494public:
495 const GCNSubtarget *ST;
496 const SIInstrInfo *TII = nullptr;
497 const SIRegisterInfo *TRI = nullptr;
498 const MachineRegisterInfo *MRI = nullptr;
499 InstCounterType SmemAccessCounter;
500 InstCounterType MaxCounter;
501 bool IsExpertMode = false;
502 const unsigned *WaitEventMaskForInst;
503
504private:
505 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
506 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
507 MachineLoopInfo *MLI;
508 MachinePostDominatorTree *PDT;
509 AliasAnalysis *AA = nullptr;
510
511 struct BlockInfo {
512 std::unique_ptr<WaitcntBrackets> Incoming;
513 bool Dirty = true;
514 };
515
516 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
517
518 bool ForceEmitWaitcnt[NUM_INST_CNTS];
519
520 // In any given run of this pass, WCG will point to one of these two
521 // generator objects, which must have been re-initialised before use
522 // from a value made using a subtarget constructor.
523 WaitcntGeneratorPreGFX12 WCGPreGFX12;
524 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
525
526 WaitcntGenerator *WCG = nullptr;
527
528 // Remember call and return instructions in the function.
529 DenseSet<MachineInstr *> CallInsts;
530 DenseSet<MachineInstr *> ReturnInsts;
531
532 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
533 // message.
534 DenseSet<MachineInstr *> ReleaseVGPRInsts;
535
536 AMDGPU::HardwareLimits Limits;
537
538public:
539 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
540 AliasAnalysis *AA)
541 : MLI(MLI), PDT(PDT), AA(AA) {
542 (void)ForceExpCounter;
543 (void)ForceLgkmCounter;
544 (void)ForceVMCounter;
545 }
546
547 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
548
549 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
550 const WaitcntBrackets &Brackets);
551 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
552 const WaitcntBrackets &ScoreBrackets);
553 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
554 bool isDSRead(const MachineInstr &MI) const;
555 bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
556 bool run(MachineFunction &MF);
557
558 void setForceEmitWaitcnt() {
559// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
560// For debug builds, get the debug counter info and adjust if need be
561#ifndef NDEBUG
562 if (DebugCounter::isCounterSet(ForceExpCounter) &&
563 DebugCounter::shouldExecute(ForceExpCounter)) {
564 ForceEmitWaitcnt[EXP_CNT] = true;
565 } else {
566 ForceEmitWaitcnt[EXP_CNT] = false;
567 }
568
569 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
570 DebugCounter::shouldExecute(ForceLgkmCounter)) {
571 ForceEmitWaitcnt[DS_CNT] = true;
572 ForceEmitWaitcnt[KM_CNT] = true;
573 } else {
574 ForceEmitWaitcnt[DS_CNT] = false;
575 ForceEmitWaitcnt[KM_CNT] = false;
576 }
577
578 if (DebugCounter::isCounterSet(ForceVMCounter) &&
579 DebugCounter::shouldExecute(ForceVMCounter)) {
580 ForceEmitWaitcnt[LOAD_CNT] = true;
581 ForceEmitWaitcnt[SAMPLE_CNT] = true;
582 ForceEmitWaitcnt[BVH_CNT] = true;
583 } else {
584 ForceEmitWaitcnt[LOAD_CNT] = false;
585 ForceEmitWaitcnt[SAMPLE_CNT] = false;
586 ForceEmitWaitcnt[BVH_CNT] = false;
587 }
588
589 ForceEmitWaitcnt[VA_VDST] = false;
590 ForceEmitWaitcnt[VM_VSRC] = false;
591#endif // NDEBUG
592 }
593
594 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
595 // instruction.
596 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
597 switch (Inst.getOpcode()) {
598 // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
599 case AMDGPU::GLOBAL_INV:
600 return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
601 // VGPRs
602 case AMDGPU::GLOBAL_WB:
603 case AMDGPU::GLOBAL_WBINV:
604 return VMEM_WRITE_ACCESS; // tracked using storecnt
605 default:
606 break;
607 }
608
609 // Maps VMEM access types to their corresponding WaitEventType.
610 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
611 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
612
614 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
615 // these should use VM_CNT.
616 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
617 return VMEM_ACCESS;
618 if (Inst.mayStore() &&
619 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
620 if (TII->mayAccessScratch(Inst))
621 return SCRATCH_WRITE_ACCESS;
622 return VMEM_WRITE_ACCESS;
623 }
624 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
625 return VMEM_ACCESS;
626 return VmemReadMapping[getVmemType(Inst)];
627 }
628
629 std::optional<WaitEventType>
630 getExpertSchedulingEventType(const MachineInstr &Inst) const;
631
632 bool isVmemAccess(const MachineInstr &MI) const;
633 bool generateWaitcntInstBefore(MachineInstr &MI,
634 WaitcntBrackets &ScoreBrackets,
635 MachineInstr *OldWaitcntInstr,
636 PreheaderFlushFlags FlushFlags);
637 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
638 MachineBasicBlock::instr_iterator It,
639 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
640 MachineInstr *OldWaitcntInstr);
641 void updateEventWaitcntAfter(MachineInstr &Inst,
642 WaitcntBrackets *ScoreBrackets);
643 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
644 MachineBasicBlock *Block) const;
645 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
646 WaitcntBrackets &ScoreBrackets);
647 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
648 WaitcntBrackets &ScoreBrackets);
649 void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
650 bool ExpertMode) const;
651};
652
653// This object maintains the current score brackets of each wait counter, and
654// a per-register scoreboard for each wait counter.
655//
656// We also maintain the latest score for every event type that can change the
657// waitcnt in order to know if there are multiple types of events within
658// the brackets. When multiple types of event happen in the bracket,
659// wait count may get decreased out of order, therefore we need to put in
660// "s_waitcnt 0" before use.
661class WaitcntBrackets {
662public:
663 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
664 assert(Context->TRI->getNumRegUnits() < REGUNITS_END);
665 }
666
667#ifndef NDEBUG
668 ~WaitcntBrackets() {
669 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
670 for (auto &[ID, Val] : VMem) {
671 if (Val.empty())
672 ++NumUnusedVmem;
673 }
674 for (auto &[ID, Val] : SGPRs) {
675 if (Val.empty())
676 ++NumUnusedSGPRs;
677 }
678
679 if (NumUnusedVmem || NumUnusedSGPRs) {
680 errs() << "WaitcntBracket had unused entries at destruction time: "
681 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
682 << " SGPR unused entries\n";
683 std::abort();
684 }
685 }
686#endif
687
688 bool isSmemCounter(InstCounterType T) const {
689 return T == Context->SmemAccessCounter || T == X_CNT;
690 }
691
692 unsigned getSgprScoresIdx(InstCounterType T) const {
693 assert(isSmemCounter(T) && "Invalid SMEM counter");
694 return T == X_CNT ? 1 : 0;
695 }
696
697 unsigned getScoreLB(InstCounterType T) const {
698 assert(T < NUM_INST_CNTS);
699 return ScoreLBs[T];
700 }
701
702 unsigned getScoreUB(InstCounterType T) const {
703 assert(T < NUM_INST_CNTS);
704 return ScoreUBs[T];
705 }
706
707 unsigned getScoreRange(InstCounterType T) const {
708 return getScoreUB(T) - getScoreLB(T);
709 }
710
711 unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
712 auto It = SGPRs.find(RU);
713 return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0;
714 }
715
716 unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
717 auto It = VMem.find(TID);
718 return It != VMem.end() ? It->second.Scores[T] : 0;
719 }
720
721 bool merge(const WaitcntBrackets &Other);
722
723 bool counterOutOfOrder(InstCounterType T) const;
724 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
725 simplifyWaitcnt(Wait, Wait);
726 }
727 void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
728 AMDGPU::Waitcnt &UpdateWait) const;
729 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
730 void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
731 AMDGPU::Waitcnt &UpdateWait) const;
732 void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
733 AMDGPU::Waitcnt &UpdateWait) const;
734
735 void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
736 AMDGPU::Waitcnt &Wait) const;
737 void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
738 AMDGPU::Waitcnt &Wait) const;
739 void tryClearSCCWriteEvent(MachineInstr *Inst);
740
741 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
742 void applyWaitcnt(InstCounterType T, unsigned Count);
743 void updateByEvent(WaitEventType E, MachineInstr &MI);
744
745 unsigned hasPendingEvent() const { return PendingEvents; }
746 unsigned hasPendingEvent(WaitEventType E) const {
747 return PendingEvents & (1 << E);
748 }
749 unsigned hasPendingEvent(InstCounterType T) const {
750 unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
751 assert((HasPending != 0) == (getScoreRange(T) != 0));
752 return HasPending;
753 }
754
755 bool hasMixedPendingEvents(InstCounterType T) const {
756 unsigned Events = hasPendingEvent(T);
757 // Return true if more than one bit is set in Events.
758 return Events & (Events - 1);
759 }
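// Editorial note (not part of the original file): Events & (Events - 1)
// clears the lowest set bit, so the result is nonzero exactly when two or
// more event bits are set, e.g. 0b0110 -> 0b0100 (mixed) but 0b0100 -> 0.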
760
761 bool hasPendingFlat() const {
762 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
763 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
764 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
765 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
766 }
767
768 void setPendingFlat() {
769 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
770 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
771 }
772
773 bool hasPendingGDS() const {
774 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
775 }
776
777 unsigned getPendingGDSWait() const {
778 return std::min(getScoreUB(DS_CNT) - LastGDS,
779 getWaitCountMax(Context->getLimits(), DS_CNT) - 1);
780 }
781
782 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
783
784 // Return true if there might be pending writes to any register unit of the
785 // given VGPR by VMEM instructions with types different from V.
786 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
787 for (MCRegUnit RU : regunits(Reg)) {
788 auto It = VMem.find(toVMEMID(RU));
789 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
790 return true;
791 }
792 return false;
793 }
794
795 void clearVgprVmemTypes(MCPhysReg Reg) {
796 for (MCRegUnit RU : regunits(Reg)) {
797 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
798 It->second.VMEMTypes = 0;
799 if (It->second.empty())
800 VMem.erase(It);
801 }
802 }
803 }
804
805 void setStateOnFunctionEntryOrReturn() {
806 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) +
807 getWaitCountMax(Context->getLimits(), STORE_CNT));
808 PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
809 }
810
811 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
812 return LDSDMAStores;
813 }
814
815 bool hasPointSampleAccel(const MachineInstr &MI) const;
816 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
817 MCPhysReg RU) const;
818
819 void print(raw_ostream &) const;
820 void dump() const { print(dbgs()); }
821
822 // Free up memory by removing empty entries from the DenseMaps that track
823 // event scores.
824 void purgeEmptyTrackingData();
825
826private:
827 struct MergeInfo {
828 unsigned OldLB;
829 unsigned OtherLB;
830 unsigned MyShift;
831 unsigned OtherShift;
832 };
833
834 void determineWaitForScore(InstCounterType T, unsigned Score,
835 AMDGPU::Waitcnt &Wait) const;
836
837 static bool mergeScore(const MergeInfo &M, unsigned &Score,
838 unsigned OtherScore);
839
841 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
842 if (!Context->TRI->isInAllocatableClass(Reg))
843 return {{}, {}};
844 const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
845 unsigned Size = Context->TRI->getRegSizeInBits(*RC);
846 if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
847 Reg = Context->TRI->get32BitRegister(Reg);
848 return Context->TRI->regunits(Reg);
849 }
850
851 void setScoreLB(InstCounterType T, unsigned Val) {
852 assert(T < NUM_INST_CNTS);
853 ScoreLBs[T] = Val;
854 }
855
856 void setScoreUB(InstCounterType T, unsigned Val) {
857 assert(T < NUM_INST_CNTS);
858 ScoreUBs[T] = Val;
859
860 if (T != EXP_CNT)
861 return;
862
863 if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT))
864 ScoreLBs[EXP_CNT] =
865 ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT);
866 }
867
868 void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
869 const SIRegisterInfo *TRI = Context->TRI;
870 if (Reg == AMDGPU::SCC) {
871 SCCScore = Val;
872 } else if (TRI->isVectorRegister(*Context->MRI, Reg)) {
873 for (MCRegUnit RU : regunits(Reg))
874 VMem[toVMEMID(RU)].Scores[T] = Val;
875 } else if (TRI->isSGPRReg(*Context->MRI, Reg)) {
876 auto STy = getSgprScoresIdx(T);
877 for (MCRegUnit RU : regunits(Reg))
878 SGPRs[RU].Scores[STy] = Val;
879 } else {
880 llvm_unreachable("Register cannot be tracked/unknown register!");
881 }
882 }
883
884 void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
885 VMem[TID].Scores[T] = Val;
886 }
887
888 void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,
889 unsigned Val);
890
891 const SIInsertWaitcnts *Context;
892
893 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
894 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
895 unsigned PendingEvents = 0;
896 // Remember the last flat memory operation.
897 unsigned LastFlat[NUM_INST_CNTS] = {0};
898 // Remember the last GDS operation.
899 unsigned LastGDS = 0;
900
901 // The score tracking logic is fragmented as follows:
902 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
903 // - SGPRs: SGPR RegUnits
904 // - SCC: Non-allocatable and not general purpose: not a SGPR.
905 //
906 // For the VMem case, if the key is within the range of LDS DMA IDs,
907 // then the corresponding index into the `LDSDMAStores` vector below is:
908 // Key - LDSDMA_BEGIN - 1
909 // This is because LDSDMA_BEGIN is a generic entry and does not have an
910 // associated MachineInstr.
911 //
912 // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?
913
914 struct VMEMInfo {
915 // Scores for all instruction counters.
916 std::array<unsigned, NUM_INST_CNTS> Scores = {0};
917 // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
918 unsigned VMEMTypes = 0;
919
920 bool empty() const { return all_of(Scores, equal_to(0)) && !VMEMTypes; }
921 };
922
923 struct SGPRInfo {
924 // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
925 // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
926 // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps
927 // the X_CNT score.
928 std::array<unsigned, 2> Scores = {0};
929
930 bool empty() const { return !Scores[0] && !Scores[1]; }
931 };
932
933 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
934 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
935
936 // Reg score for SCC.
937 unsigned SCCScore = 0;
938 // The unique instruction that has an SCC write pending, if there is one.
939 const MachineInstr *PendingSCCWrite = nullptr;
940
941 // Store representative LDS DMA operations. The only useful info here is
942 // alias info. One store is kept per unique AAInfo.
943 SmallVector<const MachineInstr *> LDSDMAStores;
944};
945
946class SIInsertWaitcntsLegacy : public MachineFunctionPass {
947public:
948 static char ID;
949 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
950
951 bool runOnMachineFunction(MachineFunction &MF) override;
952
953 StringRef getPassName() const override {
954 return "SI insert wait instructions";
955 }
956
957 void getAnalysisUsage(AnalysisUsage &AU) const override {
958 AU.setPreservesCFG();
959 AU.addRequired<MachineLoopInfoWrapperPass>();
960 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
961 AU.addUsedIfAvailable<AAResultsWrapperPass>();
962 AU.addPreserved<AAResultsWrapperPass>();
963 MachineFunctionPass::getAnalysisUsage(AU);
964 }
965};
966
967} // end anonymous namespace
968
969void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
970 InstCounterType CntTy, unsigned Score) {
971 setRegScore(Op.getReg().asMCReg(), CntTy, Score);
972}
973
974// Return true if the subtarget is one that enables Point Sample Acceleration
975// and the MachineInstr passed in is one to which it might be applied (the
976// hardware makes this decision based on several factors, but we can't determine
977// this at compile time, so we have to assume it might be applied if the
978// instruction supports it).
979bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
980 if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
981 return false;
982
983 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
984 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
985 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
986 return BaseInfo->PointSampleAccel;
987}
988
989// Return true if the subtarget enables Point Sample Acceleration, the supplied
990// MachineInstr is one to which it might be applied, and the supplied register
991// has outstanding writes of vmem-types different from VMEM_NOSAMPLER
992// (this is the type that a point sample accelerated instruction effectively
993// becomes).
994bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
995 MCPhysReg Reg) const {
996 if (!hasPointSampleAccel(MI))
997 return false;
998
999 return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
1000}
1001
1002void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
1003 InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
1004 assert(T < Context->MaxCounter);
1005
1006 unsigned UB = getScoreUB(T);
1007 unsigned CurrScore = UB + 1;
1008 if (CurrScore == 0)
1009 report_fatal_error("InsertWaitcnt score wraparound");
1010 // PendingEvents and ScoreUB need to be updated regardless of whether this
1011 // event changes the score of a register or not. Examples include vm_cnt
1012 // for a buffer store or lgkm_cnt for a send-message.
1013 PendingEvents |= 1 << E;
1014 setScoreUB(T, CurrScore);
1015
1016 const SIRegisterInfo *TRI = Context->TRI;
1017 const MachineRegisterInfo *MRI = Context->MRI;
1018 const SIInstrInfo *TII = Context->TII;
1019
1020 if (T == EXP_CNT) {
1021 // Put score on the source vgprs. If this is a store, just use those
1022 // specific register(s).
1023 if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
1024 // All GDS operations must protect their address register (same as
1025 // export.)
1026 if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
1027 setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);
1028
1029 if (Inst.mayStore()) {
1030 if (const auto *Data0 =
1031 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
1032 setScoreByOperand(*Data0, EXP_CNT, CurrScore);
1033 if (const auto *Data1 =
1034 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
1035 setScoreByOperand(*Data1, EXP_CNT, CurrScore);
1036 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
1037 Inst.getOpcode() != AMDGPU::DS_APPEND &&
1038 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
1039 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
1040 for (const MachineOperand &Op : Inst.all_uses()) {
1041 if (TRI->isVectorRegister(*MRI, Op.getReg()))
1042 setScoreByOperand(Op, EXP_CNT, CurrScore);
1043 }
1044 }
1045 } else if (TII->isFLAT(Inst)) {
1046 if (Inst.mayStore()) {
1047 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1048 EXP_CNT, CurrScore);
1049 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1050 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1051 EXP_CNT, CurrScore);
1052 }
1053 } else if (TII->isMIMG(Inst)) {
1054 if (Inst.mayStore()) {
1055 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
1056 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1057 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1058 EXP_CNT, CurrScore);
1059 }
1060 } else if (TII->isMTBUF(Inst)) {
1061 if (Inst.mayStore())
1062 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
1063 } else if (TII->isMUBUF(Inst)) {
1064 if (Inst.mayStore()) {
1065 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
1066 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1067 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1068 EXP_CNT, CurrScore);
1069 }
1070 } else if (TII->isLDSDIR(Inst)) {
1071 // LDSDIR instructions attach the score to the destination.
1072 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
1073 EXP_CNT, CurrScore);
1074 } else {
1075 if (TII->isEXP(Inst)) {
1076 // For export the destination registers are really temps that
1077 // can be used as the actual source after export patching, so
1078 // we need to treat them like sources and set the EXP_CNT
1079 // score.
1080 for (MachineOperand &DefMO : Inst.all_defs()) {
1081 if (TRI->isVGPR(*MRI, DefMO.getReg())) {
1082 setScoreByOperand(DefMO, EXP_CNT, CurrScore);
1083 }
1084 }
1085 }
1086 for (const MachineOperand &Op : Inst.all_uses()) {
1087 if (TRI->isVectorRegister(*MRI, Op.getReg()))
1088 setScoreByOperand(Op, EXP_CNT, CurrScore);
1089 }
1090 }
1091 } else if (T == X_CNT) {
1092 WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1093 if (PendingEvents & (1 << OtherEvent)) {
1094 // Hardware inserts an implicit xcnt between interleaved
1095 // SMEM and VMEM operations. So there will never be
1096 // outstanding address translations for both SMEM and
1097 // VMEM at the same time.
1098 setScoreLB(T, getScoreUB(T) - 1);
1099 PendingEvents &= ~(1 << OtherEvent);
1100 }
1101 for (const MachineOperand &Op : Inst.all_uses())
1102 setScoreByOperand(Op, T, CurrScore);
1103 } else if (T == VA_VDST || T == VM_VSRC) {
1104 // Match the score to the VGPR destination or source registers as
1105 // appropriate
1106 for (const MachineOperand &Op : Inst.operands()) {
1107 if (!Op.isReg() || (T == VA_VDST && Op.isUse()) ||
1108 (T == VM_VSRC && Op.isDef()))
1109 continue;
1110 if (TRI->isVectorRegister(*Context->MRI, Op.getReg()))
1111 setScoreByOperand(Op, T, CurrScore);
1112 }
1113 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1114 // Match the score to the destination registers.
1115 //
1116 // Check only explicit operands. Stores, especially spill stores, include
1117 // implicit uses and defs of their super registers which would create an
1118 // artificial dependency, while these are there only for register liveness
1119 // accounting purposes.
1120 //
1121 // Special cases where implicit register defs exists, such as M0 or VCC,
1122 // but none with memory instructions.
1123 for (const MachineOperand &Op : Inst.defs()) {
1124 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
1125 if (!TRI->isVectorRegister(*MRI, Op.getReg())) // TODO: add wrapper
1126 continue;
1127 if (updateVMCntOnly(Inst)) {
1128 // updateVMCntOnly should only leave us with VGPRs:
1129 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1130 // defs. That's required for a sane update of `VMEMTypes` below.
1131 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
1132 VmemType V = getVmemType(Inst);
1133 unsigned char TypesMask = 1 << V;
1134 // If instruction can have Point Sample Accel applied, we have to flag
1135 // this with another potential dependency
1136 if (hasPointSampleAccel(Inst))
1137 TypesMask |= 1 << VMEM_NOSAMPLER;
1138 for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
1139 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1140 }
1141 }
1142 setScoreByOperand(Op, T, CurrScore);
1143 }
1144 if (Inst.mayStore() &&
1145 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
1146 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1147 // written can be accessed. A load from LDS to VMEM does not need a wait.
1148 //
1149 // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1150 // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1151 // store. The "Slot" is the index into LDSDMAStores + 1.
1152 unsigned Slot = 0;
1153 for (const auto *MemOp : Inst.memoperands()) {
1154 if (!MemOp->isStore() ||
1155 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1156 continue;
1157 // Comparing just AA info does not guarantee memoperands are equal
1158 // in general, but this is so for LDS DMA in practice.
1159 auto AAI = MemOp->getAAInfo();
1160 // Alias scope information gives a way to definitely identify an
1161 // original memory object and practically produced in the module LDS
1162 // lowering pass. If there is no scope available we will not be able
1163 // to disambiguate LDS aliasing as after the module lowering all LDS
1164 // is squashed into a single big object.
1165 if (!AAI || !AAI.Scope)
1166 break;
1167 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1168 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1169 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1170 Slot = I + 1;
1171 break;
1172 }
1173 }
1174 }
1175 if (Slot)
1176 break;
1177 // The slot may not be valid because it can be >= NUM_LDSDMA which
1178 // means the scoreboard cannot track it. We still want to preserve the
1179 // MI in order to check alias information, though.
1180 LDSDMAStores.push_back(&Inst);
1181 Slot = LDSDMAStores.size();
1182 break;
1183 }
1184 setVMemScore(LDSDMA_BEGIN, T, CurrScore);
1185 if (Slot && Slot < NUM_LDSDMA)
1186 setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
1187 }
1188
1190 setRegScore(AMDGPU::SCC, T, CurrScore);
1191 PendingSCCWrite = &Inst;
1192 }
1193 }
1194}
1195
1196void WaitcntBrackets::print(raw_ostream &OS) const {
1197 const GCNSubtarget *ST = Context->ST;
1198
1199 OS << '\n';
1200 for (auto T : inst_counter_types(Context->MaxCounter)) {
1201 unsigned SR = getScoreRange(T);
1202
1203 switch (T) {
1204 case LOAD_CNT:
1205 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1206 << SR << "):";
1207 break;
1208 case DS_CNT:
1209 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1210 << SR << "):";
1211 break;
1212 case EXP_CNT:
1213 OS << " EXP_CNT(" << SR << "):";
1214 break;
1215 case STORE_CNT:
1216 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1217 << SR << "):";
1218 break;
1219 case SAMPLE_CNT:
1220 OS << " SAMPLE_CNT(" << SR << "):";
1221 break;
1222 case BVH_CNT:
1223 OS << " BVH_CNT(" << SR << "):";
1224 break;
1225 case KM_CNT:
1226 OS << " KM_CNT(" << SR << "):";
1227 break;
1228 case X_CNT:
1229 OS << " X_CNT(" << SR << "):";
1230 break;
1231 case VA_VDST:
1232 OS << " VA_VDST(" << SR << "): ";
1233 break;
1234 case VM_VSRC:
1235 OS << " VM_VSRC(" << SR << "): ";
1236 break;
1237 default:
1238 OS << " UNKNOWN(" << SR << "):";
1239 break;
1240 }
1241
1242 if (SR != 0) {
1243 // Print vgpr scores.
1244 unsigned LB = getScoreLB(T);
1245
1246 SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1247 sort(SortedVMEMIDs);
1248
1249 for (auto ID : SortedVMEMIDs) {
1250 unsigned RegScore = VMem.at(ID).Scores[T];
1251 if (RegScore <= LB)
1252 continue;
1253 unsigned RelScore = RegScore - LB - 1;
1254 if (ID < REGUNITS_END) {
1255 OS << ' ' << RelScore << ":vRU" << ID;
1256 } else {
1257 assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1258 "Unhandled/unexpected ID value!");
1259 OS << ' ' << RelScore << ":LDSDMA" << ID;
1260 }
1261 }
1262
1263 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1264 if (isSmemCounter(T)) {
1265 SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1266 sort(SortedSMEMIDs);
1267 for (auto ID : SortedSMEMIDs) {
1268 unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)];
1269 if (RegScore <= LB)
1270 continue;
1271 unsigned RelScore = RegScore - LB - 1;
1272 OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
1273 }
1274 }
1275
1276 if (T == KM_CNT && SCCScore > 0)
1277 OS << ' ' << SCCScore << ":scc";
1278 }
1279 OS << '\n';
1280 }
1281
1282 OS << "Pending Events: ";
1283 if (hasPendingEvent()) {
1284 ListSeparator LS;
1285 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1286 if (hasPendingEvent((WaitEventType)I)) {
1287 OS << LS << WaitEventTypeName[I];
1288 }
1289 }
1290 } else {
1291 OS << "none";
1292 }
1293 OS << '\n';
1294
1295 OS << '\n';
1296}
1297
1298/// Simplify \p UpdateWait by removing waits that are redundant based on the
1299/// current WaitcntBrackets and any other waits specified in \p CheckWait.
1300void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
1301 AMDGPU::Waitcnt &UpdateWait) const {
1302 simplifyWaitcnt(LOAD_CNT, UpdateWait.LoadCnt);
1303 simplifyWaitcnt(EXP_CNT, UpdateWait.ExpCnt);
1304 simplifyWaitcnt(DS_CNT, UpdateWait.DsCnt);
1305 simplifyWaitcnt(STORE_CNT, UpdateWait.StoreCnt);
1306 simplifyWaitcnt(SAMPLE_CNT, UpdateWait.SampleCnt);
1307 simplifyWaitcnt(BVH_CNT, UpdateWait.BvhCnt);
1308 simplifyWaitcnt(KM_CNT, UpdateWait.KmCnt);
1309 simplifyXcnt(CheckWait, UpdateWait);
1310 simplifyWaitcnt(VA_VDST, UpdateWait.VaVdst);
1311 simplifyVmVsrc(CheckWait, UpdateWait);
1312}
1313
1314void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1315 unsigned &Count) const {
1316 // The number of outstanding events for this type, T, can be calculated
1317 // as (UB - LB). If the current Count is greater than or equal to the number
1318 // of outstanding events, then the wait for this counter is redundant.
1319 if (Count >= getScoreRange(T))
1320 Count = ~0u;
1321}
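// Editorial worked example (not part of the original file): with ScoreUB = 12
// and ScoreLB = 9 there are 12 - 9 = 3 outstanding events, so a requested
// wait of 3 or more is already satisfied and is dropped (set to ~0u), while a
// requested wait of 2 is kept because it is stricter than the current state.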
1322
1323void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1324 AMDGPU::Waitcnt &UpdateWait) const {
1325 // Waiting for some counters implies waiting for VM_VSRC, since an
1326 // instruction that decrements a counter on completion would have
1327 // decremented VM_VSRC once its VGPR operands had been read.
1328 if (CheckWait.VmVsrc >=
1329 std::min({CheckWait.LoadCnt, CheckWait.StoreCnt, CheckWait.SampleCnt,
1330 CheckWait.BvhCnt, CheckWait.DsCnt}))
1331 UpdateWait.VmVsrc = ~0u;
1332 simplifyWaitcnt(VM_VSRC, UpdateWait.VmVsrc);
1333}
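// Editorial example (not part of the original file): if the instruction
// already waits for loadcnt(0), every outstanding VMEM load has completed and
// therefore read its VGPR sources long ago, so any remaining VM_VSRC wait is
// implied and can be dropped.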
1334
1335void WaitcntBrackets::purgeEmptyTrackingData() {
1336 for (auto &[K, V] : make_early_inc_range(VMem)) {
1337 if (V.empty())
1338 VMem.erase(K);
1339 }
1340 for (auto &[K, V] : make_early_inc_range(SGPRs)) {
1341 if (V.empty())
1342 SGPRs.erase(K);
1343 }
1344}
1345
1346void WaitcntBrackets::determineWaitForScore(InstCounterType T,
1347 unsigned ScoreToWait,
1348 AMDGPU::Waitcnt &Wait) const {
1349 const unsigned LB = getScoreLB(T);
1350 const unsigned UB = getScoreUB(T);
1351
1352 // If the score falls within the bracket, we need a waitcnt.
1353 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1354 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1355 !Context->ST->hasFlatLgkmVMemCountInOrder()) {
1356 // If there is a pending FLAT operation, and this is a VMem or LGKM
1357 // waitcnt and the target can report early completion, then we need
1358 // to force a waitcnt 0.
1359 addWait(Wait, T, 0);
1360 } else if (counterOutOfOrder(T)) {
1361 // The counter can get decremented out-of-order when there are
1362 // multiple types of events in the bracket, so emit an s_wait with a
1363 // conservative value of 0 for the counter.
1364 addWait(Wait, T, 0);
1365 } else {
1366 // If a counter has been maxed out avoid overflow by waiting for
1367 // MAX(CounterType) - 1 instead.
1368 unsigned NeededWait = std::min(
1369 UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
1370 addWait(Wait, T, NeededWait);
1371 }
1372 }
1373}
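// Editorial worked example (not part of the original file), assuming no
// pending FLAT operation and an in-order counter: with ScoreUB = 10,
// ScoreLB = 6 and ScoreToWait = 8, the score falls inside the bracket, so the
// required wait is min(10 - 8, getWaitCountMax(...) - 1) = 2, meaning at most
// two newer operations of this type may remain outstanding.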
1374
1375void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
1376 AMDGPU::Waitcnt &Wait) const {
1377 if (Reg == AMDGPU::SCC) {
1378 determineWaitForScore(T, SCCScore, Wait);
1379 } else {
1380 bool IsVGPR = Context->TRI->isVectorRegister(*Context->MRI, Reg);
1381 for (MCRegUnit RU : regunits(Reg))
1382 determineWaitForScore(
1383 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1384 Wait);
1385 }
1386}
1387
1388void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
1389 AMDGPU::Waitcnt &Wait) const {
1390 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1391 determineWaitForScore(T, getVMemScore(TID, T), Wait);
1392}
1393
1394void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1395 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1396 // SCC has landed
1397 if (PendingSCCWrite &&
1398 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1399 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1400 unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
1401 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1402 if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
1403 SCC_WRITE_PendingEvent) {
1404 setScoreLB(KM_CNT, getScoreUB(KM_CNT));
1405 }
1406
1407 PendingEvents &= ~SCC_WRITE_PendingEvent;
1408 PendingSCCWrite = nullptr;
1409 }
1410}
1411
1412void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1413 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1414 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1415 applyWaitcnt(DS_CNT, Wait.DsCnt);
1416 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1417 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1418 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1419 applyWaitcnt(KM_CNT, Wait.KmCnt);
1420 applyWaitcnt(X_CNT, Wait.XCnt);
1421 applyWaitcnt(VA_VDST, Wait.VaVdst);
1422 applyWaitcnt(VM_VSRC, Wait.VmVsrc);
1423}
1424
1425void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1426 const unsigned UB = getScoreUB(T);
1427 if (Count >= UB)
1428 return;
1429 if (Count != 0) {
1430 if (counterOutOfOrder(T))
1431 return;
1432 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1433 } else {
1434 setScoreLB(T, UB);
1435 PendingEvents &= ~Context->WaitEventMaskForInst[T];
1436 }
1437
1438 if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
1439 if (!hasMixedPendingEvents(X_CNT))
1440 applyWaitcnt(X_CNT, 0);
1441 else
1442 PendingEvents &= ~(1 << SMEM_GROUP);
1443 }
1444 if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
1445 !hasPendingEvent(STORE_CNT)) {
1446 if (!hasMixedPendingEvents(X_CNT))
1447 applyWaitcnt(X_CNT, Count);
1448 else if (Count == 0)
1449 PendingEvents &= ~(1 << VMEM_GROUP);
1450 }
1451}
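// Editorial note (not part of the original file): applying a nonzero count N
// only raises ScoreLB to ScoreUB - N, and only for in-order counters, whereas
// a count of 0 drains the counter completely: ScoreLB catches up to ScoreUB
// and all pending events of that type are cleared.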
1452
1453void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
1454 AMDGPU::Waitcnt &UpdateWait) const {
1455 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1456 // optimizations. On entry to a block with multiple predecessors, there may
1457 // be pending SMEM and VMEM events active at the same time.
1458 // In such cases, only clear one active event at a time.
1459 // TODO: Revisit xcnt optimizations for gfx1250.
1460 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1461 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1462 // zero.
1463 if (CheckWait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
1464 UpdateWait.XCnt = ~0u;
1465 // If we have pending store we cannot optimize XCnt because we do not wait for
1466 // stores. VMEM loads return in order, so if we only have loads XCnt is
1467 // decremented to the same number as LOADCnt.
1468 if (CheckWait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1469 !hasPendingEvent(STORE_CNT) && CheckWait.XCnt >= CheckWait.LoadCnt)
1470 UpdateWait.XCnt = ~0u;
1471 simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
1472}
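// Editorial example (not part of the original file): if the instruction
// already waits for kmcnt(0) and only SMEM address translations (SMEM_GROUP)
// are pending, the implicit ordering described above means xcnt has already
// drained, so the XCnt wait is dropped. Likewise, a loadcnt wait covers xcnt
// when only VMEM loads (no stores) are in flight and XCnt >= LoadCnt.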
1473
1474// Where there are multiple types of event in the bracket of a counter,
1475// the decrement may go out of order.
1476bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1477 // Scalar memory reads can always go out of order.
1478 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1479 (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
1480 return true;
1481
1482 // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
1483 // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
1484 // out-of-order completion.
1485 if (T == LOAD_CNT) {
1486 unsigned Events = hasPendingEvent(T);
1487 // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
1488 // events
1489 Events &= ~(1 << GLOBAL_INV_ACCESS);
1490 // Return true only if there are still multiple event types after removing
1491 // GLOBAL_INV
1492 return Events & (Events - 1);
1493 }
1494
1495 return hasMixedPendingEvents(T);
1496}
1497
1498INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1499 false, false)
1500INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
1501INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1502INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1503 false, false)
1504
1505char SIInsertWaitcntsLegacy::ID = 0;
1506
1507char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1508
1509FunctionPass *llvm::createSIInsertWaitcntsPass() {
1510 return new SIInsertWaitcntsLegacy();
1511}
1512
1513static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1514 unsigned NewEnc) {
1515 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1516 assert(OpIdx >= 0);
1517
1518 MachineOperand &MO = MI.getOperand(OpIdx);
1519
1520 if (NewEnc == MO.getImm())
1521 return false;
1522
1523 MO.setImm(NewEnc);
1524 return true;
1525}
1526
1527/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1528/// and if so, which counter it is waiting on.
1529static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1530 switch (Opcode) {
1531 case AMDGPU::S_WAIT_LOADCNT:
1532 return LOAD_CNT;
1533 case AMDGPU::S_WAIT_EXPCNT:
1534 return EXP_CNT;
1535 case AMDGPU::S_WAIT_STORECNT:
1536 return STORE_CNT;
1537 case AMDGPU::S_WAIT_SAMPLECNT:
1538 return SAMPLE_CNT;
1539 case AMDGPU::S_WAIT_BVHCNT:
1540 return BVH_CNT;
1541 case AMDGPU::S_WAIT_DSCNT:
1542 return DS_CNT;
1543 case AMDGPU::S_WAIT_KMCNT:
1544 return KM_CNT;
1545 case AMDGPU::S_WAIT_XCNT:
1546 return X_CNT;
1547 default:
1548 return {};
1549 }
1550}
1551
1552bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1553 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1554 if (Opcode == Waitcnt->getOpcode())
1555 return false;
1556
1557 Waitcnt->setDesc(TII->get(Opcode));
1558 return true;
1559}
1560
1561/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1562/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1563/// from \p Wait that were added by previous passes. Currently this pass
1564/// conservatively assumes that these preexisting waits are required for
1565/// correctness.
1566bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1567 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1568 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1569 assert(ST);
1570 assert(isNormalMode(MaxCounter));
1571
1572 bool Modified = false;
1573 MachineInstr *WaitcntInstr = nullptr;
1574 MachineInstr *WaitcntVsCntInstr = nullptr;
1575
1576 LLVM_DEBUG({
1577 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1578 if (It.isEnd())
1579 dbgs() << "end of block\n";
1580 else
1581 dbgs() << *It;
1582 });
1583
1584 for (auto &II :
1585 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1586 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1587 if (II.isMetaInstruction()) {
1588 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1589 continue;
1590 }
1591
1592 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1593 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1594
1595 // Update required wait count. If this is a soft waitcnt (= it was added
1596 // by an earlier pass), it may be entirely removed.
1597 if (Opcode == AMDGPU::S_WAITCNT) {
1598 unsigned IEnc = II.getOperand(0).getImm();
1599 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1600 if (TrySimplify)
1601 ScoreBrackets.simplifyWaitcnt(OldWait);
1602 Wait = Wait.combined(OldWait);
1603
1604 // Merge consecutive waitcnt of the same type by erasing multiples.
1605 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1606 II.eraseFromParent();
1607 Modified = true;
1608 } else
1609 WaitcntInstr = &II;
1610 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1611 assert(ST->hasVMemToLDSLoad());
1612 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1613 << "Before: " << Wait << '\n';);
1614 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait);
1615 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1616
1617 // It is possible (but unlikely) that this is the only wait instruction,
1618 // in which case, we exit this loop without a WaitcntInstr to consume
1619 // `Wait`. But that works because `Wait` was passed in by reference, and
1620 // the callee eventually calls createNewWaitcnt on it. We test this
1621 // possibility in an artificial MIR test since such a situation cannot be
1622 // recreated by running the memory legalizer.
1623 II.eraseFromParent();
1624 } else {
1625 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1626 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1627
1628 unsigned OldVSCnt =
1629 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1630 if (TrySimplify)
1631 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1632 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1633
1634 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1635 II.eraseFromParent();
1636 Modified = true;
1637 } else
1638 WaitcntVsCntInstr = &II;
1639 }
1640 }
1641
1642 if (WaitcntInstr) {
1643 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1644 AMDGPU::encodeWaitcnt(IV, Wait));
1645 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1646
1647 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1648 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1649 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1650 Wait.LoadCnt = ~0u;
1651 Wait.ExpCnt = ~0u;
1652 Wait.DsCnt = ~0u;
1653
1654 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1655 << "New Instr at block end: "
1656 << *WaitcntInstr << '\n'
1657 : dbgs() << "applied pre-existing waitcnt\n"
1658 << "Old Instr: " << *It
1659 << "New Instr: " << *WaitcntInstr << '\n');
1660 }
1661
1662 if (WaitcntVsCntInstr) {
1663 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1664 AMDGPU::OpName::simm16, Wait.StoreCnt);
1665 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1666
1667 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1668 Wait.StoreCnt = ~0u;
1669
1670 LLVM_DEBUG(It.isEnd()
1671 ? dbgs() << "applied pre-existing waitcnt\n"
1672 << "New Instr at block end: " << *WaitcntVsCntInstr
1673 << '\n'
1674 : dbgs() << "applied pre-existing waitcnt\n"
1675 << "Old Instr: " << *It
1676 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1677 }
1678
1679 return Modified;
1680}
1681
1682/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1683/// required counters in \p Wait
1684bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1685 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1686 AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
1687 assert(ST);
1688 assert(isNormalMode(MaxCounter));
1689
1690 bool Modified = false;
1691 const DebugLoc &DL = Block.findDebugLoc(It);
1692
1693 // Helper to emit expanded waitcnt sequence for profiling.
1694 // Emits waitcnts from (Outstanding-1) down to Target, or just Target if
1695 // nothing to expand. The EmitWaitcnt callback emits a single waitcnt.
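// Illustrative example (not part of the pass itself): with three outstanding
// loads and a target of vmcnt(0), the expansion emits
//   s_waitcnt vmcnt(2)
//   s_waitcnt vmcnt(1)
//   s_waitcnt vmcnt(0)
// so a profiler can attribute stall time to each completing load, whereas the
// non-expanded form would emit only the final s_waitcnt vmcnt(0).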
1696 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
1697 auto EmitWaitcnt) {
1698 if (Outstanding > Target) {
1699 for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
1700 EmitWaitcnt(i);
1701 Modified = true;
1702 }
1703 } else {
1704 EmitWaitcnt(Target);
1705 Modified = true;
1706 }
1707 };
1708
1709 // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
1710 // single instruction while VScnt has its own instruction.
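// For example, a required wait of vmcnt(0) and lgkmcnt(1) with no store wait
// is emitted as a single "s_waitcnt vmcnt(0) lgkmcnt(1)", while a store wait
// is emitted separately as "s_waitcnt_vscnt null, 0".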
1711 if (Wait.hasWaitExceptStoreCnt()) {
1712 // If profiling expansion is enabled and we have score brackets,
1713 // emit an expanded sequence
1714 if (ExpandWaitcntProfiling && ScoreBrackets) {
1715 // Check if any of the counters to be waited on are out-of-order.
1716 // If so, fall back to normal (non-expanded) behavior since expansion
1717 // would provide misleading profiling information.
1718 bool AnyOutOfOrder = false;
1719 for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
1720 unsigned &WaitCnt = getCounterRef(Wait, CT);
1721 if (WaitCnt != ~0u && ScoreBrackets->counterOutOfOrder(CT)) {
1722 AnyOutOfOrder = true;
1723 break;
1724 }
1725 }
1726
1727 if (AnyOutOfOrder) {
1728 // Fall back to non-expanded wait
1729 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1730 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1731 Modified = true;
1732 } else {
1733 // All counters are in-order, safe to expand
1734 for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
1735 unsigned &WaitCnt = getCounterRef(Wait, CT);
1736 if (WaitCnt == ~0u)
1737 continue;
1738
1739 unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
1740 ScoreBrackets->getScoreLB(CT),
1741 getWaitCountMax(getLimits(), CT) - 1);
1742 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
1743 AMDGPU::Waitcnt W;
1744 getCounterRef(W, CT) = Count;
1745 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
1746 .addImm(AMDGPU::encodeWaitcnt(IV, W));
1747 });
1748 }
1749 }
1750 } else {
1751 // Normal behavior: emit single combined waitcnt
1752 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1753 [[maybe_unused]] auto SWaitInst =
1754 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1755 Modified = true;
1756
1757 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1758 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1759 dbgs() << "New Instr: " << *SWaitInst << '\n');
1760 }
1761 }
1762
1763 if (Wait.hasWaitStoreCnt()) {
1764 assert(ST->hasVscnt());
1765
1766 if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u &&
1767 !ScoreBrackets->counterOutOfOrder(STORE_CNT)) {
1768 // Only expand if counter is not out-of-order
1769 unsigned Outstanding =
1770 std::min(ScoreBrackets->getScoreUB(STORE_CNT) -
1771 ScoreBrackets->getScoreLB(STORE_CNT),
1772 getWaitCountMax(getLimits(), STORE_CNT) - 1);
1773 EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) {
1774 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1775 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1776 .addImm(Count);
1777 });
1778 } else {
1779 [[maybe_unused]] auto SWaitInst =
1780 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1781 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1782 .addImm(Wait.StoreCnt);
1783 Modified = true;
1784
1785 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1786 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1787 dbgs() << "New Instr: " << *SWaitInst << '\n');
1788 }
1789 }
1790
1791 return Modified;
1792}
1793
1794AMDGPU::Waitcnt
1795WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1796 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1797}
1798
1799AMDGPU::Waitcnt
1800WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1801 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
1802 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1803 ~0u /* XCNT */, ExpertVal, ExpertVal);
1804}
1805
1806/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1807/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1808/// were added by previous passes. Currently this pass conservatively
1809/// assumes that these preexisting waits are required for correctness.
1810bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1811 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1812 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1813 assert(ST);
1814 assert(!isNormalMode(MaxCounter));
1815
1816 bool Modified = false;
1817 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1818 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1819 MachineInstr *WaitcntDepctrInstr = nullptr;
1820 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1821
1822 LLVM_DEBUG({
1823 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1824 if (It.isEnd())
1825 dbgs() << "end of block\n";
1826 else
1827 dbgs() << *It;
1828 });
1829
1830 // Accumulate waits that should not be simplified.
1831 AMDGPU::Waitcnt RequiredWait;
1832
1833 for (auto &II :
1834 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1835 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1836 if (II.isMetaInstruction()) {
1837 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1838 continue;
1839 }
1840
1841 MachineInstr **UpdatableInstr;
1842
1843 // Update required wait count. If this is a soft waitcnt (= it was added
1844 // by an earlier pass), it may be entirely removed.
1845
1846 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1847 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1848
1849 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1850 // attempt to do more than that either.
1851 if (Opcode == AMDGPU::S_WAITCNT)
1852 continue;
1853
1854 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1855 unsigned OldEnc =
1856 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1857 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1858 if (TrySimplify)
1859 Wait = Wait.combined(OldWait);
1860 else
1861 RequiredWait = RequiredWait.combined(OldWait);
1862 UpdatableInstr = &CombinedLoadDsCntInstr;
1863 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1864 unsigned OldEnc =
1865 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1866 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1867 if (TrySimplify)
1868 Wait = Wait.combined(OldWait);
1869 else
1870 RequiredWait = RequiredWait.combined(OldWait);
1871 UpdatableInstr = &CombinedStoreDsCntInstr;
1872 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
1873 unsigned OldEnc =
1874 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1875 AMDGPU::Waitcnt OldWait;
1876 OldWait.VaVdst = AMDGPU::DepCtr::decodeFieldVaVdst(OldEnc);
1877 OldWait.VmVsrc = AMDGPU::DepCtr::decodeFieldVmVsrc(OldEnc);
1878 if (TrySimplify)
1879 ScoreBrackets.simplifyWaitcnt(OldWait);
1880 Wait = Wait.combined(OldWait);
1881 UpdatableInstr = &WaitcntDepctrInstr;
1882 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1883 // Architectures higher than GFX10 do not have direct loads to
1884 // LDS, so no work required here yet.
1885 II.eraseFromParent();
1886 continue;
1887 } else {
1888 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1889 assert(CT.has_value());
1890 unsigned OldCnt =
1891 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1892 if (TrySimplify)
1893 addWait(Wait, CT.value(), OldCnt);
1894 else
1895 addWait(RequiredWait, CT.value(), OldCnt);
1896 UpdatableInstr = &WaitInstrs[CT.value()];
1897 }
1898
1899 // Merge consecutive waitcnt of the same type by erasing multiples.
1900 if (!*UpdatableInstr) {
1901 *UpdatableInstr = &II;
1902 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
1903 // S_WAITCNT_DEPCTR requires special care. Don't remove a
1904 // duplicate if it is waiting on things other than VA_VDST or
1905 // VM_VSRC. If that is the case, just make sure the VA_VDST and
1906 // VM_VSRC subfields of the operand are set to the "no wait"
1907 // values.
1908
1909 unsigned Enc = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1910 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
1911 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
1912
1913 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST)) {
1914 Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
1915 Modified |= promoteSoftWaitCnt(&II);
1916 } else {
1917 II.eraseFromParent();
1918 Modified = true;
1919 }
1920 } else {
1921 II.eraseFromParent();
1922 Modified = true;
1923 }
1924 }
1925
1926 ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
1927 Wait = Wait.combined(RequiredWait);
1928
1929 if (CombinedLoadDsCntInstr) {
1930 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1931 // to be waited for. Otherwise, let the instruction be deleted so
1932 // the appropriate single counter wait instruction can be inserted
1933 // instead, when new S_WAIT_*CNT instructions are inserted by
1934 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1935 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1936 // the loop below that deals with single counter instructions.
1937 //
1938 // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
1939 // instructions that have decremented LOAD_CNT or DS_CNT on completion
1940 // will have needed to wait for their register sources to be available
1941 // first.
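// For example, if after simplification only DSCNT still needs a wait, the
// combined instruction is erased here and createNewWaitcnt() later emits a
// plain S_WAIT_DSCNT instead of S_WAIT_LOADCNT_DSCNT.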
1942 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1943 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1944 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1945 AMDGPU::OpName::simm16, NewEnc);
1946 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1947 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1948 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1949 Wait.LoadCnt = ~0u;
1950 Wait.DsCnt = ~0u;
1951
1952 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1953 << "New Instr at block end: "
1954 << *CombinedLoadDsCntInstr << '\n'
1955 : dbgs() << "applied pre-existing waitcnt\n"
1956 << "Old Instr: " << *It << "New Instr: "
1957 << *CombinedLoadDsCntInstr << '\n');
1958 } else {
1959 CombinedLoadDsCntInstr->eraseFromParent();
1960 Modified = true;
1961 }
1962 }
1963
1964 if (CombinedStoreDsCntInstr) {
1965 // Similarly for S_WAIT_STORECNT_DSCNT.
1966 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1967 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1968 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1969 AMDGPU::OpName::simm16, NewEnc);
1970 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1971 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1972 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1973 Wait.StoreCnt = ~0u;
1974 Wait.DsCnt = ~0u;
1975
1976 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1977 << "New Instr at block end: "
1978 << *CombinedStoreDsCntInstr << '\n'
1979 : dbgs() << "applied pre-existing waitcnt\n"
1980 << "Old Instr: " << *It << "New Instr: "
1981 << *CombinedStoreDsCntInstr << '\n');
1982 } else {
1983 CombinedStoreDsCntInstr->eraseFromParent();
1984 Modified = true;
1985 }
1986 }
1987
1988 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1989 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1990 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1991 // instructions so that createNewWaitcnt() will create new combined
1992 // instructions to replace them.
1993
1994 if (Wait.DsCnt != ~0u) {
1995 // This is a vector of addresses in WaitInstrs pointing to instructions
1996 // that should be removed if they are present.
1997 SmallVector<MachineInstr **, 2> WaitsToErase;
1998
1999 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
2000 // both) need to be waited for, ensure that there are no existing
2001 // individual wait count instructions for these.
2002
2003 if (Wait.LoadCnt != ~0u) {
2004 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
2005 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
2006 } else if (Wait.StoreCnt != ~0u) {
2007 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
2008 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
2009 }
2010
2011 for (MachineInstr **WI : WaitsToErase) {
2012 if (!*WI)
2013 continue;
2014
2015 (*WI)->eraseFromParent();
2016 *WI = nullptr;
2017 Modified = true;
2018 }
2019 }
2020
2021 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2022 if (!WaitInstrs[CT])
2023 continue;
2024
2025 unsigned NewCnt = getWait(Wait, CT);
2026 if (NewCnt != ~0u) {
2027 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
2028 AMDGPU::OpName::simm16, NewCnt);
2029 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2030
2031 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2032 setNoWait(Wait, CT);
2033
2034 LLVM_DEBUG(It.isEnd()
2035 ? dbgs() << "applied pre-existing waitcnt\n"
2036 << "New Instr at block end: " << *WaitInstrs[CT]
2037 << '\n'
2038 : dbgs() << "applied pre-existing waitcnt\n"
2039 << "Old Instr: " << *It
2040 << "New Instr: " << *WaitInstrs[CT] << '\n');
2041 } else {
2042 WaitInstrs[CT]->eraseFromParent();
2043 Modified = true;
2044 }
2045 }
2046
2047 if (WaitcntDepctrInstr) {
2048 // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
2049 // subfields with the new required values.
2050 unsigned Enc =
2051 TII->getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2052 ->getImm();
2053 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, Wait.VmVsrc);
2054 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst);
2055
2056 ScoreBrackets.applyWaitcnt(VA_VDST, Wait.VaVdst);
2057 ScoreBrackets.applyWaitcnt(VM_VSRC, Wait.VmVsrc);
2058 Wait.VaVdst = ~0u;
2059 Wait.VmVsrc = ~0u;
2060
2061 // If that new encoded Depctr immediate would actually still wait
2062 // for anything, update the instruction's operand. Otherwise it can
2063 // just be deleted.
2064 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST)) {
2065 Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
2066 AMDGPU::OpName::simm16, Enc);
2067 LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
2068 << "New Instr at block end: "
2069 << *WaitcntDepctrInstr << '\n'
2070 : dbgs() << "applyPreexistingWaitcnt\n"
2071 << "Old Instr: " << *It << "New Instr: "
2072 << *WaitcntDepctrInstr << '\n');
2073 } else {
2074 WaitcntDepctrInstr->eraseFromParent();
2075 Modified = true;
2076 }
2077 }
2078
2079 return Modified;
2080}
2081
2082/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
2083bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2084 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2085 AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
2086 assert(ST);
2087 assert(!isNormalMode(MaxCounter));
2088
2089 bool Modified = false;
2090 const DebugLoc &DL = Block.findDebugLoc(It);
2091
2092 // Helper to emit expanded waitcnt sequence for profiling.
2093 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
2094 auto EmitWaitcnt) {
2095 if (Outstanding > Target) {
2096 for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
2097 EmitWaitcnt(i);
2098 Modified = true;
2099 }
2100 } else {
2101 EmitWaitcnt(Target);
2102 Modified = true;
2103 }
2104 };
2105
2106 // For GFX12+, we use separate wait instructions, which makes expansion
2107 // simpler
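// Illustrative example (not part of the pass itself): a required
// s_wait_loadcnt 0 with two loads outstanding and profiling expansion enabled
// is emitted as
//   s_wait_loadcnt 1
//   s_wait_loadcnt 0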
2108 if (ExpandWaitcntProfiling && ScoreBrackets) {
2109 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2110 unsigned Count = getWait(Wait, CT);
2111 if (Count == ~0u)
2112 continue;
2113
2114 // Skip expansion for out-of-order counters - emit normal wait instead
2115 if (ScoreBrackets->counterOutOfOrder(CT)) {
2116 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
2117 .addImm(Count);
2118 Modified = true;
2119 continue;
2120 }
2121
2122 unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
2123 ScoreBrackets->getScoreLB(CT),
2124 getWaitCountMax(getLimits(), CT) - 1);
2125 EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
2126 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
2127 .addImm(Val);
2128 });
2129 }
2130 return Modified;
2131 }
2132
2133 // Normal behavior (no expansion)
2134 // Check for opportunities to use combined wait instructions.
2135 if (Wait.DsCnt != ~0u) {
2136 MachineInstr *SWaitInst = nullptr;
2137
2138 if (Wait.LoadCnt != ~0u) {
2139 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2140
2141 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2142 .addImm(Enc);
2143
2144 Wait.LoadCnt = ~0u;
2145 Wait.DsCnt = ~0u;
2146 } else if (Wait.StoreCnt != ~0u) {
2147 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2148
2149 SWaitInst =
2150 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
2151 .addImm(Enc);
2152
2153 Wait.StoreCnt = ~0u;
2154 Wait.DsCnt = ~0u;
2155 }
2156
2157 if (SWaitInst) {
2158 Modified = true;
2159
2160 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2161 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2162 dbgs() << "New Instr: " << *SWaitInst << '\n');
2163 }
2164 }
2165
2166 // Generate an instruction for any remaining counter that needs
2167 // waiting for.
2168
2169 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2170 unsigned Count = getWait(Wait, CT);
2171 if (Count == ~0u)
2172 continue;
2173
2174 [[maybe_unused]] auto SWaitInst =
2175 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
2176 .addImm(Count);
2177
2178 Modified = true;
2179
2180 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2181 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2182 dbgs() << "New Instr: " << *SWaitInst << '\n');
2183 }
2184
2185 if (Wait.hasWaitDepctr()) {
2186 assert(IsExpertMode);
2187 unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Wait.VmVsrc, *ST);
2188 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst);
2189
2190 [[maybe_unused]] auto SWaitInst =
2191 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);
2192
2193 Modified = true;
2194
2195 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
2196 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2197 dbgs() << "New Instr: " << *SWaitInst << '\n');
2198 }
2199
2200 return Modified;
2201}
2202
2203 /// Generate an s_waitcnt instruction to be placed before \p MI.
2204/// Instructions of a given type are returned in order,
2205/// but instructions of different types can complete out of order.
2206/// We rely on this in-order completion
2207/// and simply assign a score to the memory access instructions.
2208/// We keep track of the active "score bracket" to determine
2209/// if an access of a memory read requires an s_waitcnt
2210/// and if so what the value of each counter is.
2211/// The "score bracket" is bound by the lower bound and upper bound
2212/// scores (*_score_LB and *_score_ub respectively).
2213/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
2214/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
2215/// (GFX12+ only, where DS_CNT is a separate counter).
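/// For example, if three VMEM loads are in flight and the operand about to be
/// read was produced by the oldest of them, a wait of vmcnt(2) suffices: it
/// allows the two younger loads to remain outstanding.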
2216bool SIInsertWaitcnts::generateWaitcntInstBefore(
2217 MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
2218 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2219 setForceEmitWaitcnt();
2220
2221 assert(!MI.isMetaInstruction());
2222
2223 AMDGPU::Waitcnt Wait;
2224 const unsigned Opc = MI.getOpcode();
2225
2226 // FIXME: This should have already been handled by the memory legalizer.
2227 // Removing this currently doesn't affect any lit tests, but we need to
2228 // verify that nothing was relying on this. The number of buffer invalidates
2229 // being handled here should not be expanded.
2230 if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
2231 Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
2232 Opc == AMDGPU::BUFFER_GL1_INV) {
2233 Wait.LoadCnt = 0;
2234 }
2235
2236 // All waits must be resolved at call return.
2237 // NOTE: this could be improved with knowledge of all call sites or
2238 // with knowledge of the called routines.
2239 if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
2240 Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
2241 Opc == AMDGPU::S_SETPC_B64_return) {
2242 ReturnInsts.insert(&MI);
2243 AMDGPU::Waitcnt AllZeroWait =
2244 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2245 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2246 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2247 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2248 // no need to wait for it at function boundaries.
2249 if (ST->hasExtendedWaitCounts() &&
2250 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2251 AllZeroWait.LoadCnt = ~0u;
2252 Wait = Wait.combined(AllZeroWait);
2253 }
2254 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2255 // Technically the hardware will do this on its own if we don't, but that
2256 // might cost extra cycles compared to doing it explicitly.
2257 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2258 // have to wait for outstanding VMEM stores. In this case it can be useful to
2259 // send a message to explicitly release all VGPRs before the stores have
2260 // completed, but it is only safe to do this if there are no outstanding
2261 // scratch stores.
2262 else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
2263 if (!WCG->isOptNone() &&
2264 (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
2265 (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
2266 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
2267 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
2268 ReleaseVGPRInsts.insert(&MI);
2269 }
2270 // Resolve vm waits before gs-done.
2271 else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
2272 ST->hasLegacyGeometry() &&
2273 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
2274 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
2275 Wait.LoadCnt = 0;
2276 }
2277
2278 // Export & GDS instructions do not read the EXEC mask until after the export
2279 // is granted (which can occur well after the instruction is issued).
2280 // The shader program must flush all EXP operations on the export-count
2281 // before overwriting the EXEC mask.
2282 else {
2283 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
2284 // Export and GDS are tracked individually, either may trigger a waitcnt
2285 // for EXEC.
2286 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2287 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2288 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2289 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2290 Wait.ExpCnt = 0;
2291 }
2292 }
2293
2294 // Wait for any pending GDS instruction to complete before any
2295 // "Always GDS" instruction.
2296 if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
2297 addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
2298
2299 if (MI.isCall()) {
2300 // The function is going to insert a wait on everything in its prolog.
2301 // This still needs to be careful if the call target is a load (e.g. a GOT
2302 // load). We also need to check WAW dependency with saved PC.
2303 CallInsts.insert(&MI);
2304 Wait = AMDGPU::Waitcnt();
2305
2306 const MachineOperand &CallAddrOp = TII->getCalleeOperand(MI);
2307 if (CallAddrOp.isReg()) {
2308 ScoreBrackets.determineWaitForPhysReg(
2309 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait);
2310
2311 if (const auto *RtnAddrOp =
2312 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
2313 ScoreBrackets.determineWaitForPhysReg(
2314 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
2315 }
2316 }
2317 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2318 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2319 } else {
2320 // FIXME: Should not be relying on memoperands.
2321 // Look at the source operands of every instruction to see if
2322 // any of them results from a previous memory operation that affects
2323 // its current usage. If so, an s_waitcnt instruction needs to be
2324 // emitted.
2325 // If the source operand was defined by a load, add the s_waitcnt
2326 // instruction.
2327 //
2328 // Two cases are handled for destination operands:
2329 // 1) If the destination operand was defined by a load, add the s_waitcnt
2330 // instruction to guarantee the right WAW order.
2331 // 2) If a destination operand that was used by a recent export/store instruction,
2332 // add s_waitcnt on exp_cnt to guarantee the WAR order.
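// For example (illustrative): overwriting a VGPR still being sourced by a
// pending buffer_store needs expcnt to drain first (WAR), while overwriting
// the destination of a pending load needs the load counter to drain (WAW).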
2333
2334 for (const MachineMemOperand *Memop : MI.memoperands()) {
2335 const Value *Ptr = Memop->getValue();
2336 if (Memop->isStore()) {
2337 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2338 addWait(Wait, SmemAccessCounter, 0);
2339 if (PDT->dominates(MI.getParent(), It->second))
2340 SLoadAddresses.erase(It);
2341 }
2342 }
2343 unsigned AS = Memop->getAddrSpace();
2344 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
2345 continue;
2346 // No need to wait before load from VMEM to LDS.
2347 if (TII->mayWriteLDSThroughDMA(MI))
2348 continue;
2349
2350 // LOAD_CNT is only relevant to vgpr or LDS.
2351 unsigned TID = LDSDMA_BEGIN;
2352 if (Ptr && Memop->getAAInfo()) {
2353 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2354 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2355 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2356 if ((I + 1) >= NUM_LDSDMA) {
2357 // We didn't have enough slots to track this LDS DMA store; it
2358 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2359 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2360 break;
2361 }
2362
2363 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait);
2364 }
2365 }
2366 } else {
2367 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2368 }
2369 if (Memop->isStore()) {
2370 ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait);
2371 }
2372 }
2373
2374 // Loop over use and def operands.
2375 for (const MachineOperand &Op : MI.operands()) {
2376 if (!Op.isReg())
2377 continue;
2378
2379 // If the instruction does not read tied source, skip the operand.
2380 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
2381 continue;
2382
2383 MCPhysReg Reg = Op.getReg().asMCReg();
2384
2385 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
2386 if (IsVGPR) {
2387 // Implicit VGPR defs and uses are never a part of the memory
2388 // instructions description and usually present to account for
2389 // super-register liveness.
2390 // TODO: Most of the other instructions also have implicit uses
2391 // for the liveness accounting only.
2392 if (Op.isImplicit() && MI.mayLoadOrStore())
2393 continue;
2394
2395 ScoreBrackets.determineWaitForPhysReg(VA_VDST, Reg, Wait);
2396 if (Op.isDef())
2397 ScoreBrackets.determineWaitForPhysReg(VM_VSRC, Reg, Wait);
2398 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2399 // previous write and this write are the same type of VMEM
2400 // instruction, in which case they are (in some architectures)
2401 // guaranteed to write their results in order anyway.
2402 // Additionally check instructions where Point Sample Acceleration
2403 // might be applied.
2404 if (Op.isUse() || !updateVMCntOnly(MI) ||
2405 ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
2406 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2407 !ST->hasVmemWriteVgprInOrder()) {
2408 ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
2409 ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
2410 ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
2411 ScoreBrackets.clearVgprVmemTypes(Reg);
2412 }
2413
2414 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2415 ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait);
2416 }
2417 ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait);
2418 } else if (Op.getReg() == AMDGPU::SCC) {
2419 ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait);
2420 } else {
2421 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
2422 }
2423
2424 if (ST->hasWaitXCnt() && Op.isDef())
2425 ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);
2426 }
2427 }
2428 }
2429
2430 // Ensure safety against exceptions from outstanding memory operations while
2431 // waiting for a barrier:
2432 //
2433 // * Some subtargets safely handle backing off the barrier in hardware
2434 // when an exception occurs.
2435 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2436 // there can be no outstanding memory operations during the wait.
2437 // * Subtargets with split barriers don't need to back off the barrier; it
2438 // is up to the trap handler to preserve the user barrier state correctly.
2439 //
2440 // In all other cases, ensure safety by ensuring that there are no outstanding
2441 // memory operations.
2442 if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
2443 !ST->supportsBackOffBarrier()) {
2444 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2445 }
2446
2447 // TODO: Remove this work-around, enable the assert for Bug 457939
2448 // after fixing the scheduler. Also, the Shader Compiler code is
2449 // independent of target.
2450 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() &&
2451 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2452 Wait.DsCnt = 0;
2453 }
2454
2455 // Verify that the wait is actually needed.
2456 ScoreBrackets.simplifyWaitcnt(Wait);
2457
2458 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2459 // waits on VA_VDST if the instruction it would precede is not a VALU
2460 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2461 // expert scheduling mode.
2462 if (TII->isVALU(MI))
2463 Wait.VaVdst = ~0u;
2464
2465 // Since the translation for VMEM addresses occur in-order, we can apply the
2466 // XCnt if the current instruction is of VMEM type and has a memory
2467 // dependency with another VMEM instruction in flight.
2468 if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
2469 ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
2470 Wait.XCnt = ~0u;
2471 }
2472
2473 // When forcing emit, skip terminators: emitting a waitcnt between the
2474 // terminators of the MBB would break the terminator sequence.
2475 if (ForceEmitZeroFlag && !MI.isTerminator())
2476 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2477
2478 if (ForceEmitWaitcnt[LOAD_CNT])
2479 Wait.LoadCnt = 0;
2480 if (ForceEmitWaitcnt[EXP_CNT])
2481 Wait.ExpCnt = 0;
2482 if (ForceEmitWaitcnt[DS_CNT])
2483 Wait.DsCnt = 0;
2484 if (ForceEmitWaitcnt[SAMPLE_CNT])
2485 Wait.SampleCnt = 0;
2486 if (ForceEmitWaitcnt[BVH_CNT])
2487 Wait.BvhCnt = 0;
2488 if (ForceEmitWaitcnt[KM_CNT])
2489 Wait.KmCnt = 0;
2490 if (ForceEmitWaitcnt[X_CNT])
2491 Wait.XCnt = 0;
2492 // Only force emit VA_VDST and VM_VSRC if expert mode is enabled.
2493 if (IsExpertMode) {
2494 if (ForceEmitWaitcnt[VA_VDST])
2495 Wait.VaVdst = 0;
2496 if (ForceEmitWaitcnt[VM_VSRC])
2497 Wait.VmVsrc = 0;
2498 }
2499
2500 if (FlushFlags.FlushVmCnt) {
2501 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2502 Wait.LoadCnt = 0;
2503 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2504 Wait.SampleCnt = 0;
2505 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2506 Wait.BvhCnt = 0;
2507 }
2508
2509 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
2510 Wait.DsCnt = 0;
2511
2512 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
2513 Wait.LoadCnt = 0;
2514
2515 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2516 OldWaitcntInstr);
2517}
2518
2519bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2520 MachineBasicBlock::instr_iterator It,
2521 MachineBasicBlock &Block,
2522 WaitcntBrackets &ScoreBrackets,
2523 MachineInstr *OldWaitcntInstr) {
2524 bool Modified = false;
2525
2526 if (OldWaitcntInstr)
2527 // Try to merge the required wait with preexisting waitcnt instructions.
2528 // Also erase redundant waitcnt.
2529 Modified =
2530 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2531
2532 AMDGPU::Waitcnt WaitForScore = Wait;
2533
2534 // ExpCnt can be merged into VINTERP.
2535 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
2536 SIInstrInfo::isVINTERP(*It)) {
2537 MachineOperand *WaitExp =
2538 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2539 if (Wait.ExpCnt < WaitExp->getImm()) {
2540 WaitExp->setImm(Wait.ExpCnt);
2541 Modified = true;
2542 }
2543 Wait.ExpCnt = ~0u;
2544
2545 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2546 << "Update Instr: " << *It);
2547 }
2548
2549 if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets))
2550 Modified = true;
2551
2552 // Any counts that could have been applied to any existing waitcnt
2553 // instructions will have been done so, now deal with any remaining.
2554 ScoreBrackets.applyWaitcnt(WaitForScore);
2555
2556 return Modified;
2557}
2558
2559std::optional<WaitEventType>
2560SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
2561 if (TII->isVALU(Inst)) {
2562 // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
2563 // out-of-order with respect to each other, so each of these classes
2564 // has its own event.
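// For example, a transcendental op and a plain core VALU op that are both in
// flight may retire in either order, so their VA_VDST updates are tracked by
// separate events.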
2565
2566 if (TII->isXDL(Inst))
2567 return VGPR_XDL_WRITE;
2568
2569 if (TII->isTRANS(Inst))
2570 return VGPR_TRANS_WRITE;
2571
2573 return VGPR_DPMACC_WRITE;
2574
2575 return VGPR_CSMACC_WRITE;
2576 }
2577
2578 // FLAT and LDS instructions may read their VGPR sources out-of-order
2579 // with respect to each other and all other VMEM instructions, so
2580 // each of these also has a separate event.
2581
2582 if (TII->isFLAT(Inst))
2583 return VGPR_FLAT_READ;
2584
2585 if (TII->isDS(Inst))
2586 return VGPR_LDS_READ;
2587
2588 if (TII->isVMEM(Inst) || TII->isVIMAGE(Inst) || TII->isVSAMPLE(Inst))
2589 return VGPR_VMEM_READ;
2590
2591 // Otherwise, no hazard.
2592
2593 return {};
2594}
2595
2596bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2597 return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
2598 (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2599}
2600
2601// Return true if the next instruction is S_ENDPGM, following fallthrough
2602// blocks if necessary.
2603bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2604 MachineBasicBlock *Block) const {
2605 auto BlockEnd = Block->getParent()->end();
2606 auto BlockIter = Block->getIterator();
2607
2608 while (true) {
2609 if (It.isEnd()) {
2610 if (++BlockIter != BlockEnd) {
2611 It = BlockIter->instr_begin();
2612 continue;
2613 }
2614
2615 return false;
2616 }
2617
2618 if (!It->isMetaInstruction())
2619 break;
2620
2621 It++;
2622 }
2623
2624 assert(!It.isEnd());
2625
2626 return It->getOpcode() == AMDGPU::S_ENDPGM;
2627}
2628
2629// Add a wait after an instruction if architecture requirements mandate one.
2630bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2631 MachineBasicBlock &Block,
2632 WaitcntBrackets &ScoreBrackets) {
2633 AMDGPU::Waitcnt Wait;
2634 bool NeedsEndPGMCheck = false;
2635
2636 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2637 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2638 !SIInstrInfo::isAtomicRet(Inst));
2639
2640 if (TII->isAlwaysGDS(Inst.getOpcode())) {
2641 Wait.DsCnt = 0;
2642 NeedsEndPGMCheck = true;
2643 }
2644
2645 ScoreBrackets.simplifyWaitcnt(Wait);
2646
2647 auto SuccessorIt = std::next(Inst.getIterator());
2648 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2649 /*OldWaitcntInstr=*/nullptr);
2650
2651 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2652 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
2653 .addImm(0);
2654 }
2655
2656 return Result;
2657}
2658
2659void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2660 WaitcntBrackets *ScoreBrackets) {
2661 // Now look at the instruction opcode. If it is a memory access
2662 // instruction, update the upper-bound of the appropriate counter's
2663 // bracket and the destination operand scores.
2664 // For architectures with X_CNT, mark the source address operands
2665 // with the appropriate counter values.
2666 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
2667
2668 bool IsVMEMAccess = false;
2669 bool IsSMEMAccess = false;
2670
2671 if (IsExpertMode) {
2672 if (const auto ET = getExpertSchedulingEventType(Inst))
2673 ScoreBrackets->updateByEvent(*ET, Inst);
2674 }
2675
2676 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
2677 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2678 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2679 ScoreBrackets->updateByEvent(GDS_ACCESS, Inst);
2680 ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst);
2681 ScoreBrackets->setPendingGDS();
2682 } else {
2683 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2684 }
2685 } else if (TII->isFLAT(Inst)) {
2687 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2688 return;
2689 }
2690
2691 assert(Inst.mayLoadOrStore());
2692
2693 int FlatASCount = 0;
2694
2695 if (TII->mayAccessVMEMThroughFlat(Inst)) {
2696 ++FlatASCount;
2697 IsVMEMAccess = true;
2698 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2699 }
2700
2701 if (TII->mayAccessLDSThroughFlat(Inst)) {
2702 ++FlatASCount;
2703 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2704 }
2705
2706 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
2707 // pointers. They do have two operands that each access global and LDS, thus
2708 // making it appear at this point that they are using a flat pointer. Filter
2709 // them out, and for the rest, generate a dependency on flat pointers so
2710 // that both VM and LGKM counters are flushed.
2711 if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1)
2712 ScoreBrackets->setPendingFlat();
2713 } else if (SIInstrInfo::isVMEM(Inst) &&
2714 !AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
2715 IsVMEMAccess = true;
2716 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2717
2718 if (ST->vmemWriteNeedsExpWaitcnt() &&
2719 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2720 ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst);
2721 }
2722 } else if (TII->isSMRD(Inst)) {
2723 IsSMEMAccess = true;
2724 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2725 } else if (Inst.isCall()) {
2726 // Act as a wait on everything
2727 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2728 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2729 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2730 ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
2731 } else if (TII->isVINTERP(Inst)) {
2732 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2733 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2734 } else if (SIInstrInfo::isEXP(Inst)) {
2735 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2736 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2737 ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst);
2738 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2739 ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst);
2740 else
2741 ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst);
2742 } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
2743 ScoreBrackets->updateByEvent(SCC_WRITE, Inst);
2744 } else {
2745 switch (Inst.getOpcode()) {
2746 case AMDGPU::S_SENDMSG:
2747 case AMDGPU::S_SENDMSG_RTN_B32:
2748 case AMDGPU::S_SENDMSG_RTN_B64:
2749 case AMDGPU::S_SENDMSGHALT:
2750 ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst);
2751 break;
2752 case AMDGPU::S_MEMTIME:
2753 case AMDGPU::S_MEMREALTIME:
2754 case AMDGPU::S_GET_BARRIER_STATE_M0:
2755 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2756 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2757 break;
2758 }
2759 }
2760
2761 if (!ST->hasWaitXCnt())
2762 return;
2763
2764 if (IsVMEMAccess)
2765 ScoreBrackets->updateByEvent(VMEM_GROUP, Inst);
2766
2767 if (IsSMEMAccess)
2768 ScoreBrackets->updateByEvent(SMEM_GROUP, Inst);
2769}
2770
2771bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2772 unsigned OtherScore) {
2773 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2774 unsigned OtherShifted =
2775 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2776 Score = std::max(MyShifted, OtherShifted);
2777 return OtherShifted > MyShifted;
2778}
2779
2780 /// Merge the pending events and associated score brackets of \p Other into
2781 /// this set of brackets.
2782///
2783/// Returns whether the merge resulted in a change that requires tighter waits
2784/// (i.e. the merged brackets strictly dominate the original brackets).
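/// For example, if this block's scores span [10, 13] (three pending events)
/// and \p Other's span [5, 7] (two pending events), both ranges are rebased to
/// a common upper bound of 13; an entry sitting at Other's upper bound (7) is
/// shifted to 13, so it still reads as the newest pending event.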
2785bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2786 bool StrictDom = false;
2787
2788 // Check if "other" has keys we don't have, and create default entries for
2789 // those. If they remain empty after merging, we clean them up afterwards.
2790 for (auto K : Other.VMem.keys())
2791 VMem.try_emplace(K);
2792 for (auto K : Other.SGPRs.keys())
2793 SGPRs.try_emplace(K);
2794
2795 for (auto T : inst_counter_types(Context->MaxCounter)) {
2796 // Merge event flags for this counter
2797 const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
2798 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2799 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2800 if (OtherEvents & ~OldEvents)
2801 StrictDom = true;
2802 PendingEvents |= OtherEvents;
2803
2804 // Merge scores for this counter
2805 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2806 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2807 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2808 if (NewUB < ScoreLBs[T])
2809 report_fatal_error("waitcnt score overflow");
2810
2811 MergeInfo M;
2812 M.OldLB = ScoreLBs[T];
2813 M.OtherLB = Other.ScoreLBs[T];
2814 M.MyShift = NewUB - ScoreUBs[T];
2815 M.OtherShift = NewUB - Other.ScoreUBs[T];
2816
2817 ScoreUBs[T] = NewUB;
2818
2819 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2820
2821 if (T == DS_CNT)
2822 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
2823
2824 if (T == KM_CNT) {
2825 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
2826 if (Other.hasPendingEvent(SCC_WRITE)) {
2827 unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
2828 if (!OldEventsHasSCCWrite) {
2829 PendingSCCWrite = Other.PendingSCCWrite;
2830 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
2831 PendingSCCWrite = nullptr;
2832 }
2833 }
2834 }
2835
2836 for (auto &[RegID, Info] : VMem)
2837 StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
2838
2839 if (isSmemCounter(T)) {
2840 unsigned Idx = getSgprScoresIdx(T);
2841 for (auto &[RegID, Info] : SGPRs) {
2842 auto It = Other.SGPRs.find(RegID);
2843 unsigned OtherScore =
2844 (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
2845 StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore);
2846 }
2847 }
2848 }
2849
2850 for (auto &[TID, Info] : VMem) {
2851 if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
2852 unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
2853 StrictDom |= NewVmemTypes != Info.VMEMTypes;
2854 Info.VMEMTypes = NewVmemTypes;
2855 }
2856 }
2857
2858 purgeEmptyTrackingData();
2859 return StrictDom;
2860}
2861
2862static bool isWaitInstr(MachineInstr &Inst) {
2863 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2864 return Opcode == AMDGPU::S_WAITCNT ||
2865 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2866 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2867 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2868 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2869 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2870 counterTypeForInstr(Opcode).has_value();
2871}
2872
2873void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
2875 bool ExpertMode) const {
2876 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
2878 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SETREG_IMM32_B32))
2879 .addImm(ExpertMode ? 2 : 0)
2880 .addImm(EncodedReg);
2881}
2882
2883// Generate s_waitcnt instructions where needed.
2884bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2885 MachineBasicBlock &Block,
2886 WaitcntBrackets &ScoreBrackets) {
2887 bool Modified = false;
2888
2889 LLVM_DEBUG({
2890 dbgs() << "*** Begin Block: ";
2891 Block.printName(dbgs());
2892 ScoreBrackets.dump();
2893 });
2894
2895 // Track the correctness of vccz through this basic block. There are two
2896 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2897 // ST->partialVCCWritesUpdateVCCZ().
2898 bool VCCZCorrect = true;
2899 if (ST->hasReadVCCZBug()) {
2900 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2901 // to vcc and then issued an smem load.
2902 VCCZCorrect = false;
2903 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2904 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2905 // to vcc_lo or vcc_hi.
2906 VCCZCorrect = false;
2907 }
2908
2909 // Walk over the instructions.
2910 MachineInstr *OldWaitcntInstr = nullptr;
2911
2912 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2913 E = Block.instr_end();
2914 Iter != E;) {
2915 MachineInstr &Inst = *Iter;
2916 if (Inst.isMetaInstruction()) {
2917 ++Iter;
2918 continue;
2919 }
2920
2921 // Track pre-existing waitcnts that were added in earlier iterations or by
2922 // the memory legalizer.
2923 if (isWaitInstr(Inst) ||
2924 (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
2925 if (!OldWaitcntInstr)
2926 OldWaitcntInstr = &Inst;
2927 ++Iter;
2928 continue;
2929 }
2930
2931 PreheaderFlushFlags FlushFlags;
2932 if (Block.getFirstTerminator() == Inst)
2933 FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
2934
2935 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2936 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2937 FlushFlags);
2938 OldWaitcntInstr = nullptr;
2939
2940 // Restore vccz if it's not known to be correct already.
2941 bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst);
2942
2943 // Don't examine operands unless we need to track vccz correctness.
2944 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2945 if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2946 Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2947 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2948 if (!ST->partialVCCWritesUpdateVCCZ())
2949 VCCZCorrect = false;
2950 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
2951 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
2952 // vccz bit, so when we detect that an instruction may read from a
2953 // corrupt vccz bit, we need to:
2954 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2955 // operations to complete.
2956 // 2. Restore the correct value of vccz by writing the current value
2957 // of vcc back to vcc.
2958 if (ST->hasReadVCCZBug() &&
2959 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2960 // Writes to vcc while there's an outstanding smem read may get
2961 // clobbered as soon as any read completes.
2962 VCCZCorrect = false;
2963 } else {
2964 // Writes to vcc will fix any incorrect value in vccz.
2965 VCCZCorrect = true;
2966 }
2967 }
2968 }
2969
2970 if (TII->isSMRD(Inst)) {
2971 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2972 // No need to handle invariant loads when avoiding WAR conflicts, as
2973 // there cannot be a vector store to the same memory location.
2974 if (!Memop->isInvariant()) {
2975 const Value *Ptr = Memop->getValue();
2976 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2977 }
2978 }
2979 if (ST->hasReadVCCZBug()) {
2980 // This smem read could complete and clobber vccz at any time.
2981 VCCZCorrect = false;
2982 }
2983 }
2984
2985 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2986
2987 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
2988
2989 LLVM_DEBUG({
2990 Inst.print(dbgs());
2991 ScoreBrackets.dump();
2992 });
2993
2994 // TODO: Remove this work-around after fixing the scheduler and enable the
2995 // assert above.
2996 if (RestoreVCCZ) {
2997 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2998 // bit is updated, so we can restore the bit by reading the value of
2999 // vcc and then writing it back to the register.
3000 BuildMI(Block, Inst, Inst.getDebugLoc(),
3001 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
3002 TRI->getVCC())
3003 .addReg(TRI->getVCC());
3004 VCCZCorrect = true;
3005 Modified = true;
3006 }
3007
3008 ++Iter;
3009 }
3010
3011 // Flush counters at the end of the block if needed (for preheaders with no
3012 // terminator).
3013 AMDGPU::Waitcnt Wait;
3014 if (Block.getFirstTerminator() == Block.end()) {
3015 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3016 if (FlushFlags.FlushVmCnt) {
3017 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
3018 Wait.LoadCnt = 0;
3019 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
3020 Wait.SampleCnt = 0;
3021 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
3022 Wait.BvhCnt = 0;
3023 }
3024 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
3025 Wait.DsCnt = 0;
3026 }
3027
3028 // Combine or remove any redundant waitcnts at the end of the block.
3029 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
3030 OldWaitcntInstr);
3031
3032 LLVM_DEBUG({
3033 dbgs() << "*** End Block: ";
3034 Block.printName(dbgs());
3035 ScoreBrackets.dump();
3036 });
3037
3038 return Modified;
3039}
3040
3041// Return flags indicating which counters should be flushed in the preheader.
3042PreheaderFlushFlags
3043SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
3044 const WaitcntBrackets &ScoreBrackets) {
3045 auto [Iterator, IsInserted] =
3046 PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags());
3047 if (!IsInserted)
3048 return Iterator->second;
3049
3050 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3051 if (!Succ)
3052 return PreheaderFlushFlags();
3053
3054 MachineLoop *Loop = MLI->getLoopFor(Succ);
3055 if (!Loop)
3056 return PreheaderFlushFlags();
3057
3058 if (Loop->getLoopPreheader() == &MBB) {
3059 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3060 return Iterator->second;
3061 }
3062
3063 return PreheaderFlushFlags();
3064}
3065
3066bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
3067 if (SIInstrInfo::isFLAT(MI))
3068 return TII->mayAccessVMEMThroughFlat(MI);
3069 return SIInstrInfo::isVMEM(MI);
3070}
3071
3072bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
3073 return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
3074}
3075
3076// Check if instruction is a store to LDS that is counted via DSCNT
3077// (where that counter exists).
3078bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3079 if (!MI.mayStore())
3080 return false;
3081 if (SIInstrInfo::isDS(MI))
3082 return true;
3083 return false;
3084}
3085
3086// Return flags indicating which counters should be flushed in the preheader of
3087// the given loop. We currently decide to flush in a few situations:
3088// For VMEM (FlushVmCnt):
3089// 1. The loop contains vmem store(s), no vmem load and at least one use of a
3090// vgpr containing a value that is loaded outside of the loop. (Only on
3091// targets with no vscnt counter).
3092// 2. The loop contains vmem load(s), but the loaded values are not used in the
3093// loop, and at least one use of a vgpr containing a value that is loaded
3094// outside of the loop.
3095// For DS (FlushDsCnt, GFX12+ only):
3096// 3. The loop contains no DS reads, and at least one use of a vgpr containing
3097// a value that is DS loaded outside of the loop.
3098// 4. The loop contains DS read(s), loaded values are not used in the same
3099// iteration but in the next iteration (prefetch pattern), and at least one
3100// use of a vgpr containing a value that is DS loaded outside of the loop.
3101// Flushing in preheader reduces wait overhead if the wait requirement in
3102// iteration 1 would otherwise be more strict.
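// For example, in the prefetch pattern of case 4, iteration N consumes a value
// that was DS-loaded in iteration N-1. Flushing dscnt in the preheader keeps
// the first iteration's wait no stricter than the waits in later iterations.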
3103PreheaderFlushFlags
3104SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
3105 const WaitcntBrackets &Brackets) {
3106 PreheaderFlushFlags Flags;
3107 bool HasVMemLoad = false;
3108 bool HasVMemStore = false;
3109 bool SeenDSStoreInLoop = false;
3110 bool UsesVgprLoadedOutsideVMEM = false;
3111 bool UsesVgprLoadedOutsideDS = false;
3112 bool VMemInvalidated = false;
3113 // DS optimization only applies to GFX12+ where DS_CNT is separate.
3114 bool DSInvalidated = !ST->hasExtendedWaitCounts();
3115 DenseSet<MCRegUnit> VgprUse;
3116 DenseSet<MCRegUnit> VgprDefVMEM;
3117 DenseSet<MCRegUnit> VgprDefDS;
3118
3119 for (MachineBasicBlock *MBB : ML->blocks()) {
3120 bool SeenDSStoreInCurrMBB = false;
3121 for (MachineInstr &MI : *MBB) {
3122 if (isVMEMOrFlatVMEM(MI)) {
3123 HasVMemLoad |= MI.mayLoad();
3124 HasVMemStore |= MI.mayStore();
3125 }
3126 if (mayStoreIncrementingDSCNT(MI))
3127 SeenDSStoreInCurrMBB = true;
3128 // Stores postdominated by a barrier will have a wait at the barrier
3129 // and thus do not need to be waited for at the loop header. A barrier
3130 // found later in the same MBB during the in-order traversal is used here
3131 // as a cheaper alternative to a postdomination check.
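// For instance (illustrative only): "ds_store_b32 v0, v1" followed later in
// the same MBB by "s_barrier" resets SeenDSStoreInCurrMBB, so that store does
// not block the DS flush decision below.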
3132 if (MI.getOpcode() == AMDGPU::S_BARRIER)
3133 SeenDSStoreInCurrMBB = false;
3134 for (const MachineOperand &Op : MI.all_uses()) {
3135 if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
3136 continue;
3137 // Vgpr use
3138 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
3139 // If we find a register that is loaded inside the loop, 1. and 2.
3140 // are invalidated.
3141 if (VgprDefVMEM.contains(RU))
3142 VMemInvalidated = true;
3143
3144 // Check for DS loads used inside the loop
3145 if (VgprDefDS.contains(RU))
3146 DSInvalidated = true;
3147
3148 // Early exit if both optimizations are invalidated
3149 if (VMemInvalidated && DSInvalidated)
3150 return Flags;
3151
3152 VgprUse.insert(RU);
3153 // Check if this register has a pending VMEM load from outside the
3154 // loop (value loaded outside and used inside).
3155 VMEMID ID = toVMEMID(RU);
3156 bool HasPendingVMEM =
3157 Brackets.getVMemScore(ID, LOAD_CNT) >
3158 Brackets.getScoreLB(LOAD_CNT) ||
3159 Brackets.getVMemScore(ID, SAMPLE_CNT) >
3160 Brackets.getScoreLB(SAMPLE_CNT) ||
3161 Brackets.getVMemScore(ID, BVH_CNT) > Brackets.getScoreLB(BVH_CNT);
3162 if (HasPendingVMEM)
3163 UsesVgprLoadedOutsideVMEM = true;
3164 // Check if loaded outside the loop via DS (not VMEM/FLAT).
3165 // Only consider it a DS load if there's no pending VMEM load for
3166 // this register, since FLAT can set both counters.
3167 if (!HasPendingVMEM &&
3168 Brackets.getVMemScore(ID, DS_CNT) > Brackets.getScoreLB(DS_CNT))
3169 UsesVgprLoadedOutsideDS = true;
3170 }
3171 }
3172
3173 // VMem load vgpr def
3174 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
3175 for (const MachineOperand &Op : MI.all_defs()) {
3176 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
3177 // If we find a register that is loaded inside the loop, 1. and 2.
3178 // are invalidated.
3179 if (VgprUse.contains(RU))
3180 VMemInvalidated = true;
3181 VgprDefVMEM.insert(RU);
3182 }
3183 }
3184 // Early exit if both optimizations are invalidated
3185 if (VMemInvalidated && DSInvalidated)
3186 return Flags;
3187 }
3188
3189 // DS read vgpr def
3190 // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RU).
3191 // If USE comes before DEF, it's the prefetch pattern (use value from
3192 // previous iteration, load for next iteration). We should still flush
3193 // in preheader so iteration 1 doesn't need to wait inside the loop.
3194 // Only invalidate when DEF comes before USE (same-iteration consumption,
3195 // checked above when processing uses).
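// An illustrative sketch of the prefetch pattern (mnemonics are examples only):
//
//   ds_load_b32 v0, v1          ; loaded in the preheader
// bb.loop:
//   v_mul_f32 v2, v0, v2        ; USE of the value from the previous load
//   ds_load_b32 v0, v1          ; DEF for the next iteration
//   s_cbranch_scc1 bb.loop
//
// Flushing DS_CNT in the preheader covers the use in iteration 1.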
3196 if (isDSRead(MI)) {
3197 for (const MachineOperand &Op : MI.all_defs()) {
3198 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
3199 VgprDefDS.insert(RU);
3200 }
3201 }
3202 }
3203 }
3204 // Accumulate unprotected DS stores from this MBB
3205 SeenDSStoreInLoop |= SeenDSStoreInCurrMBB;
3206 }
3207
3208 // VMEM flush decision
3209 if (!VMemInvalidated && UsesVgprLoadedOutsideVMEM &&
3210 ((!ST->hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3211 (HasVMemLoad && ST->hasVmemWriteVgprInOrder())))
3212 Flags.FlushVmCnt = true;
3213
3214 // DS flush decision: flush if the loop uses DS-loaded values from outside,
3215 // contains no unprotected DS stores, and either has no DS reads or only DS
3216 // reads whose results are not consumed in the same iteration.
3217 // DSInvalidated is pre-set to true on non-GFX12+ targets, where DS_CNT
3218 // is LGKM_CNT, which also tracks FLAT/SMEM.
3219 if (!DSInvalidated && !SeenDSStoreInLoop && UsesVgprLoadedOutsideDS)
3220 Flags.FlushDsCnt = true;
3221
3222 return Flags;
3223}
3224
3225bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3226 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3227 auto *PDT =
3228 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3229 AliasAnalysis *AA = nullptr;
3230 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3231 AA = &AAR->getAAResults();
3232
3233 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
3234}
3235
3236PreservedAnalyses
3237SIInsertWaitcntsPass::run(MachineFunction &MF,
3238 MachineFunctionAnalysisManager &MFAM) {
3239 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
3240 auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
3241 auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
3242 .getManager()
3243 .getCachedResult<AAManager>(MF.getFunction());
3244
3245 if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
3246 return PreservedAnalyses::all();
3247
3248 return getMachineFunctionPassPreservedAnalyses()
3249 .preserveSet<CFGAnalyses>()
3250 .preserve<AAManager>();
3251}
3252
3253bool SIInsertWaitcnts::run(MachineFunction &MF) {
3254 ST = &MF.getSubtarget<GCNSubtarget>();
3255 TII = ST->getInstrInfo();
3256 TRI = &TII->getRegisterInfo();
3257 MRI = &MF.getRegInfo();
3258 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3259
3260 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
3261
3262 // Initialize hardware limits first, as they're needed by the generators.
3263 Limits = AMDGPU::HardwareLimits(IV, ST->hasExtendedWaitCounts());
3264
3265 if (ST->hasExtendedWaitCounts()) {
3266 IsExpertMode = ST->hasExpertSchedulingMode() &&
3267 (ExpertSchedulingModeFlag.getNumOccurrences()
3268 ? ExpertSchedulingModeFlag
3269 : MF.getFunction()
3270 .getFnAttribute("amdgpu-expert-scheduling-mode")
3271 .getValueAsBool());
3272 MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
3273 WCGGFX12Plus =
3274 WaitcntGeneratorGFX12Plus(MF, MaxCounter, &Limits, IsExpertMode);
3275 WCG = &WCGGFX12Plus;
3276 } else {
3277 MaxCounter = NUM_NORMAL_INST_CNTS;
3278 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, &Limits);
3279 WCG = &WCGPreGFX12;
3280 }
3281
3282 for (auto T : inst_counter_types())
3283 ForceEmitWaitcnt[T] = false;
3284
3285 WaitEventMaskForInst = WCG->getWaitEventMask();
3286
3287 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
3288
3289 BlockInfos.clear();
3290 bool Modified = false;
3291
3292 MachineBasicBlock &EntryBB = MF.front();
3293
3294 if (!MFI->isEntryFunction()) {
3295 // Wait for any outstanding memory operations that the input registers may
3296 // depend on. We can't track them and it's better to do the wait after the
3297 // costly call sequence.
3298
3299 // TODO: Could insert earlier and schedule more liberally with operations
3300 // that only use caller preserved registers.
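// For example, for a non-entry function on a GFX12 target with image
// instructions, the code below inserts roughly:
//   s_wait_loadcnt_dscnt 0x0
//   s_wait_expcnt 0x0
//   s_wait_samplecnt 0x0
//   s_wait_bvhcnt 0x0
//   s_wait_kmcnt 0x0
// Pre-GFX12 targets instead get a single "s_waitcnt 0".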
3301 MachineBasicBlock::iterator I = EntryBB.begin();
3302 while (I != EntryBB.end() && I->isMetaInstruction())
3303 ++I;
3304
3305 if (ST->hasExtendedWaitCounts()) {
3306 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
3307 .addImm(0);
3308 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
3309 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
3310 continue;
3311
3312 if (!ST->hasImageInsts() &&
3313 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
3314 continue;
3315
3316 BuildMI(EntryBB, I, DebugLoc(),
3317 TII->get(instrsForExtendedCounterTypes[CT]))
3318 .addImm(0);
3319 }
3320 if (IsExpertMode) {
3321 unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, *ST);
3322 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, 0, *ST);
3323 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3324 .addImm(Enc);
3325 }
3326 } else {
3327 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
3328 }
3329
3330 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
3331 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3332 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3333
3334 Modified = true;
3335 }
3336
3337 // Keep iterating over the blocks in reverse post order, inserting and
3338 // updating s_waitcnt where needed, until a fixed point is reached.
3339 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3340 BlockInfos.try_emplace(MBB);
3341
3342 std::unique_ptr<WaitcntBrackets> Brackets;
3343 bool Repeat;
3344 do {
3345 Repeat = false;
3346
3347 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
3348 ++BII) {
3349 MachineBasicBlock *MBB = BII->first;
3350 BlockInfo &BI = BII->second;
3351 if (!BI.Dirty)
3352 continue;
3353
3354 if (BI.Incoming) {
3355 if (!Brackets)
3356 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3357 else
3358 *Brackets = *BI.Incoming;
3359 } else {
3360 if (!Brackets)
3361 Brackets = std::make_unique<WaitcntBrackets>(this);
3362 else
3363 *Brackets = WaitcntBrackets(this);
3364 }
3365
3366 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
3367 BI.Dirty = false;
3368
3369 if (Brackets->hasPendingEvent()) {
3370 BlockInfo *MoveBracketsToSucc = nullptr;
3371 for (MachineBasicBlock *Succ : MBB->successors()) {
3372 auto *SuccBII = BlockInfos.find(Succ);
3373 BlockInfo &SuccBI = SuccBII->second;
3374 if (!SuccBI.Incoming) {
3375 SuccBI.Dirty = true;
3376 if (SuccBII <= BII) {
3377 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
3378 Repeat = true;
3379 }
3380 if (!MoveBracketsToSucc) {
3381 MoveBracketsToSucc = &SuccBI;
3382 } else {
3383 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3384 }
3385 } else if (SuccBI.Incoming->merge(*Brackets)) {
3386 SuccBI.Dirty = true;
3387 if (SuccBII <= BII) {
3388 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
3389 Repeat = true;
3390 }
3391 }
3392 }
3393 if (MoveBracketsToSucc)
3394 MoveBracketsToSucc->Incoming = std::move(Brackets);
3395 }
3396 }
3397 } while (Repeat);
3398
3399 if (ST->hasScalarStores()) {
3400 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3401 bool HaveScalarStores = false;
3402
3403 for (MachineBasicBlock &MBB : MF) {
3404 for (MachineInstr &MI : MBB) {
3405 if (!HaveScalarStores && TII->isScalarStore(MI))
3406 HaveScalarStores = true;
3407
3408 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
3409 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3410 EndPgmBlocks.push_back(&MBB);
3411 }
3412 }
3413
3414 if (HaveScalarStores) {
3415 // If scalar writes are used, the cache must be flushed or else the next
3416 // wave to reuse the same scratch memory can have its data clobbered.
3417 //
3418 // Insert s_dcache_wb at wave termination points if there were any scalar
3419 // stores, and only if the cache hasn't already been flushed. This could
3420 // be improved by looking across blocks for flushes in postdominating
3421 // blocks from the stores but an explicitly requested flush is probably
3422 // very rare.
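// For example, a wave termination sequence with an unprotected scalar store
// becomes (sketch only):
//   s_store_dword s0, s[4:5], 0x0
//   ...
//   s_dcache_wb
//   s_endpgm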
3423 for (MachineBasicBlock *MBB : EndPgmBlocks) {
3424 bool SeenDCacheWB = false;
3425
3426 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
3427 I != E; ++I) {
3428 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
3429 SeenDCacheWB = true;
3430 else if (TII->isScalarStore(*I))
3431 SeenDCacheWB = false;
3432
3433 // FIXME: It would be better to insert this before a waitcnt if any.
3434 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
3435 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3436 !SeenDCacheWB) {
3437 Modified = true;
3438 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
3439 }
3440 }
3441 }
3442 }
3443 }
3444
3445 if (IsExpertMode) {
3446 // Enable expert scheduling on function entry. To satisfy ABI requirements
3447 // and to allow calls between functions with different expert scheduling
3448 // settings, disable it around calls and before returns.
3449
3450 MachineBasicBlock::iterator I = EntryBB.begin();
3451 while (I != EntryBB.end() && I->isMetaInstruction())
3452 ++I;
3453 setSchedulingMode(EntryBB, I, true);
3454
3455 for (MachineInstr *MI : CallInsts) {
3456 MachineBasicBlock &MBB = *MI->getParent();
3457 setSchedulingMode(MBB, MI, false);
3458 setSchedulingMode(MBB, std::next(MI->getIterator()), true);
3459 }
3460
3461 for (MachineInstr *MI : ReturnInsts)
3462 setSchedulingMode(*MI->getParent(), MI, false);
3463
3464 Modified = true;
3465 }
3466
3467 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
3468 // This is done in different ways depending on how the VGPRs were allocated
3469 // (i.e. whether we're in dynamic VGPR mode or not).
3470 // Skip deallocation if the kernel is waveslot limited rather than VGPR
3471 // limited: a short waveslot-limited kernel runs slower with the deallocation.
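// Sketch of the two cases handled below (operands are illustrative):
//   dynamic VGPR mode:  s_alloc_vgpr 0 immediately before the S_ENDPGM
//   otherwise:          s_nop 0 (only if the target requires it) followed by
//                       s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)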
3472 if (MFI->isDynamicVGPREnabled()) {
3473 for (MachineInstr *MI : ReleaseVGPRInsts) {
3474 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3475 TII->get(AMDGPU::S_ALLOC_VGPR))
3476 .addImm(0);
3477 Modified = true;
3478 }
3479 } else {
3480 if (!ReleaseVGPRInsts.empty() &&
3481 (MF.getFrameInfo().hasCalls() ||
3482 ST->getOccupancyWithNumVGPRs(
3483 TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
3484 /*IsDynamicVGPR=*/false) <
3485 AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
3486 for (MachineInstr *MI : ReleaseVGPRInsts) {
3487 if (ST->requiresNopBeforeDeallocVGPRs()) {
3488 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3489 TII->get(AMDGPU::S_NOP))
3490 .addImm(0);
3491 }
3492 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3493 TII->get(AMDGPU::S_SENDMSG))
3494 .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
3495 Modified = true;
3496 }
3497 }
3498 }
3499
3500 CallInsts.clear();
3501 ReturnInsts.clear();
3502 ReleaseVGPRInsts.clear();
3503 PreheadersToFlush.clear();
3504 SLoadAddresses.clear();
3505
3506 return Modified;
3507}