LLVM 23.0.0git
SIInsertWaitcnts.cpp
Go to the documentation of this file.
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "AMDGPUWaitcntUtils.h"
28#include "GCNSubtarget.h"
32#include "llvm/ADT/MapVector.h"
34#include "llvm/ADT/Sequence.h"
40#include "llvm/IR/Dominators.h"
44
45using namespace llvm;
46
47#define DEBUG_TYPE "si-insert-waitcnts"
48
// Debug counters that let a developer force full waits for specific hardware
// counters (exp/lgkm/vm) via -debug-counter, for triaging waitcnt bugs.
// Consumed in SIInsertWaitcnts::setForceEmitWaitcnt().
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");
55
// Command-line override: emit every waitcnt as a full wait-for-zero on all
// counters. Useful for isolating waitcnt-related miscompiles.
static cl::opt<bool>
    ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
                      cl::desc("Force all waitcnt instrs to be emitted as "
                               "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
                      cl::init(false), cl::Hidden);
61
63 "amdgpu-waitcnt-load-forcezero",
64 cl::desc("Force all waitcnt load counters to wait until 0"),
65 cl::init(false), cl::Hidden);
66
68 "amdgpu-expert-scheduling-mode",
69 cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
70 cl::init(false), cl::Hidden);
71
72namespace {
73// Get the maximum wait count value for a given counter type.
74static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
76 switch (T) {
78 return Limits.LoadcntMax;
79 case AMDGPU::DS_CNT:
80 return Limits.DscntMax;
81 case AMDGPU::EXP_CNT:
82 return Limits.ExpcntMax;
84 return Limits.StorecntMax;
86 return Limits.SamplecntMax;
87 case AMDGPU::BVH_CNT:
88 return Limits.BvhcntMax;
89 case AMDGPU::KM_CNT:
90 return Limits.KmcntMax;
91 case AMDGPU::X_CNT:
92 return Limits.XcntMax;
93 case AMDGPU::VA_VDST:
94 return Limits.VaVdstMax;
95 case AMDGPU::VM_VSRC:
96 return Limits.VmVsrcMax;
97 default:
98 return 0;
99 }
100}
101
/// Integer IDs used to track vector memory locations we may have to wait on.
/// Encoded as u16 chunks:
///
/// [0,            REGUNITS_END ): MCRegUnit
/// [LDSDMA_BEGIN, LDSDMA_END  ) : LDS DMA IDs
///
/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
/// It gives (1 << 16) entries per category which is more than enough
/// for all register units. MCPhysReg is u16 so we don't even support >u16
/// physical register numbers at this time, let alone >u16 register units.
/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
/// is enough for all register units.
using VMEMID = uint32_t;

enum : VMEMID {
  // Size of each ID chunk: 2^16 entries.
  TRACKINGID_RANGE_LEN = (1 << 16),

  // Important: MCRegUnits must always be tracked starting from 0, as we
  // need to be able to convert between a MCRegUnit and a VMEMID freely.
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,

  // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
  // entry, which is updated for all LDS DMA operations encountered.
  // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_BEGIN = REGUNITS_END,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};
131
/// Convert a MCRegUnit to a VMEMID.
/// The identity mapping is valid because register units occupy the
/// [REGUNITS_BEGIN, REGUNITS_END) range, which starts at 0 (see enum above).
static constexpr VMEMID toVMEMID(MCRegUnit RU) {
  return static_cast<unsigned>(RU);
}
136
// X-macro listing every wait event this pass tracks. Instantiated with
// different DECL definitions to generate the WaitEventType enum and the
// parallel WaitEventTypeName table, keeping the two in sync by construction.
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
  DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */    \
  DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */         \
  DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */                 \
  DECL(GLOBAL_INV_ACCESS) /* GLOBAL_INV (gfx12+ only) */                       \
  DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */                 \
  DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */              \
  DECL(VMEM_GROUP) /* vmem group */                                            \
  DECL(LDS_ACCESS) /* lds read & write */                                      \
  DECL(GDS_ACCESS) /* gds read & write */                                      \
  DECL(SQ_MESSAGE) /* send message */                                          \
  DECL(SCC_WRITE) /* write to SCC from barrier */                              \
  DECL(SMEM_ACCESS) /* scalar-memory read & write */                           \
  DECL(SMEM_GROUP) /* scalar-memory group */                                   \
  DECL(EXP_GPR_LOCK) /* export holding on its data src */                      \
  DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */                \
  DECL(EXP_POS_ACCESS) /* write to export position */                          \
  DECL(EXP_PARAM_ACCESS) /* write to export parameter */                       \
  DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */                  \
  DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */                 \
  DECL(VGPR_CSMACC_WRITE) /* write VGPR dest in Core/Side-MACC VALU */         \
  DECL(VGPR_DPMACC_WRITE) /* write VGPR dest in DPMACC VALU */                 \
  DECL(VGPR_TRANS_WRITE) /* write VGPR dest in TRANS VALU */                   \
  DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */                       \
  DECL(VGPR_LDS_READ) /* read VGPR source in LDS */                            \
  DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */                          \
  DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */                    \
  DECL(ASYNC_ACCESS) /* access that uses ASYNC_CNT */
165
166// clang-format off
167#define AMDGPU_EVENT_ENUM(Name) Name,
168enum WaitEventType {
170 NUM_WAIT_EVENTS
171};
172#undef AMDGPU_EVENT_ENUM
173} // namespace
174
175namespace llvm {
// Opt WaitEventType into llvm::enum_seq() iteration (used by wait_events()).
template <> struct enum_iteration_traits<WaitEventType> {
  static constexpr bool is_iterable = true;
};
179} // namespace llvm
180
181namespace {
182
/// Return an iterator over all events between VMEM_ACCESS (the first event)
/// and \c MaxEvent (exclusive; the default value yields an enumeration over
/// all events).
auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
  return enum_seq(VMEM_ACCESS, MaxEvent);
}
189
190#define AMDGPU_EVENT_NAME(Name) #Name,
191static constexpr StringLiteral WaitEventTypeName[] = {
193};
194#undef AMDGPU_EVENT_NAME
/// \returns the printable name of \p Event (indexes the generated table).
static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
  return WaitEventTypeName[Event];
}
// clang-format on
199
// Enumerate different types of result-returning VMEM operations. Although
// s_waitcnt orders them all with a single vmcnt counter, in the absence of
// s_waitcnt only instructions of the same VmemType are guaranteed to write
// their results in order -- so there is no need to insert an s_waitcnt between
// two instructions of the same type that write the same vgpr.
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
  // BVH instructions
  VMEM_BVH,
  // Number of VMEM types; must remain the last enumerator.
  NUM_VMEM_TYPES
};
214
// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true, and does not cover VA_VDST or VM_VSRC.
// Indexed by AMDGPU::InstCounterType; order must match that enum.
static const unsigned
    instrsForExtendedCounterTypes[AMDGPU::NUM_EXTENDED_INST_CNTS] = {
        AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT,
        AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT,
        AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
        AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT,
        AMDGPU::S_WAIT_ASYNCCNT};
225
226// ASYNCMARK and WAIT_ASYNCMARK are meta instructions that emit no hardware
227// code but still need to be processed by this pass for async vmcnt tracking.
228static bool isNonWaitcntMetaInst(const MachineInstr &MI) {
229 switch (MI.getOpcode()) {
230 case AMDGPU::ASYNCMARK:
231 case AMDGPU::WAIT_ASYNCMARK:
232 return false;
233 default:
234 return MI.isMetaInstruction();
235 }
236}
237
238static bool updateVMCntOnly(const MachineInstr &Inst) {
239 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
241}
242
#ifndef NDEBUG
// Assertion helper: \returns true when MaxCounter covers exactly the legacy
// ("normal", pre-extended) set of counters.
static bool isNormalMode(AMDGPU::InstCounterType MaxCounter) {
  return MaxCounter == AMDGPU::NUM_NORMAL_INST_CNTS;
}
#endif // NDEBUG
248
249VmemType getVmemType(const MachineInstr &Inst) {
250 assert(updateVMCntOnly(Inst));
251 if (!SIInstrInfo::isImage(Inst))
252 return VMEM_NOSAMPLER;
253 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
254 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
255 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
256
257 if (BaseInfo->BVH)
258 return VMEM_BVH;
259
260 // We have to make an additional check for isVSAMPLE here since some
261 // instructions don't have a sampler, but are still classified as sampler
262 // instructions for the purposes of e.g. waitcnt.
263 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
264 return VMEM_SAMPLER;
265
266 return VMEM_NOSAMPLER;
267}
268
269void addWait(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T, unsigned Count) {
270 Wait.set(T, std::min(Wait.get(T), Count));
271}
272
274 Wait.set(T, ~0u);
275}
276
/// A small set of events.
/// Backed by a single bitmask, one bit per WaitEventType, so every set
/// operation is a constant-time integer operation.
class WaitEventSet {
  unsigned Mask = 0; // bit i set <=> event i is in the set

public:
  WaitEventSet() = default;
  explicit constexpr WaitEventSet(WaitEventType Event) {
    static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8,
                  "Not enough bits in Mask for all the events");
    Mask |= 1 << Event;
  }
  constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
    for (auto &E : Events) {
      Mask |= 1 << E;
    }
  }
  void insert(const WaitEventType &Event) { Mask |= 1 << Event; }
  void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); }
  /// Remove every element of \p Other from this set.
  void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; }
  bool contains(const WaitEventType &Event) const {
    return Mask & (1 << Event);
  }
  /// \Returns true if this set contains all elements of \p Other.
  bool contains(const WaitEventSet &Other) const {
    return (~Mask & Other.Mask) == 0;
  }
  /// \Returns the intersection of this and \p Other.
  WaitEventSet operator&(const WaitEventSet &Other) const {
    auto Copy = *this;
    Copy.Mask &= Other.Mask;
    return Copy;
  }
  /// \Returns the union of this and \p Other.
  WaitEventSet operator|(const WaitEventSet &Other) const {
    auto Copy = *this;
    Copy.Mask |= Other.Mask;
    return Copy;
  }
  /// This set becomes the union of this and \p Other.
  WaitEventSet &operator|=(const WaitEventSet &Other) {
    Mask |= Other.Mask;
    return *this;
  }
  /// This set becomes the intersection of this and \p Other.
  WaitEventSet &operator&=(const WaitEventSet &Other) {
    Mask &= Other.Mask;
    return *this;
  }
  bool operator==(const WaitEventSet &Other) const {
    return Mask == Other.Mask;
  }
  bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); }
  bool empty() const { return Mask == 0; }
  /// \Returns true if the set contains more than one element.
  /// (Mask & (Mask - 1)) clears the lowest set bit; the result is nonzero
  /// iff at least two bits were set.
  bool twoOrMore() const { return Mask & (Mask - 1); }
  operator bool() const { return !empty(); }
  /// Print the contained event names, comma-separated, to \p OS.
  void print(raw_ostream &OS) const {
    ListSeparator LS(", ");
    for (WaitEventType Event : wait_events()) {
      if (contains(Event))
        OS << LS << getWaitEventTypeName(Event);
    }
  }
  LLVM_DUMP_METHOD void dump() const;
};
342
343void WaitEventSet::dump() const {
344 print(dbgs());
345 dbgs() << "\n";
346}
347
348class WaitcntBrackets;
349
350// This abstracts the logic for generating and updating S_WAIT* instructions
351// away from the analysis that determines where they are needed. This was
352// done because the set of counters and instructions for waiting on them
353// underwent a major shift with gfx12, sufficiently so that having this
354// abstraction allows the main analysis logic to be simpler than it would
355// otherwise have had to become.
356class WaitcntGenerator {
357protected:
358 const GCNSubtarget &ST;
359 const SIInstrInfo &TII;
360 AMDGPU::IsaVersion IV;
361 AMDGPU::InstCounterType MaxCounter;
362 bool OptNone;
363 bool ExpandWaitcntProfiling = false;
364 const AMDGPU::HardwareLimits &Limits;
365
366public:
367 WaitcntGenerator() = delete;
368 WaitcntGenerator(const WaitcntGenerator &) = delete;
369 WaitcntGenerator(const MachineFunction &MF,
370 AMDGPU::InstCounterType MaxCounter,
371 const AMDGPU::HardwareLimits &Limits)
372 : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
373 IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter),
374 OptNone(MF.getFunction().hasOptNone() ||
375 MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
376 ExpandWaitcntProfiling(
377 MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
378 Limits(Limits) {}
379
380 // Return true if the current function should be compiled with no
381 // optimization.
382 bool isOptNone() const { return OptNone; }
383
384 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
385
386 // Edits an existing sequence of wait count instructions according
387 // to an incoming Waitcnt value, which is itself updated to reflect
388 // any new wait count instructions which may need to be generated by
389 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
390 // were made.
391 //
392 // This editing will usually be merely updated operands, but it may also
393 // delete instructions if the incoming Wait value indicates they are not
394 // needed. It may also remove existing instructions for which a wait
395 // is needed if it can be determined that it is better to generate new
396 // instructions later, as can happen on gfx12.
397 virtual bool
398 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
399 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
401
402 // Transform a soft waitcnt into a normal one.
403 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
404
405 // Generates new wait count instructions according to the value of
406 // Wait, returning true if any new instructions were created.
407 // ScoreBrackets is used for profiling expansion.
408 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
410 AMDGPU::Waitcnt Wait,
411 const WaitcntBrackets &ScoreBrackets) = 0;
412
413 // Returns the WaitEventSet that corresponds to counter \p T.
414 virtual const WaitEventSet &
415 getWaitEvents(AMDGPU::InstCounterType T) const = 0;
416
417 /// \returns the counter that corresponds to event \p E.
418 AMDGPU::InstCounterType getCounterFromEvent(WaitEventType E) const {
419 for (auto T : AMDGPU::inst_counter_types()) {
420 if (getWaitEvents(T).contains(E))
421 return T;
422 }
423 llvm_unreachable("event type has no associated counter");
424 }
425
426 // Returns a new waitcnt with all counters except VScnt set to 0. If
427 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
428 // AsyncCnt always defaults to ~0u (don't wait for it). It is only updated
429 // when a call to @llvm.amdgcn.wait.asyncmark() is processed.
430 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
431
432 virtual ~WaitcntGenerator() = default;
433};
434
435class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
436 static constexpr const WaitEventSet
437 WaitEventMaskForInstPreGFX12[AMDGPU::NUM_INST_CNTS] = {
438 WaitEventSet(
439 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
440 WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
441 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
442 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
443 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
444 WaitEventSet(),
445 WaitEventSet(),
446 WaitEventSet(),
447 WaitEventSet(),
448 WaitEventSet(),
449 WaitEventSet()};
450
451public:
452 using WaitcntGenerator::WaitcntGenerator;
453 bool
454 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
455 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
456 MachineBasicBlock::instr_iterator It) const override;
457
458 bool createNewWaitcnt(MachineBasicBlock &Block,
460 AMDGPU::Waitcnt Wait,
461 const WaitcntBrackets &ScoreBrackets) override;
462
463 const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
464 return WaitEventMaskForInstPreGFX12[T];
465 }
466
467 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
468};
469
470class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
471protected:
472 bool IsExpertMode;
473 static constexpr const WaitEventSet
474 WaitEventMaskForInstGFX12Plus[AMDGPU::NUM_INST_CNTS] = {
475 WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
476 WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
477 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
478 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
479 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
480 WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
481 WaitEventSet({VMEM_BVH_READ_ACCESS}),
482 WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
483 WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
484 WaitEventSet({ASYNC_ACCESS}),
485 WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
486 VGPR_XDL_WRITE}),
487 WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
488
489public:
490 WaitcntGeneratorGFX12Plus() = delete;
491 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
492 AMDGPU::InstCounterType MaxCounter,
493 const AMDGPU::HardwareLimits &Limits,
494 bool IsExpertMode)
495 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
496
497 bool
498 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
499 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
500 MachineBasicBlock::instr_iterator It) const override;
501
502 bool createNewWaitcnt(MachineBasicBlock &Block,
504 AMDGPU::Waitcnt Wait,
505 const WaitcntBrackets &ScoreBrackets) override;
506
507 const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
508 return WaitEventMaskForInstGFX12Plus[T];
509 }
510
511 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
512};
513
// Flags indicating which counters should be flushed in a loop preheader.
struct PreheaderFlushFlags {
  bool FlushVmCnt = false; // flush the VM (load) counter in the preheader
  bool FlushDsCnt = false; // flush the DS (LDS) counter in the preheader
};
519
// Pass context: drives the per-block dataflow over WaitcntBrackets state and
// decides where wait instructions are needed; actual emission is delegated
// to the WaitcntGenerator strategy (WCG).
class SIInsertWaitcnts {
  // Maps scalar-load address Values to a basic block.
  // NOTE(review): exact semantics are established outside this chunk —
  // verify at the use sites before relying on this description.
  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
  // Caches, per loop preheader, which counters should be flushed there.
  DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
  MachineLoopInfo &MLI;
  MachinePostDominatorTree &PDT;
  AliasAnalysis *AA = nullptr;
  MachineFunction &MF;

  // Per-block state for the iterative fixpoint over the CFG.
  struct BlockInfo {
    // Merged WaitcntBrackets state at block entry (null until first visit).
    std::unique_ptr<WaitcntBrackets> Incoming;
    // True while the block still needs (re)processing.
    bool Dirty = true;
  };

  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;

  // Debug-only per-counter overrides populated by setForceEmitWaitcnt().
  bool ForceEmitWaitcnt[AMDGPU::NUM_INST_CNTS] = {};

  // Subtarget-specific wait emission strategy (pre-gfx12 or gfx12+).
  std::unique_ptr<WaitcntGenerator> WCG;

  // Remember call and return instructions in the function.
  DenseSet<MachineInstr *> CallInsts;
  DenseSet<MachineInstr *> ReturnInsts;

  // Remember all S_ENDPGM instructions. The boolean flag is true if there might
  // be outstanding stores but definitely no outstanding scratch stores, to help
  // with insertion of DEALLOC_VGPRS messages.
  DenseMap<MachineInstr *, bool> EndPgmInsts;

  AMDGPU::HardwareLimits Limits;

public:
  const GCNSubtarget &ST;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  const MachineRegisterInfo &MRI;
  // The counter that SMEM accesses decrement on this subtarget.
  AMDGPU::InstCounterType SmemAccessCounter;
  AMDGPU::InstCounterType MaxCounter;
  bool IsExpertMode = false;

  SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
                   AliasAnalysis *AA, MachineFunction &MF)
      : MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
        TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
        MRI(MF.getRegInfo()) {
    // Reference the debug counters so builds where they are otherwise unused
    // do not warn.
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  const AMDGPU::HardwareLimits &getLimits() const { return Limits; }

  PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
                                             const WaitcntBrackets &Brackets);
  PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
                                         const WaitcntBrackets &ScoreBrackets);
  bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
  bool isDSRead(const MachineInstr &MI) const;
  bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
  // Pass entry point; returns true if the function was modified.
  bool run();

  // Refresh the ForceEmitWaitcnt[] overrides from the debug counters.
  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// For debug builds, get the debug counter info and adjust if need be
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[AMDGPU::EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[AMDGPU::EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[AMDGPU::DS_CNT] = true;
      ForceEmitWaitcnt[AMDGPU::KM_CNT] = true;
    } else {
      ForceEmitWaitcnt[AMDGPU::DS_CNT] = false;
      ForceEmitWaitcnt[AMDGPU::KM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[AMDGPU::LOAD_CNT] = true;
      ForceEmitWaitcnt[AMDGPU::SAMPLE_CNT] = true;
      ForceEmitWaitcnt[AMDGPU::BVH_CNT] = true;
    } else {
      ForceEmitWaitcnt[AMDGPU::LOAD_CNT] = false;
      ForceEmitWaitcnt[AMDGPU::SAMPLE_CNT] = false;
      ForceEmitWaitcnt[AMDGPU::BVH_CNT] = false;
    }

    ForceEmitWaitcnt[AMDGPU::VA_VDST] = false;
    ForceEmitWaitcnt[AMDGPU::VM_VSRC] = false;
#endif // NDEBUG
  }

  // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
  // instruction.
  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
    switch (Inst.getOpcode()) {
    // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
    case AMDGPU::GLOBAL_INV:
      return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
                                // VGPRs
    case AMDGPU::GLOBAL_WB:
    case AMDGPU::GLOBAL_WBINV:
      return VMEM_WRITE_ACCESS; // tracked using storecnt
    default:
      break;
    }

    // Maps VMEM access types to their corresponding WaitEventType.
    static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
        VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};

    // NOTE(review): one source line appears to have been dropped here by
    // extraction — restore from upstream before building.
    // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
    // these should use VM_CNT.
    if (!ST.hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
      return VMEM_ACCESS;
    if (Inst.mayStore() &&
        (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
      if (TII.mayAccessScratch(Inst))
        return SCRATCH_WRITE_ACCESS;
      return VMEM_WRITE_ACCESS;
    }
    if (!ST.hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
      return VMEM_ACCESS;
    return VmemReadMapping[getVmemType(Inst)];
  }

  std::optional<WaitEventType>
  getExpertSchedulingEventType(const MachineInstr &Inst) const;

  // Classifies \p MI as async; ultimately falls back to reading the
  // instruction's IsAsync operand when present.
  bool isAsync(const MachineInstr &MI) const {
    // NOTE(review): the two guard conditions for the early returns below
    // were lost in extraction — restore from upstream before building.
      return false;
      return true;
    const MachineOperand *Async =
        TII.getNamedOperand(MI, AMDGPU::OpName::IsAsync);
    return Async && (Async->getImm());
  }

  // LDS DMA write that is not async.
  bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
    return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
  }

  // LDS DMA write that is async.
  bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
    return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
  }

  // True if the async mark should be updated for \p MI on counter T.
  bool shouldUpdateAsyncMark(const MachineInstr &MI,
  // NOTE(review): the parameter continuation line (presumably
  // "AMDGPU::InstCounterType T) const {") was lost in extraction — restore.
    if (!isAsyncLdsDmaWrite(MI))
      return false;
    // NOTE(review): a guard line (apparently selecting between ASYNC_CNT and
    // LOAD_CNT) was lost in extraction — restore from upstream.
      return T == AMDGPU::ASYNC_CNT;
    return T == AMDGPU::LOAD_CNT;
  }

  bool isVmemAccess(const MachineInstr &MI) const;
  bool generateWaitcntInstBefore(MachineInstr &MI,
                                 WaitcntBrackets &ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr,
                                 PreheaderFlushFlags FlushFlags);
  // NOTE(review): one parameter line of this declaration was lost in
  // extraction — restore from upstream.
  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
                       MachineInstr *OldWaitcntInstr);
  /// \returns all events that correspond to \p Inst.
  WaitEventSet getEventsFor(const MachineInstr &Inst) const;
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               WaitcntBrackets *ScoreBrackets);
  bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
                    MachineBasicBlock *Block) const;
  bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                            WaitcntBrackets &ScoreBrackets);
  /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
  /// Legalizer. Returns true if block was modified.
  bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
  void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         bool ExpertMode) const;
  // Forwarders to the wait-generation strategy.
  const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const {
    return WCG->getWaitEvents(T);
  }
  AMDGPU::InstCounterType getCounterFromEvent(WaitEventType E) const {
    return WCG->getCounterFromEvent(E);
  }
};
712
713// This objects maintains the current score brackets of each wait counter, and
714// a per-register scoreboard for each wait counter.
715//
716// We also maintain the latest score for every event type that can change the
717// waitcnt in order to know if there are multiple types of events within
718// the brackets. When multiple types of event happen in the bracket,
719// wait count may get decreased out of order, therefore we need to put in
720// "s_waitcnt 0" before use.
721class WaitcntBrackets {
722public:
723 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
724 assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
725 }
726
727#ifndef NDEBUG
728 ~WaitcntBrackets() {
729 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
730 for (auto &[ID, Val] : VMem) {
731 if (Val.empty())
732 ++NumUnusedVmem;
733 }
734 for (auto &[ID, Val] : SGPRs) {
735 if (Val.empty())
736 ++NumUnusedSGPRs;
737 }
738
739 if (NumUnusedVmem || NumUnusedSGPRs) {
740 errs() << "WaitcntBracket had unused entries at destruction time: "
741 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
742 << " SGPR unused entries\n";
743 std::abort();
744 }
745 }
746#endif
747
748 bool isSmemCounter(AMDGPU::InstCounterType T) const {
749 return T == Context->SmemAccessCounter || T == AMDGPU::X_CNT;
750 }
751
752 unsigned getOutstanding(AMDGPU::InstCounterType T) const {
753 return ScoreUBs[T] - ScoreLBs[T];
754 }
755
756 bool hasPendingVMEM(VMEMID ID, AMDGPU::InstCounterType T) const {
757 return getVMemScore(ID, T) > getScoreLB(T);
758 }
759
760 /// \Return true if we have no score entries for counter \p T.
761 bool empty(AMDGPU::InstCounterType T) const { return getScoreRange(T) == 0; }
762
763private:
764 unsigned getScoreLB(AMDGPU::InstCounterType T) const {
766 return ScoreLBs[T];
767 }
768
769 unsigned getScoreUB(AMDGPU::InstCounterType T) const {
771 return ScoreUBs[T];
772 }
773
774 unsigned getScoreRange(AMDGPU::InstCounterType T) const {
775 return getScoreUB(T) - getScoreLB(T);
776 }
777
778 unsigned getSGPRScore(MCRegUnit RU, AMDGPU::InstCounterType T) const {
779 auto It = SGPRs.find(RU);
780 return It != SGPRs.end() ? It->second.get(T) : 0;
781 }
782
783 unsigned getVMemScore(VMEMID TID, AMDGPU::InstCounterType T) const {
784 auto It = VMem.find(TID);
785 return It != VMem.end() ? It->second.Scores[T] : 0;
786 }
787
788public:
789 bool merge(const WaitcntBrackets &Other);
790
791 bool counterOutOfOrder(AMDGPU::InstCounterType T) const;
792 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
793 simplifyWaitcnt(Wait, Wait);
794 }
795 void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
796 AMDGPU::Waitcnt &UpdateWait) const;
797 void simplifyWaitcnt(AMDGPU::InstCounterType T, unsigned &Count) const;
798 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T) const;
799 void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
800 AMDGPU::Waitcnt &UpdateWait) const;
801 void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
802 AMDGPU::Waitcnt &UpdateWait) const;
803
804 void determineWaitForPhysReg(AMDGPU::InstCounterType T, MCPhysReg Reg,
805 AMDGPU::Waitcnt &Wait) const;
806 void determineWaitForLDSDMA(AMDGPU::InstCounterType T, VMEMID TID,
807 AMDGPU::Waitcnt &Wait) const;
808 AMDGPU::Waitcnt determineAsyncWait(unsigned N);
809 void tryClearSCCWriteEvent(MachineInstr *Inst);
810
811 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
812 void applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count);
813 void applyWaitcnt(const AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T);
814 void updateByEvent(WaitEventType E, MachineInstr &MI);
815 void recordAsyncMark(MachineInstr &MI);
816
817 bool hasPendingEvent() const { return !PendingEvents.empty(); }
818 bool hasPendingEvent(WaitEventType E) const {
819 return PendingEvents.contains(E);
820 }
821 bool hasPendingEvent(AMDGPU::InstCounterType T) const {
822 bool HasPending = PendingEvents & Context->getWaitEvents(T);
823 assert(HasPending == !empty(T) &&
824 "Expected pending events iff scoreboard is not empty");
825 return HasPending;
826 }
827
828 bool hasMixedPendingEvents(AMDGPU::InstCounterType T) const {
829 WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
830 // Return true if more than one bit is set in Events.
831 return Events.twoOrMore();
832 }
833
834 bool hasPendingFlat() const {
835 return ((LastFlatDsCnt > ScoreLBs[AMDGPU::DS_CNT] &&
836 LastFlatDsCnt <= ScoreUBs[AMDGPU::DS_CNT]) ||
837 (LastFlatLoadCnt > ScoreLBs[AMDGPU::LOAD_CNT] &&
838 LastFlatLoadCnt <= ScoreUBs[AMDGPU::LOAD_CNT]));
839 }
840
841 void setPendingFlat() {
842 LastFlatLoadCnt = ScoreUBs[AMDGPU::LOAD_CNT];
843 LastFlatDsCnt = ScoreUBs[AMDGPU::DS_CNT];
844 }
845
846 bool hasPendingGDS() const {
847 return LastGDS > ScoreLBs[AMDGPU::DS_CNT] &&
848 LastGDS <= ScoreUBs[AMDGPU::DS_CNT];
849 }
850
851 unsigned getPendingGDSWait() const {
852 return std::min(getScoreUB(AMDGPU::DS_CNT) - LastGDS,
853 getWaitCountMax(Context->getLimits(), AMDGPU::DS_CNT) - 1);
854 }
855
856 void setPendingGDS() { LastGDS = ScoreUBs[AMDGPU::DS_CNT]; }
857
858 // Return true if there might be pending writes to the vgpr-interval by VMEM
859 // instructions with types different from V.
860 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
861 for (MCRegUnit RU : regunits(Reg)) {
862 auto It = VMem.find(toVMEMID(RU));
863 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
864 return true;
865 }
866 return false;
867 }
868
869 void clearVgprVmemTypes(MCPhysReg Reg) {
870 for (MCRegUnit RU : regunits(Reg)) {
871 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
872 It->second.VMEMTypes = 0;
873 if (It->second.empty())
874 VMem.erase(It);
875 }
876 }
877 }
878
879 void setStateOnFunctionEntryOrReturn() {
880 setScoreUB(AMDGPU::STORE_CNT,
881 getScoreUB(AMDGPU::STORE_CNT) +
882 getWaitCountMax(Context->getLimits(), AMDGPU::STORE_CNT));
883 PendingEvents |= Context->getWaitEvents(AMDGPU::STORE_CNT);
884 }
885
886 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
887 return LDSDMAStores;
888 }
889
890 bool hasPointSampleAccel(const MachineInstr &MI) const;
891 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
892 MCPhysReg RU) const;
893
894 void print(raw_ostream &) const;
895 void dump() const { print(dbgs()); }
896
897 // Free up memory by removing empty entries from the DenseMap that track event
898 // scores.
899 void purgeEmptyTrackingData();
900
901private:
902 struct MergeInfo {
903 unsigned OldLB;
904 unsigned OtherLB;
905 unsigned MyShift;
906 unsigned OtherShift;
907 };
908
909 using CounterValueArray = std::array<unsigned, AMDGPU::NUM_INST_CNTS>;
910
911 void determineWaitForScore(AMDGPU::InstCounterType T, unsigned Score,
912 AMDGPU::Waitcnt &Wait) const;
913
914 static bool mergeScore(const MergeInfo &M, unsigned &Score,
915 unsigned OtherScore);
916 bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
917 ArrayRef<CounterValueArray> OtherMarks);
918 
 // Return the register units of \p Reg as tracked by this pass. Registers
 // outside any allocatable class yield an empty range. A 16-bit register is
 // widened to its 32-bit counterpart when the subtarget treats D16 writes as
 // writing the full 32-bit VGPR, so both halves share tracking state.
 // NOTE(review): the function signature (original line 919) is missing from
 // this extraction; body preserved untouched.
920 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
921 if (!Context->TRI.isInAllocatableClass(Reg))
922 return {{}, {}};
923 const TargetRegisterClass *RC = Context->TRI.getPhysRegBaseClass(Reg);
924 unsigned Size = Context->TRI.getRegSizeInBits(*RC);
925 if (Size == 16 && Context->ST.hasD16Writes32BitVgpr())
926 Reg = Context->TRI.get32BitRegister(Reg);
927 return Context->TRI.regunits(Reg);
928 }
929
 // Set the lower bound of counter T's score bracket.
 // NOTE(review): original line 931 (presumably an assert) is missing from
 // this extraction; code preserved as-is.
930 void setScoreLB(AMDGPU::InstCounterType T, unsigned Val) {
932 ScoreLBs[T] = Val;
933 }
934
 // Raise counter T's upper bound. EXP_CNT is special: when its bracket grows
 // beyond the hardware wait-count maximum, the lower bound is advanced so
 // the range never exceeds what a wait instruction can encode.
 // NOTE(review): original line 936 (presumably an assert) is missing from
 // this extraction; code preserved as-is.
935 void setScoreUB(AMDGPU::InstCounterType T, unsigned Val) {
937 ScoreUBs[T] = Val;
938 
939 if (T != AMDGPU::EXP_CNT)
940 return;
941 
 // Clamp the EXP_CNT bracket to the encodable range.
942 if (getScoreRange(AMDGPU::EXP_CNT) >
943 getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT))
944 ScoreLBs[AMDGPU::EXP_CNT] =
945 ScoreUBs[AMDGPU::EXP_CNT] -
946 getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT);
947 }
948
949 void setRegScore(MCPhysReg Reg, AMDGPU::InstCounterType T, unsigned Val) {
950 const SIRegisterInfo &TRI = Context->TRI;
951 if (Reg == AMDGPU::SCC) {
952 SCCScore = Val;
953 } else if (TRI.isVectorRegister(Context->MRI, Reg)) {
954 for (MCRegUnit RU : regunits(Reg))
955 VMem[toVMEMID(RU)].Scores[T] = Val;
956 } else if (TRI.isSGPRReg(Context->MRI, Reg)) {
957 for (MCRegUnit RU : regunits(Reg))
958 SGPRs[RU].get(T) = Val;
959 } else {
960 llvm_unreachable("Register cannot be tracked/unknown register!");
961 }
962 }
963
964 void setVMemScore(VMEMID TID, AMDGPU::InstCounterType T, unsigned Val) {
965 VMem[TID].Scores[T] = Val;
966 }
967
968 void setScoreByOperand(const MachineOperand &Op,
969 AMDGPU::InstCounterType CntTy, unsigned Val);
970
971 const SIInsertWaitcnts *Context;
972
973 unsigned ScoreLBs[AMDGPU::NUM_INST_CNTS] = {0};
974 unsigned ScoreUBs[AMDGPU::NUM_INST_CNTS] = {0};
975 WaitEventSet PendingEvents;
976 // Remember the last flat memory operation.
977 unsigned LastFlatDsCnt = 0;
978 unsigned LastFlatLoadCnt = 0;
979 // Remember the last GDS operation.
980 unsigned LastGDS = 0;
981
982 // The score tracking logic is fragmented as follows:
983 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
984 // - SGPRs: SGPR RegUnits
985 // - SCC: Non-allocatable and not general purpose: not a SGPR.
986 //
987 // For the VMem case, if the key is within the range of LDS DMA IDs,
988 // then the corresponding index into the `LDSDMAStores` vector below is:
989 // Key - LDSDMA_BEGIN - 1
990 // This is because LDSDMA_BEGIN is a generic entry and does not have an
991 // associated MachineInstr.
992 //
993 // TODO: Could we track SCC alongside SGPRs so it's not longer a special case?
994
 // Per-VMEMID tracking record: counter scores plus the set of VMEM types
 // that have written this register unit / LDS DMA slot.
995 struct VMEMInfo {
996 // Scores for all instruction counters. Zero-initialized.
997 CounterValueArray Scores{};
998 // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
999 unsigned VMEMTypes = 0;
1000 
 // True when every score is zero and no type bit is set, i.e. the map
 // entry carries no information and may be erased.
1001 bool empty() const { return all_of(Scores, equal_to(0)) && !VMEMTypes; }
1002 };
1003
1004 /// Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
1005 /// pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
1006 class SGPRInfo {
1007 /// Either DS_CNT or KM_CNT score.
1008 unsigned ScoreDsKmCnt = 0;
1009 unsigned ScoreXCnt = 0;
1010
1011 public:
1012 unsigned get(AMDGPU::InstCounterType T) const {
1013 assert(
1014 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
1015 "Invalid counter");
1016 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
1017 }
1018 unsigned &get(AMDGPU::InstCounterType T) {
1019 assert(
1020 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
1021 "Invalid counter");
1022 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
1023 }
1024
1025 bool empty() const { return !ScoreDsKmCnt && !ScoreXCnt; }
1026 };
1027
1028 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
1029 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
1030
1031 // Reg score for SCC.
1032 unsigned SCCScore = 0;
1033 // The unique instruction that has an SCC write pending, if there is one.
1034 const MachineInstr *PendingSCCWrite = nullptr;
1035
1036 // Store representative LDS DMA operations. The only useful info here is
1037 // alias info. One store is kept per unique AAInfo.
1038 SmallVector<const MachineInstr *> LDSDMAStores;
1039
1040 // State of all counters at each async mark encountered so far.
1042
1043 // But in the rare pathological case, a nest of loops that pushes marks
1044 // without waiting on any mark can cause AsyncMarks to grow very large. We cap
1045 // it to a reasonable limit. We can tune this later or potentially introduce a
1046 // user option to control the value.
1047 static constexpr unsigned MaxAsyncMarks = 16;
1048
1049 // Track the upper bound score for async operations that are not part of a
1050 // mark yet. Initialized to all zeros.
1051 CounterValueArray AsyncScore{};
1052};
1053
 // Legacy (old pass manager) wrapper for the wait-count insertion pass.
1054 class SIInsertWaitcntsLegacy : public MachineFunctionPass {
1055 public:
1056 static char ID;
1057 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
1058 
1059 bool runOnMachineFunction(MachineFunction &MF) override;
1060 
1061 StringRef getPassName() const override {
1062 return "SI insert wait instructions";
1063 }
1064 
 // Requires loop info and post-dominators; uses alias analysis when
 // available and preserves it and the CFG.
1065 void getAnalysisUsage(AnalysisUsage &AU) const override {
1066 AU.setPreservesCFG();
1067 AU.addRequired<MachineLoopInfoWrapperPass>();
1068 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
1069 AU.addUsedIfAvailable<AAResultsWrapperPass>();
1070 AU.addPreserved<AAResultsWrapperPass>();
 // NOTE(review): original line 1071 is missing from this extraction —
 // presumably the base-class getAnalysisUsage call; code preserved as-is.
1072 }
1073 };
1074
1075} // end anonymous namespace
1076
 // Forward to setRegScore using the operand's physical register.
 // NOTE(review): the second parameter line (original 1078, declaring CntTy)
 // is missing from this extraction; code preserved as-is.
1077 void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
1079 unsigned Score) {
1080 setRegScore(Op.getReg().asMCReg(), CntTy, Score);
1081 }
1082
1083 // Return true if the subtarget is one that enables Point Sample Acceleration
1084 // and the MachineInstr passed in is one to which it might be applied (the
1085 // hardware makes this decision based on several factors, but we can't determine
1086 // this at compile time, so we have to assume it might be applied if the
1087 // instruction supports it).
1088 bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
1089 if (!Context->ST.hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
1090 return false;
1091 
1092 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
1093 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
 // NOTE(review): the initializer of BaseInfo (original line 1094) is missing
 // from this extraction — presumably a getMIMGBaseOpcodeInfo lookup; code
 // preserved as-is.
1095 return BaseInfo->PointSampleAccel;
1096 }
1097
1098// Return true if the subtarget enables Point Sample Acceleration, the supplied
1099// MachineInstr is one to which it might be applied and the supplied interval is
1100// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
1101// (this is the type that a point sample accelerated instruction effectively
1102// becomes)
1103bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
1104 MCPhysReg Reg) const {
1105 if (!hasPointSampleAccel(MI))
1106 return false;
1107
1108 return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
1109}
1110
1111void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
1112 AMDGPU::InstCounterType T = Context->getCounterFromEvent(E);
1113 assert(T < Context->MaxCounter);
1114
1115 unsigned UB = getScoreUB(T);
1116 unsigned CurrScore = UB + 1;
1117 if (CurrScore == 0)
1118 report_fatal_error("InsertWaitcnt score wraparound");
1119 // PendingEvents and ScoreUB need to be update regardless if this event
1120 // changes the score of a register or not.
1121 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
1122 PendingEvents.insert(E);
1123 setScoreUB(T, CurrScore);
1124
1125 const SIRegisterInfo &TRI = Context->TRI;
1126 const MachineRegisterInfo &MRI = Context->MRI;
1127 const SIInstrInfo &TII = Context->TII;
1128
1129 if (T == AMDGPU::EXP_CNT) {
1130 // Put score on the source vgprs. If this is a store, just use those
1131 // specific register(s).
1132 if (TII.isDS(Inst) && Inst.mayLoadOrStore()) {
1133 // All GDS operations must protect their address register (same as
1134 // export.)
1135 if (const auto *AddrOp = TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
1136 setScoreByOperand(*AddrOp, AMDGPU::EXP_CNT, CurrScore);
1137
1138 if (Inst.mayStore()) {
1139 if (const auto *Data0 =
1140 TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
1141 setScoreByOperand(*Data0, AMDGPU::EXP_CNT, CurrScore);
1142 if (const auto *Data1 =
1143 TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
1144 setScoreByOperand(*Data1, AMDGPU::EXP_CNT, CurrScore);
1145 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
1146 Inst.getOpcode() != AMDGPU::DS_APPEND &&
1147 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
1148 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
1149 for (const MachineOperand &Op : Inst.all_uses()) {
1150 if (TRI.isVectorRegister(MRI, Op.getReg()))
1151 setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
1152 }
1153 }
1154 } else if (TII.isFLAT(Inst)) {
1155 if (Inst.mayStore()) {
1156 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1157 AMDGPU::EXP_CNT, CurrScore);
1158 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1159 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1160 AMDGPU::EXP_CNT, CurrScore);
1161 }
1162 } else if (TII.isMIMG(Inst)) {
1163 if (Inst.mayStore()) {
1164 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
1165 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1166 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1167 AMDGPU::EXP_CNT, CurrScore);
1168 }
1169 } else if (TII.isMTBUF(Inst)) {
1170 if (Inst.mayStore())
1171 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
1172 } else if (TII.isMUBUF(Inst)) {
1173 if (Inst.mayStore()) {
1174 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
1175 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1176 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1177 AMDGPU::EXP_CNT, CurrScore);
1178 }
1179 } else if (TII.isLDSDIR(Inst)) {
1180 // LDSDIR instructions attach the score to the destination.
1181 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
1182 AMDGPU::EXP_CNT, CurrScore);
1183 } else {
1184 if (TII.isEXP(Inst)) {
1185 // For export the destination registers are really temps that
1186 // can be used as the actual source after export patching, so
1187 // we need to treat them like sources and set the EXP_CNT
1188 // score.
1189 for (MachineOperand &DefMO : Inst.all_defs()) {
1190 if (TRI.isVGPR(MRI, DefMO.getReg())) {
1191 setScoreByOperand(DefMO, AMDGPU::EXP_CNT, CurrScore);
1192 }
1193 }
1194 }
1195 for (const MachineOperand &Op : Inst.all_uses()) {
1196 if (TRI.isVectorRegister(MRI, Op.getReg()))
1197 setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
1198 }
1199 }
1200 } else if (T == AMDGPU::X_CNT) {
1201 WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1202 if (PendingEvents.contains(OtherEvent)) {
1203 // Hardware inserts an implicit xcnt between interleaved
1204 // SMEM and VMEM operations. So there will never be
1205 // outstanding address translations for both SMEM and
1206 // VMEM at the same time.
1207 setScoreLB(T, getScoreUB(T) - 1);
1208 PendingEvents.remove(OtherEvent);
1209 }
1210 for (const MachineOperand &Op : Inst.all_uses())
1211 setScoreByOperand(Op, T, CurrScore);
1212 } else if (T == AMDGPU::VA_VDST || T == AMDGPU::VM_VSRC) {
1213 // Match the score to the VGPR destination or source registers as
1214 // appropriate
1215 for (const MachineOperand &Op : Inst.operands()) {
1216 if (!Op.isReg() || (T == AMDGPU::VA_VDST && Op.isUse()) ||
1217 (T == AMDGPU::VM_VSRC && Op.isDef()))
1218 continue;
1219 if (TRI.isVectorRegister(Context->MRI, Op.getReg()))
1220 setScoreByOperand(Op, T, CurrScore);
1221 }
1222 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1223 // Match the score to the destination registers.
1224 //
1225 // Check only explicit operands. Stores, especially spill stores, include
1226 // implicit uses and defs of their super registers which would create an
1227 // artificial dependency, while these are there only for register liveness
1228 // accounting purposes.
1229 //
1230 // Special cases where implicit register defs exists, such as M0 or VCC,
1231 // but none with memory instructions.
1232 for (const MachineOperand &Op : Inst.defs()) {
1233 if (T == AMDGPU::LOAD_CNT || T == AMDGPU::SAMPLE_CNT ||
1234 T == AMDGPU::BVH_CNT) {
1235 if (!TRI.isVectorRegister(MRI, Op.getReg())) // TODO: add wrapper
1236 continue;
1237 if (updateVMCntOnly(Inst)) {
1238 // updateVMCntOnly should only leave us with VGPRs
1239 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1240 // defs. That's required for a sane index into `VgprMemTypes` below
1241 assert(TRI.isVectorRegister(MRI, Op.getReg()));
1242 VmemType V = getVmemType(Inst);
1243 unsigned char TypesMask = 1 << V;
1244 // If instruction can have Point Sample Accel applied, we have to flag
1245 // this with another potential dependency
1246 if (hasPointSampleAccel(Inst))
1247 TypesMask |= 1 << VMEM_NOSAMPLER;
1248 for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
1249 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1250 }
1251 }
1252 setScoreByOperand(Op, T, CurrScore);
1253 }
1254 if (Inst.mayStore() &&
1255 (TII.isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) {
1256 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1257 // written can be accessed. A load from LDS to VMEM does not need a wait.
1258 //
1259 // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1260 // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1261 // store. The "Slot" is the index into LDSDMAStores + 1.
1262 unsigned Slot = 0;
1263 for (const auto *MemOp : Inst.memoperands()) {
1264 if (!MemOp->isStore() ||
1265 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1266 continue;
1267 // Comparing just AA info does not guarantee memoperands are equal
1268 // in general, but this is so for LDS DMA in practice.
1269 auto AAI = MemOp->getAAInfo();
1270 // Alias scope information gives a way to definitely identify an
1271 // original memory object and practically produced in the module LDS
1272 // lowering pass. If there is no scope available we will not be able
1273 // to disambiguate LDS aliasing as after the module lowering all LDS
1274 // is squashed into a single big object.
1275 if (!AAI || !AAI.Scope)
1276 break;
1277 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1278 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1279 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1280 Slot = I + 1;
1281 break;
1282 }
1283 }
1284 }
1285 if (Slot)
1286 break;
1287 // The slot may not be valid because it can be >= NUM_LDSDMA which
1288 // means the scoreboard cannot track it. We still want to preserve the
1289 // MI in order to check alias information, though.
1290 LDSDMAStores.push_back(&Inst);
1291 Slot = LDSDMAStores.size();
1292 break;
1293 }
1294 setVMemScore(LDSDMA_BEGIN, T, CurrScore);
1295 if (Slot && Slot < NUM_LDSDMA)
1296 setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
1297 }
1298
1299 if (Context->shouldUpdateAsyncMark(Inst, T)) {
1300 AsyncScore[T] = CurrScore;
1301 }
1302
1304 setRegScore(AMDGPU::SCC, T, CurrScore);
1305 PendingSCCWrite = &Inst;
1306 }
1307 }
1308}
1309
1310void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1311 // In the absence of loops, AsyncMarks can grow linearly with the program
1312 // until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above a
1313 // limit every time we push a new mark, but that seems like unnecessary work
1314 // in practical cases. We do separately truncate the array when processing a
1315 // loop, which should be sufficient.
1316 AsyncMarks.push_back(AsyncScore);
1317 AsyncScore = {};
1318 LLVM_DEBUG({
1319 dbgs() << "recordAsyncMark:\n" << Inst;
1320 for (const auto &Mark : AsyncMarks) {
1321 llvm::interleaveComma(Mark, dbgs());
1322 dbgs() << '\n';
1323 }
1324 });
1325}
1326
1327void WaitcntBrackets::print(raw_ostream &OS) const {
1328 const GCNSubtarget &ST = Context->ST;
1329
1330 for (auto T : inst_counter_types(Context->MaxCounter)) {
1331 unsigned SR = getScoreRange(T);
1332 switch (T) {
1333 case AMDGPU::LOAD_CNT:
1334 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1335 << SR << "):";
1336 break;
1337 case AMDGPU::DS_CNT:
1338 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1339 << SR << "):";
1340 break;
1341 case AMDGPU::EXP_CNT:
1342 OS << " EXP_CNT(" << SR << "):";
1343 break;
1344 case AMDGPU::STORE_CNT:
1345 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1346 << SR << "):";
1347 break;
1348 case AMDGPU::SAMPLE_CNT:
1349 OS << " SAMPLE_CNT(" << SR << "):";
1350 break;
1351 case AMDGPU::BVH_CNT:
1352 OS << " BVH_CNT(" << SR << "):";
1353 break;
1354 case AMDGPU::KM_CNT:
1355 OS << " KM_CNT(" << SR << "):";
1356 break;
1357 case AMDGPU::X_CNT:
1358 OS << " X_CNT(" << SR << "):";
1359 break;
1360 case AMDGPU::ASYNC_CNT:
1361 OS << " ASYNC_CNT(" << SR << "):";
1362 break;
1363 case AMDGPU::VA_VDST:
1364 OS << " VA_VDST(" << SR << "): ";
1365 break;
1366 case AMDGPU::VM_VSRC:
1367 OS << " VM_VSRC(" << SR << "): ";
1368 break;
1369 default:
1370 OS << " UNKNOWN(" << SR << "):";
1371 break;
1372 }
1373
1374 if (SR != 0) {
1375 // Print vgpr scores.
1376 unsigned LB = getScoreLB(T);
1377
1378 SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1379 sort(SortedVMEMIDs);
1380
1381 for (auto ID : SortedVMEMIDs) {
1382 unsigned RegScore = VMem.at(ID).Scores[T];
1383 if (RegScore <= LB)
1384 continue;
1385 unsigned RelScore = RegScore - LB - 1;
1386 if (ID < REGUNITS_END) {
1387 OS << ' ' << RelScore << ":vRU" << ID;
1388 } else {
1389 assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1390 "Unhandled/unexpected ID value!");
1391 OS << ' ' << RelScore << ":LDSDMA" << ID;
1392 }
1393 }
1394
1395 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1396 if (isSmemCounter(T)) {
1397 SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1398 sort(SortedSMEMIDs);
1399 for (auto ID : SortedSMEMIDs) {
1400 unsigned RegScore = SGPRs.at(ID).get(T);
1401 if (RegScore <= LB)
1402 continue;
1403 unsigned RelScore = RegScore - LB - 1;
1404 OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
1405 }
1406 }
1407
1408 if (T == AMDGPU::KM_CNT && SCCScore > 0)
1409 OS << ' ' << SCCScore << ":scc";
1410 }
1411 OS << '\n';
1412 }
1413
1414 OS << "Pending Events: ";
1415 if (hasPendingEvent()) {
1416 ListSeparator LS;
1417 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1418 if (hasPendingEvent((WaitEventType)I)) {
1419 OS << LS << WaitEventTypeName[I];
1420 }
1421 }
1422 } else {
1423 OS << "none";
1424 }
1425 OS << '\n';
1426
1427 OS << "Async score: ";
1428 if (AsyncScore.empty())
1429 OS << "none";
1430 else
1431 llvm::interleaveComma(AsyncScore, OS);
1432 OS << '\n';
1433
1434 OS << "Async marks: " << AsyncMarks.size() << '\n';
1435
1436 for (const auto &Mark : AsyncMarks) {
1437 for (auto T : AMDGPU::inst_counter_types()) {
1438 unsigned MarkedScore = Mark[T];
1439 switch (T) {
1440 case AMDGPU::LOAD_CNT:
1441 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM")
1442 << "_CNT: " << MarkedScore;
1443 break;
1444 case AMDGPU::DS_CNT:
1445 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM")
1446 << "_CNT: " << MarkedScore;
1447 break;
1448 case AMDGPU::EXP_CNT:
1449 OS << " EXP_CNT: " << MarkedScore;
1450 break;
1451 case AMDGPU::STORE_CNT:
1452 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS")
1453 << "_CNT: " << MarkedScore;
1454 break;
1455 case AMDGPU::SAMPLE_CNT:
1456 OS << " SAMPLE_CNT: " << MarkedScore;
1457 break;
1458 case AMDGPU::BVH_CNT:
1459 OS << " BVH_CNT: " << MarkedScore;
1460 break;
1461 case AMDGPU::KM_CNT:
1462 OS << " KM_CNT: " << MarkedScore;
1463 break;
1464 case AMDGPU::X_CNT:
1465 OS << " X_CNT: " << MarkedScore;
1466 break;
1467 case AMDGPU::ASYNC_CNT:
1468 OS << " ASYNC_CNT: " << MarkedScore;
1469 break;
1470 default:
1471 OS << " UNKNOWN: " << MarkedScore;
1472 break;
1473 }
1474 }
1475 OS << '\n';
1476 }
1477 OS << '\n';
1478}
1479
1480/// Simplify \p UpdateWait by removing waits that are redundant based on the
1481/// current WaitcntBrackets and any other waits specified in \p CheckWait.
1482void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
1483 AMDGPU::Waitcnt &UpdateWait) const {
1484 simplifyWaitcnt(UpdateWait, AMDGPU::LOAD_CNT);
1485 simplifyWaitcnt(UpdateWait, AMDGPU::EXP_CNT);
1486 simplifyWaitcnt(UpdateWait, AMDGPU::DS_CNT);
1487 simplifyWaitcnt(UpdateWait, AMDGPU::STORE_CNT);
1488 simplifyWaitcnt(UpdateWait, AMDGPU::SAMPLE_CNT);
1489 simplifyWaitcnt(UpdateWait, AMDGPU::BVH_CNT);
1490 simplifyWaitcnt(UpdateWait, AMDGPU::KM_CNT);
1491 simplifyXcnt(CheckWait, UpdateWait);
1492 simplifyWaitcnt(UpdateWait, AMDGPU::VA_VDST);
1493 simplifyVmVsrc(CheckWait, UpdateWait);
1494 simplifyWaitcnt(UpdateWait, AMDGPU::ASYNC_CNT);
1495}
1496
1497void WaitcntBrackets::simplifyWaitcnt(AMDGPU::InstCounterType T,
1498 unsigned &Count) const {
1499 // The number of outstanding events for this type, T, can be calculated
1500 // as (UB - LB). If the current Count is greater than or equal to the number
1501 // of outstanding events, then the wait for this counter is redundant.
1502 if (Count >= getScoreRange(T))
1503 Count = ~0u;
1504}
1505
1506void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait,
1507 AMDGPU::InstCounterType T) const {
1508 unsigned Cnt = Wait.get(T);
1509 simplifyWaitcnt(T, Cnt);
1510 Wait.set(T, Cnt);
1511}
1512
1513 void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
1514 AMDGPU::Waitcnt &UpdateWait) const {
1515 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1516 // optimizations. On entry to a block with multiple predecessors, there may
1517 // be pending SMEM and VMEM events active at the same time.
1518 // In such cases, only clear one active event at a time.
1519 // TODO: Revisit xcnt optimizations for gfx1250.
1520 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1521 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1522 // zero.
1523 if (CheckWait.get(AMDGPU::KM_CNT) == 0 && hasPendingEvent(SMEM_GROUP))
1524 UpdateWait.set(AMDGPU::X_CNT, ~0u)
1525 // If we have pending store we cannot optimize XCnt because we do not wait for
1526 // stores. VMEM loads return in order, so if we only have loads XCnt is
1527 // decremented to the same number as LOADCnt.
1528 if (CheckWait.get(AMDGPU::LOAD_CNT) != ~0u && hasPendingEvent(VMEM_GROUP) &&
1529 !hasPendingEvent(AMDGPU::STORE_CNT) &&
1530 CheckWait.get(AMDGPU::X_CNT) >= CheckWait.get(AMDGPU::LOAD_CNT))
1531 UpdateWait.set(AMDGPU::X_CNT, ~0u);
 // Finally apply the generic bracket-based simplification.
1532 simplifyWaitcnt(UpdateWait, AMDGPU::X_CNT);
1533 }
1534
1535void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1536 AMDGPU::Waitcnt &UpdateWait) const {
1537 // Waiting for some counters implies waiting for VM_VSRC, since an
1538 // instruction that decrements a counter on completion would have
1539 // decremented VM_VSRC once its VGPR operands had been read.
1540 if (CheckWait.get(AMDGPU::VM_VSRC) >=
1541 std::min({CheckWait.get(AMDGPU::LOAD_CNT),
1542 CheckWait.get(AMDGPU::STORE_CNT),
1543 CheckWait.get(AMDGPU::SAMPLE_CNT),
1544 CheckWait.get(AMDGPU::BVH_CNT), CheckWait.get(AMDGPU::DS_CNT)}))
1545 UpdateWait.set(AMDGPU::VM_VSRC, ~0u);
1546 simplifyWaitcnt(UpdateWait, AMDGPU::VM_VSRC);
1547}
1548
1549void WaitcntBrackets::purgeEmptyTrackingData() {
1550 for (auto &[K, V] : make_early_inc_range(VMem)) {
1551 if (V.empty())
1552 VMem.erase(K);
1553 }
1554 for (auto &[K, V] : make_early_inc_range(SGPRs)) {
1555 if (V.empty())
1556 SGPRs.erase(K);
1557 }
1558}
1559
1560void WaitcntBrackets::determineWaitForScore(AMDGPU::InstCounterType T,
1561 unsigned ScoreToWait,
1562 AMDGPU::Waitcnt &Wait) const {
1563 const unsigned LB = getScoreLB(T);
1564 const unsigned UB = getScoreUB(T);
1565
1566 // If the score falls within the bracket, we need a waitcnt.
1567 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1568 if ((T == AMDGPU::LOAD_CNT || T == AMDGPU::DS_CNT) && hasPendingFlat() &&
1569 !Context->ST.hasFlatLgkmVMemCountInOrder()) {
1570 // If there is a pending FLAT operation, and this is a VMem or LGKM
1571 // waitcnt and the target can report early completion, then we need
1572 // to force a waitcnt 0.
1573 addWait(Wait, T, 0);
1574 } else if (counterOutOfOrder(T)) {
1575 // Counter can get decremented out-of-order when there
1576 // are multiple types event in the bracket. Also emit an s_wait counter
1577 // with a conservative value of 0 for the counter.
1578 addWait(Wait, T, 0);
1579 } else {
1580 // If a counter has been maxed out avoid overflow by waiting for
1581 // MAX(CounterType) - 1 instead.
1582 unsigned NeededWait = std::min(
1583 UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
1584 addWait(Wait, T, NeededWait);
1585 }
1586 }
1587}
1588
 // Compute the wait needed so that only the newest \p N async marks remain
 // outstanding, consuming the satisfied marks from AsyncMarks.
1589 AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
1590 LLVM_DEBUG({
1591 dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
1592 << ":\n";
1593 for (const auto &Mark : AsyncMarks) {
1594 llvm::interleaveComma(Mark, dbgs());
1595 dbgs() << '\n';
1596 }
1597 });
1598 
1599 if (AsyncMarks.size() == MaxAsyncMarks) {
1600 // Enforcing MaxAsyncMarks here is unnecessary work because the size of
1601 // MaxAsyncMarks is linear when traversing straightline code. But we do
1602 // need to check if truncation may have occurred at a merge, and adjust N
1603 // to ensure that a wait is generated.
1604 LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
1605 N = std::min(N, (unsigned)MaxAsyncMarks - 1);
1606 }
1607 
1608 AMDGPU::Waitcnt Wait;
1609 if (AsyncMarks.size() <= N) {
1610 LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
1611 return Wait;
1612 }
1613 
 // Index of the newest mark that must be fully retired.
1614 size_t MarkIndex = AsyncMarks.size() - N - 1;
1615 const auto &RequiredMark = AsyncMarks[MarkIndex];
 // NOTE(review): the loop header (original line 1616) is missing from this
 // extraction — presumably iterating all counter types T; code preserved
 // as-is.
1617 determineWaitForScore(T, RequiredMark[T], Wait);
1618 
1619 // Immediately remove the waited mark and all older ones
1620 // This happens BEFORE the wait is actually inserted, which is fine
1621 // because we've already extracted the wait requirements
1622 LLVM_DEBUG({
1623 dbgs() << "Removing " << (MarkIndex + 1)
1624 << " async marks after determining wait\n";
1625 });
1626 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1627 
1628 LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
1629 return Wait;
1630 }
1631
1632void WaitcntBrackets::determineWaitForPhysReg(AMDGPU::InstCounterType T,
1633 MCPhysReg Reg,
1634 AMDGPU::Waitcnt &Wait) const {
1635 if (Reg == AMDGPU::SCC) {
1636 determineWaitForScore(T, SCCScore, Wait);
1637 } else {
1638 bool IsVGPR = Context->TRI.isVectorRegister(Context->MRI, Reg);
1639 for (MCRegUnit RU : regunits(Reg))
1640 determineWaitForScore(
1641 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1642 Wait);
1643 }
1644}
1645
1646void WaitcntBrackets::determineWaitForLDSDMA(AMDGPU::InstCounterType T,
1647 VMEMID TID,
1648 AMDGPU::Waitcnt &Wait) const {
1649 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1650 determineWaitForScore(T, getVMemScore(TID, T), Wait);
1651}
1652
 // \p Inst is a barrier-wait style instruction; if it waits on the same
 // barrier as the instruction that produced the pending SCC write, that
 // write is guaranteed complete and its tracking can be cleared.
1653 void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1654 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1655 // SCC has landed
1656 if (PendingSCCWrite &&
1657 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1658 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1659 WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
1660 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1661 if ((PendingEvents & Context->getWaitEvents(AMDGPU::KM_CNT)) ==
1662 SCC_WRITE_PendingEvent) {
1663 setScoreLB(AMDGPU::KM_CNT, getScoreUB(AMDGPU::KM_CNT));
1664 }
1665 
 // The SCC write is retired either way; drop the event and its producer.
1666 PendingEvents.remove(SCC_WRITE_PendingEvent);
1667 PendingSCCWrite = nullptr;
1668 }
1669 }
1670
 // Apply \p Wait to every counter.
 // NOTE(review): the loop header (original line 1672) is missing from this
 // extraction — presumably iterating all counter types T; code preserved
 // as-is.
1671 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1673 applyWaitcnt(Wait, T);
1674 }
1675
 // Update the bracket state of counter \p T to reflect that a wait of
 // \p Count has been (or will be) executed: raise the lower bound and, for a
 // wait-till-zero, retire the counter's pending events. Also propagates the
 // implied effect onto X_CNT for the KM_CNT/LOAD_CNT interactions below.
1676 void WaitcntBrackets::applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count) {
1677 const unsigned UB = getScoreUB(T);
 // Waiting for more events than are outstanding changes nothing.
1678 if (Count >= UB)
1679 return;
1680 if (Count != 0) {
 // A partial wait gives no guarantee when completions may be reordered.
1681 if (counterOutOfOrder(T))
1682 return;
1683 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1684 } else {
 // A wait till zero retires everything tracked by this counter.
1685 setScoreLB(T, UB);
1686 PendingEvents.remove(Context->getWaitEvents(T));
1687 }
1688 
 // kmcnt(0) also drains outstanding SMEM address translations (X_CNT).
1689 if (T == AMDGPU::KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
1690 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1691 applyWaitcnt(AMDGPU::X_CNT, 0);
1692 else
1693 PendingEvents.remove(SMEM_GROUP);
1694 }
 // With only VMEM loads pending, a loadcnt wait bounds X_CNT as well.
1695 if (T == AMDGPU::LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
1696 !hasPendingEvent(AMDGPU::STORE_CNT)) {
1697 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1698 applyWaitcnt(AMDGPU::X_CNT, Count);
1699 else if (Count == 0)
1700 PendingEvents.remove(VMEM_GROUP);
1701 }
1702 }
1703
 // Apply the wait stored in \p Wait for one counter.
 // NOTE(review): the second parameter line (original 1705, declaring T) is
 // missing from this extraction; code preserved as-is.
1704 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait,
1706 unsigned Cnt = Wait.get(T);
1707 applyWaitcnt(T, Cnt);
1708 }
1709
1710// Where there are multiple types of event in the bracket of a counter,
1711// the decrement may go out of order.
1712bool WaitcntBrackets::counterOutOfOrder(AMDGPU::InstCounterType T) const {
1713 // Scalar memory read always can go out of order.
1714 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1715 (T == AMDGPU::X_CNT && hasPendingEvent(SMEM_GROUP)))
1716 return true;
1717
1718 // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
1719 // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
1720 // out-of-order completion.
1721 if (T == AMDGPU::LOAD_CNT) {
1722 WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
1723 // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
1724 // events
1725 Events.remove(GLOBAL_INV_ACCESS);
1726 // Return true only if there are still multiple event types after removing
1727 // GLOBAL_INV
1728 return Events.twoOrMore();
1729 }
1730
1731 return hasMixedPendingEvents(T);
1732}
1733
1734INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1735 false, false)
1738INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1740
1741char SIInsertWaitcntsLegacy::ID = 0;
1742
1743char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1744
1746 return new SIInsertWaitcntsLegacy();
1747}
1748
1749static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1750 unsigned NewEnc) {
1751 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1752 assert(OpIdx >= 0);
1753
1754 MachineOperand &MO = MI.getOperand(OpIdx);
1755
1756 if (NewEnc == MO.getImm())
1757 return false;
1758
1759 MO.setImm(NewEnc);
1760 return true;
1761}
1762
1763/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1764/// and if so, which counter it is waiting on.
1765static std::optional<AMDGPU::InstCounterType>
1766counterTypeForInstr(unsigned Opcode) {
1767 switch (Opcode) {
1768 case AMDGPU::S_WAIT_LOADCNT:
1769 return AMDGPU::LOAD_CNT;
1770 case AMDGPU::S_WAIT_EXPCNT:
1771 return AMDGPU::EXP_CNT;
1772 case AMDGPU::S_WAIT_STORECNT:
1773 return AMDGPU::STORE_CNT;
1774 case AMDGPU::S_WAIT_SAMPLECNT:
1775 return AMDGPU::SAMPLE_CNT;
1776 case AMDGPU::S_WAIT_BVHCNT:
1777 return AMDGPU::BVH_CNT;
1778 case AMDGPU::S_WAIT_DSCNT:
1779 return AMDGPU::DS_CNT;
1780 case AMDGPU::S_WAIT_KMCNT:
1781 return AMDGPU::KM_CNT;
1782 case AMDGPU::S_WAIT_XCNT:
1783 return AMDGPU::X_CNT;
1784 case AMDGPU::S_WAIT_ASYNCCNT:
1785 return AMDGPU::ASYNC_CNT;
1786 default:
1787 return {};
1788 }
1789}
1790
1791bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1792 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1793 if (Opcode == Waitcnt->getOpcode())
1794 return false;
1795
1796 Waitcnt->setDesc(TII.get(Opcode));
1797 return true;
1798}
1799
/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
/// from \p Wait that were added by previous passes. Currently this pass
/// conservatively assumes that these preexisting waits are required for
/// correctness.
///
/// \param ScoreBrackets counter state used to simplify soft waits.
/// \param OldWaitcntInstr first pre-existing wait instruction to consider.
/// \param Wait [in/out] accumulated wait requirements; counters consumed by
///        a kept instruction are reset to "no wait" (~0u) on return.
/// \param It insertion point; the scan covers [OldWaitcntInstr, It).
/// \returns true if any instruction was erased or modified.
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  // First S_WAITCNT kept in the range; later duplicates are merged into it.
  MachineInstr *WaitcntInstr = nullptr;
  // Likewise for the separate S_WAITCNT_VSCNT (store counter) instruction.
  MachineInstr *WaitcntVsCntInstr = nullptr;

  LLVM_DEBUG({
    dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
    if (It.isEnd())
      dbgs() << "end of block\n";
    else
      dbgs() << *It;
  });

  // Scan every instruction between the first pre-existing wait and the
  // insertion point; early_inc allows erasing the current instruction.
  for (auto &II :
       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
    LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
    if (isNonWaitcntMetaInst(II)) {
      LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
      continue;
    }

    // A soft opcode differs from its non-soft form; soft waits were added by
    // an earlier pass and may be simplified or removed entirely (unless
    // compiling at -O0).
    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.
    if (Opcode == AMDGPU::S_WAITCNT) {
      unsigned IEnc = II.getOperand(0).getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);

      // Merge consecutive waitcnt of the same type by erasing multiples.
      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntInstr = &II;
    } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
      assert(ST.hasVMemToLDSLoad());
      LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
                        << "Before: " << Wait << '\n';);
      ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, LDSDMA_BEGIN,
                                           Wait);
      LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);

      // It is possible (but unlikely) that this is the only wait instruction,
      // in which case, we exit this loop without a WaitcntInstr to consume
      // `Wait`. But that works because `Wait` was passed in by reference, and
      // the callee eventually calls createNewWaitcnt on it. We test this
      // possibility in an artificial MIR test since such a situation cannot be
      // recreated by running the memory legalizer.
      II.eraseFromParent();
    } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
      unsigned N = II.getOperand(0).getImm();
      LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';);
      AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
      Wait = Wait.combined(OldWait);
    } else {
      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
      // S_WAITCNT_VSCNT carries a register operand; only the null register
      // form is expected here.
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);

      unsigned OldVSCnt =
          TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(AMDGPU::STORE_CNT, OldVSCnt);
          std::min(Wait.get(AMDGPU::STORE_CNT), OldVSCnt));

      // Merge consecutive vscnt waits by erasing all but the first.
      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntVsCntInstr = &II;
    }
  }

  // Rewrite the kept S_WAITCNT with the final combined counts, promote it
  // from soft to hard if needed, and mark the counters it covers as
  // satisfied so the caller does not emit a second wait for them.
  if (WaitcntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
    Modified |= promoteSoftWaitCnt(WaitcntInstr);

    ScoreBrackets.applyWaitcnt(Wait, AMDGPU::LOAD_CNT);
    ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
    ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
    // ~0u means "no wait required" for a counter.
    Wait.set(AMDGPU::LOAD_CNT, ~0u);
    Wait.set(AMDGPU::EXP_CNT, ~0u);
    Wait.set(AMDGPU::DS_CNT, ~0u);

    LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
                                   << "New Instr at block end: "
                                   << *WaitcntInstr << '\n'
                          : dbgs() << "applied pre-existing waitcnt\n"
                                   << "Old Instr: " << *It
                                   << "New Instr: " << *WaitcntInstr << '\n');
  }

  // Same treatment for the kept S_WAITCNT_VSCNT: update its immediate,
  // promote, and consume the STORE_CNT requirement.
  if (WaitcntVsCntInstr) {
    Modified |=
        updateOperandIfDifferent(*WaitcntVsCntInstr, AMDGPU::OpName::simm16,
                                 Wait.get(AMDGPU::STORE_CNT));
    Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);

    ScoreBrackets.applyWaitcnt(AMDGPU::STORE_CNT, Wait.get(AMDGPU::STORE_CNT));
    Wait.set(AMDGPU::STORE_CNT, ~0u);

    LLVM_DEBUG(It.isEnd()
                   ? dbgs() << "applied pre-existing waitcnt\n"
                            << "New Instr at block end: " << *WaitcntVsCntInstr
                            << '\n'
                   : dbgs() << "applied pre-existing waitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntVsCntInstr << '\n');
  }

  return Modified;
}
1927
/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
/// required counters in \p Wait
/// \returns true if any instruction was inserted before \p It.
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(It);

  // Helper to emit expanded waitcnt sequence for profiling.
  // Emits waitcnts from (Outstanding-1) down to Target.
  // The EmitWaitcnt callback emits a single waitcnt.
  auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
                                 auto EmitWaitcnt) {
    do {
      EmitWaitcnt(--Outstanding);
    } while (Outstanding > Target);
    Modified = true;
  };

  // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
  // single instruction while VScnt has its own instruction.
  if (Wait.hasWaitExceptStoreCnt()) {
    // If profiling expansion is enabled, emit an expanded sequence
    if (ExpandWaitcntProfiling) {
      // Check if any of the counters to be waited on are out-of-order.
      // If so, fall back to normal (non-expanded) behavior since expansion
      // would provide misleading profiling information.
      bool AnyOutOfOrder = false;
      for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
        unsigned WaitCnt = Wait.get(CT);
        // ~0u means this counter needs no wait; skip it.
        if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
          AnyOutOfOrder = true;
          break;
        }
      }

      if (AnyOutOfOrder) {
        // Fall back to non-expanded wait
        unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
        BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
        Modified = true;
      } else {
        // All counters are in-order, safe to expand
        for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
          unsigned WaitCnt = Wait.get(CT);
          if (WaitCnt == ~0u)
            continue;

          // Clamp to the encodable maximum so the expansion start is valid.
          unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
                                          getWaitCountMax(getLimits(), CT) - 1);
          EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
            AMDGPU::Waitcnt W;
            W.set(CT, Count);
            BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT))
          });
        }
      }
    } else {
      // Normal behavior: emit single combined waitcnt
      unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
      [[maybe_unused]] auto SWaitInst =
          BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
      Modified = true;

      LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
    }
  }

  if (Wait.hasWaitStoreCnt()) {
    assert(ST.hasVscnt());

    if (ExpandWaitcntProfiling && Wait.get(AMDGPU::STORE_CNT) != ~0u &&
        !ScoreBrackets.counterOutOfOrder(AMDGPU::STORE_CNT)) {
      // Only expand if counter is not out-of-order
      unsigned Outstanding =
          std::min(ScoreBrackets.getOutstanding(AMDGPU::STORE_CNT),
                   getWaitCountMax(getLimits(), AMDGPU::STORE_CNT) - 1);
      EmitExpandedWaitcnt(
          Outstanding, Wait.get(AMDGPU::STORE_CNT), [&](unsigned Count) {
            BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
                .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
                .addImm(Count);
          });
    } else {
      [[maybe_unused]] auto SWaitInst =
          BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
              .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      Modified = true;

      LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
    }
  }

  return Modified;
}
2031
2032AMDGPU::Waitcnt
2033WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
2034 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST.hasVscnt() ? 0 : ~0u);
2035}
2036
2037AMDGPU::Waitcnt
2038WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
2039 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
2040 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
2041 ~0u /* XCNT */, ~0u /* ASYNC_CNT */, ExpertVal,
2042 ExpertVal);
2043}
2044
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
/// were added by previous passes. Currently this pass conservatively
/// assumes that these preexisting waits are required for correctness.
///
/// \param ScoreBrackets counter state used to simplify soft waits.
/// \param OldWaitcntInstr first pre-existing wait instruction to consider.
/// \param Wait [in/out] accumulated wait requirements; counters consumed by
///        a kept instruction are reset to "no wait" (~0u) on return.
/// \param It insertion point; the scan covers [OldWaitcntInstr, It).
/// \returns true if any instruction was erased or modified.
bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  // First kept instance of each combined/depctr wait; duplicates found later
  // in the scan are merged into these and erased.
  MachineInstr *CombinedLoadDsCntInstr = nullptr;
  MachineInstr *CombinedStoreDsCntInstr = nullptr;
  MachineInstr *WaitcntDepctrInstr = nullptr;
  // First kept single-counter S_WAIT_*CNT, indexed by counter type.
  MachineInstr *WaitInstrs[AMDGPU::NUM_EXTENDED_INST_CNTS] = {};

  LLVM_DEBUG({
    dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
    if (It.isEnd())
      dbgs() << "end of block\n";
    else
      dbgs() << *It;
  });

  // Accumulate waits that should not be simplified.
  AMDGPU::Waitcnt RequiredWait;

  for (auto &II :
       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
    LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
    if (isNonWaitcntMetaInst(II)) {
      LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
      continue;
    }

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.

    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
    // attempt to do more than that either.
    if (Opcode == AMDGPU::S_WAITCNT)
      continue;

    if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
      unsigned OldEnc =
          TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
      // Hard (non-soft) waits are collected separately so they survive
      // the simplification step below.
      if (TrySimplify)
        Wait = Wait.combined(OldWait);
      else
        RequiredWait = RequiredWait.combined(OldWait);
      // Keep the first wait_loadcnt, erase the rest.
      if (CombinedLoadDsCntInstr == nullptr) {
        CombinedLoadDsCntInstr = &II;
      } else {
        II.eraseFromParent();
        Modified = true;
      }
    } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
      unsigned OldEnc =
          TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
      if (TrySimplify)
        Wait = Wait.combined(OldWait);
      else
        RequiredWait = RequiredWait.combined(OldWait);
      // Keep the first wait_storecnt, erase the rest.
      if (CombinedStoreDsCntInstr == nullptr) {
        CombinedStoreDsCntInstr = &II;
      } else {
        II.eraseFromParent();
        Modified = true;
      }
    } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
      unsigned OldEnc =
          TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait;
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);
      if (WaitcntDepctrInstr == nullptr) {
        WaitcntDepctrInstr = &II;
      } else {
        // S_WAITCNT_DEPCTR requires special care. Don't remove a
        // duplicate if it is waiting on things other than VA_VDST or
        // VM_VSRC. If that is the case, just make sure the VA_VDST and
        // VM_VSRC subfields of the operand are set to the "no wait"
        // values.

        unsigned Enc =
            TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
        Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
        Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);

        // If anything remains beyond VA_VDST/VM_VSRC, the duplicate must
        // stay; otherwise it is now a no-op and can be deleted.
        if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
          Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
          Modified |= promoteSoftWaitCnt(&II);
        } else {
          II.eraseFromParent();
          Modified = true;
        }
      }
    } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
      // Architectures higher than GFX10 do not have direct loads to
      // LDS, so no work required here yet.
      II.eraseFromParent();
      Modified = true;
    } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
      // Update the Waitcnt, but don't erase the wait.asyncmark() itself. It
      // shows up in the assembly as a comment with the original parameter N.
      unsigned N = II.getOperand(0).getImm();
      AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
      Wait = Wait.combined(OldWait);
    } else {
      // Remaining opcodes must be gfx12+ single-counter waits.
      std::optional<AMDGPU::InstCounterType> CT = counterTypeForInstr(Opcode);
      assert(CT.has_value());
      unsigned OldCnt =
          TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        addWait(Wait, CT.value(), OldCnt);
      else
        addWait(RequiredWait, CT.value(), OldCnt);
      // Keep the first wait of its kind, erase the rest.
      if (WaitInstrs[CT.value()] == nullptr) {
        WaitInstrs[CT.value()] = &II;
      } else {
        II.eraseFromParent();
        Modified = true;
      }
    }
  }

  // Simplify against the full requirement, then fold the non-simplifiable
  // (hard) waits back in.
  ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
  Wait = Wait.combined(RequiredWait);

  if (CombinedLoadDsCntInstr) {
    // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
    // to be waited for. Otherwise, let the instruction be deleted so
    // the appropriate single counter wait instruction can be inserted
    // instead, when new S_WAIT_*CNT instructions are inserted by
    // createNewWaitcnt(). As a side effect, resetting the wait counts will
    // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
    // the loop below that deals with single counter instructions.
    //
    // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
    // instructions that have decremented LOAD_CNT or DS_CNT on completion
    // will have needed to wait for their register sources to be available
    // first.
    if (Wait.get(AMDGPU::LOAD_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
      unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
      Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
      ScoreBrackets.applyWaitcnt(AMDGPU::LOAD_CNT, Wait.get(AMDGPU::LOAD_CNT));
      ScoreBrackets.applyWaitcnt(AMDGPU::DS_CNT, Wait.get(AMDGPU::DS_CNT));
      Wait.set(AMDGPU::LOAD_CNT, ~0u);
      Wait.set(AMDGPU::DS_CNT, ~0u);

      LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
                                     << "New Instr at block end: "
                                     << *CombinedLoadDsCntInstr << '\n'
                            : dbgs() << "applied pre-existing waitcnt\n"
                                     << "Old Instr: " << *It << "New Instr: "
                                     << *CombinedLoadDsCntInstr << '\n');
    } else {
      CombinedLoadDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  if (CombinedStoreDsCntInstr) {
    // Similarly for S_WAIT_STORECNT_DSCNT.
    if (Wait.get(AMDGPU::STORE_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
      unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
      Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
      ScoreBrackets.applyWaitcnt(Wait, AMDGPU::STORE_CNT);
      ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
      Wait.set(AMDGPU::STORE_CNT, ~0u);
      Wait.set(AMDGPU::DS_CNT, ~0u);

      LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
                                     << "New Instr at block end: "
                                     << *CombinedStoreDsCntInstr << '\n'
                            : dbgs() << "applied pre-existing waitcnt\n"
                                     << "Old Instr: " << *It << "New Instr: "
                                     << *CombinedStoreDsCntInstr << '\n');
    } else {
      CombinedStoreDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  // Look for an opportunity to convert existing S_WAIT_LOADCNT,
  // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
  // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
  // instructions so that createNewWaitcnt() will create new combined
  // instructions to replace them.

  if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
    // This is a vector of addresses in WaitInstrs pointing to instructions
    // that should be removed if they are present.

    // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
    // both) need to be waited for, ensure that there are no existing
    // individual wait count instructions for these.

    if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[AMDGPU::LOAD_CNT]);
      WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
    } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[AMDGPU::STORE_CNT]);
      WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
    }

    for (MachineInstr **WI : WaitsToErase) {
      if (!*WI)
        continue;

      (*WI)->eraseFromParent();
      // Null the slot so the single-counter loop below skips it.
      *WI = nullptr;
      Modified = true;
    }
  }

    // Update, promote and consume each kept single-counter wait; delete any
    // whose counter no longer needs waiting.
    if (!WaitInstrs[CT])
      continue;

    unsigned NewCnt = Wait.get(CT);
    if (NewCnt != ~0u) {
      Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
                                           AMDGPU::OpName::simm16, NewCnt);
      Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);

      ScoreBrackets.applyWaitcnt(CT, NewCnt);
      setNoWait(Wait, CT);

      LLVM_DEBUG(It.isEnd()
                     ? dbgs() << "applied pre-existing waitcnt\n"
                              << "New Instr at block end: " << *WaitInstrs[CT]
                              << '\n'
                     : dbgs() << "applied pre-existing waitcnt\n"
                              << "Old Instr: " << *It
                              << "New Instr: " << *WaitInstrs[CT] << '\n');
    } else {
      WaitInstrs[CT]->eraseFromParent();
      Modified = true;
    }
  }

  if (WaitcntDepctrInstr) {
    // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
    // subfields with the new required values.
    unsigned Enc =
        TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
            ->getImm();

    ScoreBrackets.applyWaitcnt(AMDGPU::VA_VDST, Wait.get(AMDGPU::VA_VDST));
    ScoreBrackets.applyWaitcnt(AMDGPU::VM_VSRC, Wait.get(AMDGPU::VM_VSRC));
    Wait.set(AMDGPU::VA_VDST, ~0u);
    Wait.set(AMDGPU::VM_VSRC, ~0u);

    // If that new encoded Depctr immediate would actually still wait
    // for anything, update the instruction's operand. Otherwise it can
    // just be deleted.
    if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
      Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
                                           AMDGPU::OpName::simm16, Enc);
      LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
                                     << "New Instr at block end: "
                                     << *WaitcntDepctrInstr << '\n'
                            : dbgs() << "applyPreexistingWaitcnt\n"
                                     << "Old Instr: " << *It << "New Instr: "
                                     << *WaitcntDepctrInstr << '\n');
    } else {
      WaitcntDepctrInstr->eraseFromParent();
      Modified = true;
    }
  }

  return Modified;
}
2336
/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
/// \returns true if any instruction was inserted before \p It.
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(It);

  // Helper to emit expanded waitcnt sequence for profiling.
  // Counts down from (Outstanding-1) to Target, emitting one wait per step;
  // the I != ~0u guard protects against unsigned wrap when Outstanding == 0.
  auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
                                 auto EmitWaitcnt) {
    for (unsigned I = Outstanding - 1; I > Target && I != ~0u; --I)
      EmitWaitcnt(I);
    EmitWaitcnt(Target);
    Modified = true;
  };

  // For GFX12+, we use separate wait instructions, which makes expansion
  // simpler
  if (ExpandWaitcntProfiling) {
      unsigned Count = Wait.get(CT);
      // ~0u means this counter needs no wait.
      if (Count == ~0u)
        continue;

      // Skip expansion for out-of-order counters - emit normal wait instead
      if (ScoreBrackets.counterOutOfOrder(CT)) {
        BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
            .addImm(Count);
        Modified = true;
        continue;
      }

      // Clamp to the encodable maximum so the expansion start is valid.
      unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
                                      getWaitCountMax(getLimits(), CT) - 1);
      EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
        BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
            .addImm(Val);
      });
    }
    return Modified;
  }

  // Normal behavior (no expansion)
  // Check for opportunities to use combined wait instructions.
  if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
    MachineInstr *SWaitInst = nullptr;

    if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
      unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);

      SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
                      .addImm(Enc);

      // Mark both counters consumed so the single-counter loop below
      // does not emit them again.
      Wait.set(AMDGPU::LOAD_CNT, ~0u);
      Wait.set(AMDGPU::DS_CNT, ~0u);
    } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
      unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);

      SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_STORECNT_DSCNT))
                      .addImm(Enc);

      Wait.set(AMDGPU::STORE_CNT, ~0u);
      Wait.set(AMDGPU::DS_CNT, ~0u);
    }

    if (SWaitInst) {
      Modified = true;

      LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
    }
  }

  // Generate an instruction for any remaining counter that needs
  // waiting for.

    unsigned Count = Wait.get(CT);
    if (Count == ~0u)
      continue;

    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
            .addImm(Count);

    Modified = true;

    LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  // VA_VDST / VM_VSRC waits are encoded in a single S_WAITCNT_DEPCTR and
  // only exist in expert mode.
  if (Wait.hasWaitDepctr()) {
    assert(IsExpertMode);
    unsigned Enc =

    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);

    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
2450
2451/// Generate s_waitcnt instruction to be placed before cur_Inst.
2452/// Instructions of a given type are returned in order,
2453/// but instructions of different types can complete out of order.
2454/// We rely on this in-order completion
2455/// and simply assign a score to the memory access instructions.
2456/// We keep track of the active "score bracket" to determine
2457/// if an access of a memory read requires an s_waitcnt
2458/// and if so what the value of each counter is.
2459/// The "score bracket" is bound by the lower bound and upper bound
2460/// scores (*_score_LB and *_score_ub respectively).
2461/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
2462/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
2463/// (GFX12+ only, where DS_CNT is a separate counter).
2464bool SIInsertWaitcnts::generateWaitcntInstBefore(
2465 MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
2466 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2467 LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
2468 setForceEmitWaitcnt();
2469
2470 assert(!isNonWaitcntMetaInst(MI));
2471
2472 AMDGPU::Waitcnt Wait;
2473 const unsigned Opc = MI.getOpcode();
2474
2475 switch (Opc) {
2476 case AMDGPU::BUFFER_WBINVL1:
2477 case AMDGPU::BUFFER_WBINVL1_SC:
2478 case AMDGPU::BUFFER_WBINVL1_VOL:
2479 case AMDGPU::BUFFER_GL0_INV:
2480 case AMDGPU::BUFFER_GL1_INV: {
2481 // FIXME: This should have already been handled by the memory legalizer.
2482 // Removing this currently doesn't affect any lit tests, but we need to
2483 // verify that nothing was relying on this. The number of buffer invalidates
2484 // being handled here should not be expanded.
2485 Wait.set(AMDGPU::LOAD_CNT, 0);
2486 break;
2487 }
2488 case AMDGPU::SI_RETURN_TO_EPILOG:
2489 case AMDGPU::SI_RETURN:
2490 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2491 case AMDGPU::S_SETPC_B64_return: {
2492 // All waits must be resolved at call return.
2493 // NOTE: this could be improved with knowledge of all call sites or
2494 // with knowledge of the called routines.
2495 ReturnInsts.insert(&MI);
2496 AMDGPU::Waitcnt AllZeroWait =
2497 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2498 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2499 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2500 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2501 // no need to wait for it at function boundaries.
2502 if (ST.hasExtendedWaitCounts() &&
2503 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2504 AllZeroWait.set(AMDGPU::LOAD_CNT, ~0u);
2505 Wait = AllZeroWait;
2506 break;
2507 }
2508 case AMDGPU::S_ENDPGM:
2509 case AMDGPU::S_ENDPGM_SAVED: {
2510 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2511 // Technically the hardware will do this on its own if we don't, but that
2512 // might cost extra cycles compared to doing it explicitly.
2513 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2514 // have to wait for outstanding VMEM stores. In this case it can be useful
2515 // to send a message to explicitly release all VGPRs before the stores have
2516 // completed, but it is only safe to do this if there are no outstanding
2517 // scratch stores.
2518 EndPgmInsts[&MI] = !ScoreBrackets.empty(AMDGPU::STORE_CNT) &&
2519 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
2520 break;
2521 }
2522 case AMDGPU::S_SENDMSG:
2523 case AMDGPU::S_SENDMSGHALT: {
2524 if (ST.hasLegacyGeometry() &&
2525 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
2527 // Resolve vm waits before gs-done.
2528 Wait.set(AMDGPU::LOAD_CNT, 0);
2529 break;
2530 }
2531 [[fallthrough]];
2532 }
2533 default: {
2534
2535 // Export & GDS instructions do not read the EXEC mask until after the
2536 // export is granted (which can occur well after the instruction is issued).
2537 // The shader program must flush all EXP operations on the export-count
2538 // before overwriting the EXEC mask.
2539 if (MI.modifiesRegister(AMDGPU::EXEC, &TRI)) {
2540 // Export and GDS are tracked individually, either may trigger a waitcnt
2541 // for EXEC.
2542 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2543 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2544 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2545 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2546 Wait.set(AMDGPU::EXP_CNT, 0);
2547 }
2548 }
2549
2550 // Wait for any pending GDS instruction to complete before any
2551 // "Always GDS" instruction.
2552 if (TII.isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
2553 addWait(Wait, AMDGPU::DS_CNT, ScoreBrackets.getPendingGDSWait());
2554
2555 if (MI.isCall()) {
2556 // The function is going to insert a wait on everything in its prolog.
2557 // This still needs to be careful if the call target is a load (e.g. a GOT
2558 // load). We also need to check WAW dependency with saved PC.
2559 CallInsts.insert(&MI);
2560 Wait = AMDGPU::Waitcnt();
2561
2562 const MachineOperand &CallAddrOp = TII.getCalleeOperand(MI);
2563 if (CallAddrOp.isReg()) {
2564 ScoreBrackets.determineWaitForPhysReg(
2565 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait);
2566
2567 if (const auto *RtnAddrOp =
2568 TII.getNamedOperand(MI, AMDGPU::OpName::dst)) {
2569 ScoreBrackets.determineWaitForPhysReg(
2570 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
2571 }
2572 }
2573 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2574 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2575 } else {
2576 // FIXME: Should not be relying on memoperands.
2577 // Look at the source operands of every instruction to see if
2578 // any of them results from a previous memory operation that affects
2579 // its current usage. If so, an s_waitcnt instruction needs to be
2580 // emitted.
2581 // If the source operand was defined by a load, add the s_waitcnt
2582 // instruction.
2583 //
2584 // Two cases are handled for destination operands:
2585 // 1) If the destination operand was defined by a load, add the s_waitcnt
2586 // instruction to guarantee the right WAW order.
2587 // 2) If a destination operand that was used by a recent export/store ins,
2588 // add s_waitcnt on exp_cnt to guarantee the WAR order.
2589
2590 for (const MachineMemOperand *Memop : MI.memoperands()) {
2591 const Value *Ptr = Memop->getValue();
2592 if (Memop->isStore()) {
2593 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2594 addWait(Wait, SmemAccessCounter, 0);
2595 if (PDT.dominates(MI.getParent(), It->second))
2596 SLoadAddresses.erase(It);
2597 }
2598 }
2599 unsigned AS = Memop->getAddrSpace();
2601 continue;
2602 // No need to wait before load from VMEM to LDS.
2603 if (TII.mayWriteLDSThroughDMA(MI))
2604 continue;
2605
2606 // LOAD_CNT is only relevant to vgpr or LDS.
2607 unsigned TID = LDSDMA_BEGIN;
2608 if (Ptr && Memop->getAAInfo()) {
2609 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2610 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2611 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2612 if ((I + 1) >= NUM_LDSDMA) {
2613 // We didn't have enough slot to track this LDS DMA store, it
2614 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2615 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID,
2616 Wait);
2617 break;
2618 }
2619
2620 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT,
2621 TID + I + 1, Wait);
2622 }
2623 }
2624 } else {
2625 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID, Wait);
2626 }
2627 if (Memop->isStore()) {
2628 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::EXP_CNT, TID, Wait);
2629 }
2630 }
2631
2632 // Loop over use and def operands.
2633 for (const MachineOperand &Op : MI.operands()) {
2634 if (!Op.isReg())
2635 continue;
2636
2637 // If the instruction does not read tied source, skip the operand.
2638 if (Op.isTied() && Op.isUse() && TII.doesNotReadTiedSource(MI))
2639 continue;
2640
2641 MCPhysReg Reg = Op.getReg().asMCReg();
2642
2643 const bool IsVGPR = TRI.isVectorRegister(MRI, Op.getReg());
2644 if (IsVGPR) {
2645 // Implicit VGPR defs and uses are never a part of the memory
2646 // instructions description and usually present to account for
2647 // super-register liveness.
2648 // TODO: Most of the other instructions also have implicit uses
2649 // for the liveness accounting only.
2650 if (Op.isImplicit() && MI.mayLoadOrStore())
2651 continue;
2652
2653 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VA_VDST, Reg, Wait);
2654 if (Op.isDef())
2655 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VM_VSRC, Reg, Wait);
2656 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2657 // previous write and this write are the same type of VMEM
2658 // instruction, in which case they are (in some architectures)
2659 // guaranteed to write their results in order anyway.
2660 // Additionally check instructions where Point Sample Acceleration
2661 // might be applied.
2662 if (Op.isUse() || !updateVMCntOnly(MI) ||
2663 ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
2664 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2665 !ST.hasVmemWriteVgprInOrder()) {
2666 ScoreBrackets.determineWaitForPhysReg(AMDGPU::LOAD_CNT, Reg, Wait);
2667 ScoreBrackets.determineWaitForPhysReg(AMDGPU::SAMPLE_CNT, Reg,
2668 Wait);
2669 ScoreBrackets.determineWaitForPhysReg(AMDGPU::BVH_CNT, Reg, Wait);
2670 ScoreBrackets.clearVgprVmemTypes(Reg);
2671 }
2672
2673 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2674 ScoreBrackets.determineWaitForPhysReg(AMDGPU::EXP_CNT, Reg, Wait);
2675 }
2676 ScoreBrackets.determineWaitForPhysReg(AMDGPU::DS_CNT, Reg, Wait);
2677 } else if (Op.getReg() == AMDGPU::SCC) {
2678 ScoreBrackets.determineWaitForPhysReg(AMDGPU::KM_CNT, Reg, Wait);
2679 } else {
2680 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
2681 }
2682
2683 if (ST.hasWaitXcnt() && Op.isDef())
2684 ScoreBrackets.determineWaitForPhysReg(AMDGPU::X_CNT, Reg, Wait);
2685 }
2686 }
2687 }
2688 }
2689
2690 // Ensure safety against exceptions from outstanding memory operations while
2691 // waiting for a barrier:
2692 //
2693 // * Some subtargets safely handle backing off the barrier in hardware
2694 // when an exception occurs.
2695 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2696 // there can be no outstanding memory operations during the wait.
2697 // * Subtargets with split barriers don't need to back off the barrier; it
2698 // is up to the trap handler to preserve the user barrier state correctly.
2699 //
2700 // In all other cases, ensure safety by ensuring that there are no outstanding
2701 // memory operations.
2702 if (Opc == AMDGPU::S_BARRIER && !ST.hasAutoWaitcntBeforeBarrier() &&
2703 !ST.hasBackOffBarrier()) {
2704 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2705 }
2706
2707 // TODO: Remove this work-around, enable the assert for Bug 457939
2708 // after fixing the scheduler. Also, the Shader Compiler code is
2709 // independent of target.
2710 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST.hasReadVCCZBug() &&
2711 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2712 Wait.set(AMDGPU::DS_CNT, 0);
2713 }
2714
2715 // Verify that the wait is actually needed.
2716 ScoreBrackets.simplifyWaitcnt(Wait);
2717
2718 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2719 // waits on VA_VDST if the instruction it would precede is not a VALU
2720 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2721 // expert scheduling mode.
2722 if (TII.isVALU(MI))
2723 Wait.set(AMDGPU::VA_VDST, ~0u);
2724
2725 // Since the translation for VMEM addresses occur in-order, we can apply the
2726 // XCnt if the current instruction is of VMEM type and has a memory
2727 // dependency with another VMEM instruction in flight.
2728 if (Wait.get(AMDGPU::X_CNT) != ~0u && isVmemAccess(MI)) {
2729 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::X_CNT);
2730 Wait.set(AMDGPU::X_CNT, ~0u);
2731 }
2732
2733 // When forcing emit, we need to skip terminators because that would break the
2734 // terminators of the MBB if we emit a waitcnt between terminators.
2735 if (ForceEmitZeroFlag && !MI.isTerminator())
2736 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2737
2738 // If we force waitcnt then update Wait accordingly.
2740 if (!ForceEmitWaitcnt[T])
2741 continue;
2742 Wait.set(T, 0);
2743 }
2744
2745 if (FlushFlags.FlushVmCnt) {
2748 Wait.set(T, 0);
2749 }
2750
2751 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
2752 Wait.set(AMDGPU::DS_CNT, 0);
2753
2754 if (ForceEmitZeroLoadFlag && Wait.get(AMDGPU::LOAD_CNT) != ~0u)
2755 Wait.set(AMDGPU::LOAD_CNT, 0);
2756
2757 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2758 OldWaitcntInstr);
2759}
2760
2761bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2763 MachineBasicBlock &Block,
2764 WaitcntBrackets &ScoreBrackets,
2765 MachineInstr *OldWaitcntInstr) {
2766 bool Modified = false;
2767
2768 if (OldWaitcntInstr)
2769 // Try to merge the required wait with preexisting waitcnt instructions.
2770 // Also erase redundant waitcnt.
2771 Modified =
2772 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2773
2774 // ExpCnt can be merged into VINTERP.
2775 if (Wait.get(AMDGPU::EXP_CNT) != ~0u && It != Block.instr_end() &&
2777 MachineOperand *WaitExp = TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
2778 if (Wait.get(AMDGPU::EXP_CNT) < WaitExp->getImm()) {
2779 WaitExp->setImm(Wait.get(AMDGPU::EXP_CNT));
2780 Modified = true;
2781 }
2782 // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
2783 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
2784 Wait.set(AMDGPU::EXP_CNT, ~0u);
2785
2786 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2787 << "Update Instr: " << *It);
2788 }
2789
2790 if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
2791 Modified = true;
2792
2793 // Any counts that could have been applied to any existing waitcnt
2794 // instructions will have been done so, now deal with any remaining.
2795 ScoreBrackets.applyWaitcnt(Wait);
2796
2797 return Modified;
2798}
2799
std::optional<WaitEventType>
SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
  // Classify \p Inst by the expert-scheduling hazard event it produces, or
  // return std::nullopt when it creates no tracked hazard.
  if (TII.isVALU(Inst)) {
    // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
    // out-of-order with respect to each other, so each of these classes
    // has its own event.

    if (TII.isXDL(Inst))
      return VGPR_XDL_WRITE;

    if (TII.isTRANS(Inst))
      return VGPR_TRANS_WRITE;

    // NOTE(review): a guarding condition (presumably the DP-MACC check)
    // appears to be missing from this excerpt; as written the first return
    // below makes the second unreachable. Confirm against the full source.
      return VGPR_DPMACC_WRITE;

    return VGPR_CSMACC_WRITE;
  }

  // FLAT and LDS instructions may read their VGPR sources out-of-order
  // with respect to each other and all other VMEM instructions, so
  // each of these also has a separate event.

  if (TII.isFLAT(Inst))
    return VGPR_FLAT_READ;

  if (TII.isDS(Inst))
    return VGPR_LDS_READ;

  if (TII.isVMEM(Inst) || TII.isVIMAGE(Inst) || TII.isVSAMPLE(Inst))
    return VGPR_VMEM_READ;

  // Otherwise, no hazard.

  return {};
}
2836
2837bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2838 return (TII.isFLAT(MI) && TII.mayAccessVMEMThroughFlat(MI)) ||
2839 (TII.isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2840}
2841
2842// Return true if the next instruction is S_ENDPGM, following fallthrough
2843// blocks if necessary.
2844bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2845 MachineBasicBlock *Block) const {
2846 auto BlockEnd = Block->getParent()->end();
2847 auto BlockIter = Block->getIterator();
2848
2849 while (true) {
2850 if (It.isEnd()) {
2851 if (++BlockIter != BlockEnd) {
2852 It = BlockIter->instr_begin();
2853 continue;
2854 }
2855
2856 return false;
2857 }
2858
2859 if (!It->isMetaInstruction())
2860 break;
2861
2862 It++;
2863 }
2864
2865 assert(!It.isEnd());
2866
2867 return It->getOpcode() == AMDGPU::S_ENDPGM;
2868}
2869
2870// Add a wait after an instruction if architecture requirements mandate one.
2871bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2872 MachineBasicBlock &Block,
2873 WaitcntBrackets &ScoreBrackets) {
2874 AMDGPU::Waitcnt Wait;
2875 bool NeedsEndPGMCheck = false;
2876
2877 if (ST.isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2878 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2880
2881 if (TII.isAlwaysGDS(Inst.getOpcode())) {
2882 Wait.set(AMDGPU::DS_CNT, 0);
2883 NeedsEndPGMCheck = true;
2884 }
2885
2886 ScoreBrackets.simplifyWaitcnt(Wait);
2887
2888 auto SuccessorIt = std::next(Inst.getIterator());
2889 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2890 /*OldWaitcntInstr=*/nullptr);
2891
2892 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2893 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII.get(AMDGPU::S_NOP))
2894 .addImm(0);
2895 }
2896
2897 return Result;
2898}
2899
WaitEventSet SIInsertWaitcnts::getEventsFor(const MachineInstr &Inst) const {
  // Collect the set of wait events \p Inst generates; the caller bumps the
  // corresponding score brackets for each event returned.
  WaitEventSet Events;
  if (IsExpertMode) {
    // Expert scheduling tracks extra VALU/VMEM read-write hazard events.
    if (const auto ET = getExpertSchedulingEventType(Inst))
      Events.insert(*ET);
  }

  if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
    if (TII.isAlwaysGDS(Inst.getOpcode()) ||
        TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      Events.insert(GDS_ACCESS);
      Events.insert(GDS_GPR_LOCK);
    } else {
      Events.insert(LDS_ACCESS);
    }
  } else if (TII.isFLAT(Inst)) {
    // NOTE(review): the condition line guarding this first insert (likely an
    // LDS-DMA check) is missing from this excerpt; confirm against the full
    // source.
      Events.insert(getVmemWaitEventType(Inst));
    } else {
      assert(Inst.mayLoadOrStore());
      // A FLAT access may touch VMEM, LDS, or both; record events for every
      // side it can reach.
      if (TII.mayAccessVMEMThroughFlat(Inst)) {
        if (ST.hasWaitXcnt())
          Events.insert(VMEM_GROUP);
        Events.insert(getVmemWaitEventType(Inst));
      }
      if (TII.mayAccessLDSThroughFlat(Inst))
        Events.insert(LDS_ACCESS);
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // NOTE(review): a condition line is missing here in this excerpt
             // (it appears to exclude buffer invalidates); confirm against
             // the full source.
             Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) {
    // BUFFER_WBL2 is included here because, unlike invalidates, an
    // "S_WAITCNT vmcnt(0)" is needed after it to ensure the writeback has
    // completed.
    if (ST.hasWaitXcnt())
      Events.insert(VMEM_GROUP);
    Events.insert(getVmemWaitEventType(Inst));
    if (ST.vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
      Events.insert(VMW_GPR_LOCK);
    }
  } else if (TII.isSMRD(Inst)) {
    if (ST.hasWaitXcnt())
      Events.insert(SMEM_GROUP);
    Events.insert(SMEM_ACCESS);
  } else if (SIInstrInfo::isLDSDIR(Inst)) {
    Events.insert(EXP_LDS_ACCESS);
  } else if (SIInstrInfo::isEXP(Inst)) {
    // Classify the export by its target: parameter, position, or other.
    unsigned Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    // NOTE(review): the condition line for the parameter-export range is
    // missing from this excerpt; confirm against the full source.
      Events.insert(EXP_PARAM_ACCESS);
    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
      Events.insert(EXP_POS_ACCESS);
    else
      Events.insert(EXP_GPR_LOCK);
  } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
    Events.insert(SCC_WRITE);
  } else {
    // Remaining scalar instructions that complete asynchronously.
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      Events.insert(SQ_MESSAGE);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
    case AMDGPU::S_GET_BARRIER_STATE_M0:
    case AMDGPU::S_GET_BARRIER_STATE_IMM:
      Events.insert(SMEM_ACCESS);
      break;
    }
  }
  return Events;
}
2975
2976void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2977 WaitcntBrackets *ScoreBrackets) {
2978
2979 WaitEventSet InstEvents = getEventsFor(Inst);
2980 for (WaitEventType E : wait_events()) {
2981 if (InstEvents.contains(E))
2982 ScoreBrackets->updateByEvent(E, Inst);
2983 }
2984
2985 if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
2986 if (TII.isAlwaysGDS(Inst.getOpcode()) ||
2987 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2988 ScoreBrackets->setPendingGDS();
2989 }
2990 } else if (TII.isFLAT(Inst)) {
2991 if (Inst.mayLoadOrStore() && TII.mayAccessVMEMThroughFlat(Inst) &&
2992 TII.mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst)) {
2993 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
2994 // pointers. They do have two operands that each access global and LDS,
2995 // thus making it appear at this point that they are using a flat pointer.
2996 // Filter them out, and for the rest, generate a dependency on flat
2997 // pointers so that both VM and LGKM counters are flushed.
2998 ScoreBrackets->setPendingFlat();
2999 }
3000 if (SIInstrInfo::usesASYNC_CNT(Inst)) {
3001 ScoreBrackets->updateByEvent(ASYNC_ACCESS, Inst);
3002 }
3003 } else if (Inst.isCall()) {
3004 // Act as a wait on everything, but AsyncCnt is never included in such
3005 // blanket waits.
3006 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
3007 ScoreBrackets->setStateOnFunctionEntryOrReturn();
3008 } else if (TII.isVINTERP(Inst)) {
3009 int64_t Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
3010 ScoreBrackets->applyWaitcnt(AMDGPU::EXP_CNT, Imm);
3011 }
3012}
3013
3014bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
3015 unsigned OtherScore) {
3016 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
3017 unsigned OtherShifted =
3018 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
3019 Score = std::max(MyShifted, OtherShifted);
3020 return OtherShifted > MyShifted;
3021}
3022
3023bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
3024 ArrayRef<CounterValueArray> OtherMarks) {
3025 bool StrictDom = false;
3026
3027 LLVM_DEBUG(dbgs() << "Merging async marks ...");
3028 // Early exit: nothing to merge when both sides are empty.
3029 if (AsyncMarks.empty() && OtherMarks.empty()) {
3030 LLVM_DEBUG(dbgs() << " nothing to merge\n");
3031 return false;
3032 }
3033 LLVM_DEBUG(dbgs() << '\n');
3034
3035 // Determine maximum length needed after merging
3036 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size());
3037 MaxSize = std::min(MaxSize, MaxAsyncMarks);
3038
3039 // Keep only the most recent marks within our limit.
3040 if (AsyncMarks.size() > MaxSize)
3041 AsyncMarks.erase(AsyncMarks.begin(),
3042 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
3043
3044 // Pad with zero-filled marks if our list is shorter. Zero represents "no
3045 // pending async operations at this checkpoint" and acts as the identity
3046 // element for max() during merging. We pad at the beginning since the marks
3047 // need to be aligned in most-recent order.
3048 constexpr CounterValueArray ZeroMark{};
3049 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
3050
3051 LLVM_DEBUG({
3052 dbgs() << "Before merge:\n";
3053 for (const auto &Mark : AsyncMarks) {
3054 llvm::interleaveComma(Mark, dbgs());
3055 dbgs() << '\n';
3056 }
3057 dbgs() << "Other marks:\n";
3058 for (const auto &Mark : OtherMarks) {
3059 llvm::interleaveComma(Mark, dbgs());
3060 dbgs() << '\n';
3061 }
3062 });
3063
3064 // Merge element-wise using the existing mergeScore function and the
3065 // appropriate MergeInfo for each counter type. Iterate only while we have
3066 // elements in both vectors.
3067 unsigned OtherSize = OtherMarks.size();
3068 unsigned OurSize = AsyncMarks.size();
3069 unsigned MergeCount = std::min(OtherSize, OurSize);
3070 // OtherMarks is empty -> OtherSize == 0 -> MergeCount == 0.
3071 // Our existing marks are the conservative result; return early to avoid
3072 // passing MergeCount == 0 to seq_inclusive which asserts Begin <= End.
3073 if (MergeCount == 0)
3074 return StrictDom;
3075 for (auto Idx : seq_inclusive<unsigned>(1, MergeCount)) {
3076 for (auto T : inst_counter_types(Context->MaxCounter)) {
3077 StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
3078 OtherMarks[OtherSize - Idx][T]);
3079 }
3080 }
3081
3082 LLVM_DEBUG({
3083 dbgs() << "After merge:\n";
3084 for (const auto &Mark : AsyncMarks) {
3085 llvm::interleaveComma(Mark, dbgs());
3086 dbgs() << '\n';
3087 }
3088 });
3089
3090 return StrictDom;
3091}
3092
/// Merge the pending events and associated score brackets of \p Other into
3094/// this brackets status.
3095///
3096/// Returns whether the merge resulted in a change that requires tighter waits
3097/// (i.e. the merged brackets strictly dominate the original brackets).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  // Check if "other" has keys we don't have, and create default entries for
  // those. If they remain empty after merging, we will clean it up after.
  for (auto K : Other.VMem.keys())
    VMem.try_emplace(K);
  for (auto K : Other.SGPRs.keys())
    SGPRs.try_emplace(K);

  // Array to store MergeInfo for each counter type
  MergeInfo MergeInfos[AMDGPU::NUM_INST_CNTS];

  for (auto T : inst_counter_types(Context->MaxCounter)) {
    // Merge event flags for this counter
    const WaitEventSet &EventsForT = Context->getWaitEvents(T);
    const WaitEventSet OldEvents = PendingEvents & EventsForT;
    const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
    // If Other has pending events for T that we lack, the merged state is
    // strictly tighter than ours was.
    if (!OldEvents.contains(OtherEvents))
      StrictDom = true;
    PendingEvents |= OtherEvents;

    // Merge scores for this counter: rebase both score ranges onto a common
    // upper bound so the per-register scores merged below stay comparable.
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
    if (NewUB < ScoreLBs[T])
      report_fatal_error("waitcnt score overflow");

    MergeInfo &M = MergeInfos[T];
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    // Counter-specific auxiliary scores merged alongside the main brackets.
    if (T == AMDGPU::LOAD_CNT)
      StrictDom |= mergeScore(M, LastFlatLoadCnt, Other.LastFlatLoadCnt);

    if (T == AMDGPU::DS_CNT) {
      StrictDom |= mergeScore(M, LastFlatDsCnt, Other.LastFlatDsCnt);
      StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
    }

    if (T == AMDGPU::KM_CNT) {
      StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
      if (Other.hasPendingEvent(SCC_WRITE)) {
        if (!OldEvents.contains(SCC_WRITE)) {
          // Only Other had a pending SCC write; adopt its writer.
          PendingSCCWrite = Other.PendingSCCWrite;
        } else if (PendingSCCWrite != Other.PendingSCCWrite) {
          // Both sides pend SCC writes from different instructions, so the
          // writer is no longer uniquely known.
          PendingSCCWrite = nullptr;
        }
      }
    }

    for (auto &[RegID, Info] : VMem)
      StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));

    if (isSmemCounter(T)) {
      // A register missing from Other's map behaves as score 0 (no pending
      // SMEM work on it).
      for (auto &[RegID, Info] : SGPRs) {
        auto It = Other.SGPRs.find(RegID);
        unsigned OtherScore = (It != Other.SGPRs.end()) ? It->second.get(T) : 0;
        StrictDom |= mergeScore(M, Info.get(T), OtherScore);
      }
    }
  }

  // Union the per-register pending VMEM type masks; gaining a new type also
  // strictly tightens the state.
  for (auto &[TID, Info] : VMem) {
    if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
      unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
      StrictDom |= NewVmemTypes != Info.VMEMTypes;
      Info.VMEMTypes = NewVmemTypes;
    }
  }

  StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks);
  for (auto T : inst_counter_types(Context->MaxCounter))
    StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]);

  // Drop any tracking entries that stayed empty after the merge.
  purgeEmptyTrackingData();
  return StrictDom;
}
3181
3182static bool isWaitInstr(MachineInstr &Inst) {
3183 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
3184 return Opcode == AMDGPU::S_WAITCNT ||
3185 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
3186 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
3187 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
3188 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
3189 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
3190 Opcode == AMDGPU::WAIT_ASYNCMARK ||
3191 counterTypeForInstr(Opcode).has_value();
3192}
3193
// Emit an S_SETREG that switches the hardware scheduling mode: writes 2 to
// select expert mode, 0 to restore the default mode.
void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
// NOTE(review): a parameter line (the insertion iterator `I` used by BuildMI
// below) is missing from this excerpt; confirm against the full source.
                                         bool ExpertMode) const {
  const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
      // NOTE(review): the encode() argument line (hwreg id/offset/width) is
      // missing from this excerpt; confirm against the full source.
  BuildMI(MBB, I, DebugLoc(), TII.get(AMDGPU::S_SETREG_IMM32_B32))
      .addImm(ExpertMode ? 2 : 0)
      .addImm(EncodedReg);
}
3203
3204namespace {
3205// TODO: Remove this work-around after fixing the scheduler.
3206// There are two reasons why vccz might be incorrect; see ST.hasReadVCCZBug()
3207// and ST.partialVCCWritesUpdateVCCZ().
3208// i. VCCZBug: There is a hardware bug on CI/SI where SMRD instruction may
3209// corrupt vccz bit, so when we detect that an instruction may read from
3210// a corrupt vccz bit, we need to:
3211// 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
3212// operations to complete.
3213// 2. Recompute the correct value of vccz by writing the current value
3214// of vcc back to vcc.
3215// ii. Partial writes to vcc don't update vccz, so we need to recompute the
3216// correct value of vccz by reading vcc and writing it back to vcc.
3217// No waitcnt is needed in this case.
class VCCZWorkaround {
  const WaitcntBrackets &ScoreBrackets;
  const GCNSubtarget &ST;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  // Cached ST.hasReadVCCZBug(): completing SMRD may corrupt vccz (case i
  // in the comment above this class).
  bool VCCZCorruptionBug = false;
  // Cached !ST.partialVCCWritesUpdateVCCZ(): partial vcc writes leave vccz
  // stale (case ii in the comment above this class).
  bool VCCZNotUpdatedByPartialWrites = false;
  /// vccz could be incorrect at a basic block boundary if a predecessor wrote
  /// to vcc and then issued an smem load, so initialize to true.
  bool MustRecomputeVCCZ = true;

public:
  VCCZWorkaround(const WaitcntBrackets &ScoreBrackets, const GCNSubtarget &ST,
                 const SIInstrInfo &TII, const SIRegisterInfo &TRI)
      : ScoreBrackets(ScoreBrackets), ST(ST), TII(TII), TRI(TRI) {
    VCCZCorruptionBug = ST.hasReadVCCZBug();
    VCCZNotUpdatedByPartialWrites = !ST.partialVCCWritesUpdateVCCZ();
  }
  /// If \p MI reads vccz and we must recompute it based on MustRecomputeVCCZ,
  /// then emit a vccz recompute instruction before \p MI. This needs to be
  /// called on every instruction in the basic block because it also tracks the
  /// state and updates MustRecomputeVCCZ accordingly. Returns true if it
  /// modified the IR.
  bool tryRecomputeVCCZ(MachineInstr &MI) {
    // No need to run this if neither bug is present.
    if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
      return false;

    // If MI is an SMEM and it can corrupt vccz on this target, then we need
    // both to emit a waitcnt and to recompute vccz.
    // But we don't actually emit a waitcnt here. This is done in
    // generateWaitcntInstBefore() because it tracks all the necessary waitcnt
    // state, and can either skip emitting a waitcnt if there is already one in
    // the IR, or emit an "optimized" combined waitcnt.
    // If this is an smem read, it could complete and clobber vccz at any time.
    MustRecomputeVCCZ |= VCCZCorruptionBug && TII.isSMRD(MI);

    // If the target partial vcc writes don't update vccz, and MI is such an
    // instruction then we must recompute vccz.
    // Note: We are using PartiallyWritesToVCCOpt optional to avoid calling
    // `definesRegister()` more than needed, because it's not very cheap.
    std::optional<bool> PartiallyWritesToVCCOpt;
    auto PartiallyWritesToVCC = [](MachineInstr &MI) {
      return MI.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
             MI.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr);
    };
    if (VCCZNotUpdatedByPartialWrites) {
      PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
      // If this is a partial VCC write but won't update vccz, then we must
      // recompute vccz.
      MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
    }

    // If MI is a vcc write with no pending smem, or there is a pending smem
    // but the target does not suffer from the vccz corruption bug, then we
    // don't need to recompute vccz as this write will recompute it anyway.
    if (!ScoreBrackets.hasPendingEvent(SMEM_ACCESS) || !VCCZCorruptionBug) {
      // Compute PartiallyWritesToVCCOpt if we haven't done so already.
      if (!PartiallyWritesToVCCOpt)
        PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
      bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
                              MI.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr);
      // If we write to the full vcc or we write partially and the target
      // updates vccz on partial writes, then vccz will be updated correctly.
      bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
                                              *PartiallyWritesToVCCOpt);
      if (UpdatesVCCZ)
        MustRecomputeVCCZ = false;
    }

    // If MI is a branch that reads VCCZ then emit a waitcnt and a vccz
    // restore instruction if either is needed.
    if (SIInstrInfo::isCBranchVCCZRead(MI) && MustRecomputeVCCZ) {
      // Recompute the vccz bit. Any time a value is written to vcc, the vccz
      // bit is updated, so we can restore the bit by reading the value of vcc
      // and then writing it back to the register.
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII.get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI.getVCC())
          .addReg(TRI.getVCC());
      MustRecomputeVCCZ = false;
      return true;
    }
    return false;
  }
};
3304
3305} // namespace
3306
// Generate s_waitcnt instructions where needed.
// Walks \p Block in order, inserting waits before each instruction as
// required by \p ScoreBrackets, and updates the brackets with the events each
// instruction produces. Returns true if the IR was modified.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG({
    dbgs() << "*** Begin Block: ";
    Block.printName(dbgs());
    ScoreBrackets.dump();
  });
  VCCZWorkaround VCCZW(ScoreBrackets, ST, TII, TRI);

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  // NOTE: We may append instrs after Inst while iterating.
  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E; ++Iter) {
    MachineInstr &Inst = *Iter;
    // Meta instructions (other than waitcnts) neither need nor produce waits.
    if (isNonWaitcntMetaInst(Inst))
      continue;
    // Track pre-existing waitcnts that were added in earlier iterations or by
    // the memory legalizer.
    if (isWaitInstr(Inst) ||
        (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      continue;
    }

    // Loop-preheader counter flushing applies only at the first terminator.
    PreheaderFlushFlags FlushFlags;
    if (Block.getFirstTerminator() == Inst)
      FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
                                          FlushFlags);
    OldWaitcntInstr = nullptr;

    if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
      // Asyncmarks record the current wait state and so should not allow
      // waitcnts that occur after them to be merged into waitcnts that occur
      // before.
      ScoreBrackets.recordAsyncMark(Inst);
      continue;
    }

    if (TII.isSMRD(Inst)) {
      // Remember scalar-load addresses so later vector stores to the same
      // location can be ordered against them.
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        // No need to handle invariant loads when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
          SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
        }
      }
    }

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

    // Note: insertForcedWaitAfter() may add instrs after Iter that need to be
    // visited by the loop.
    Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    // If the target suffers from the vccz bugs, this may emit the necessary
    // vccz recompute instruction before \p Inst if needed.
    Modified |= VCCZW.tryRecomputeVCCZ(Inst);
  }

  // Flush counters at the end of the block if needed (for preheaders with no
  // terminator).
  AMDGPU::Waitcnt Wait;
  if (Block.getFirstTerminator() == Block.end()) {
    PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
    if (FlushFlags.FlushVmCnt) {
      if (ScoreBrackets.hasPendingEvent(AMDGPU::LOAD_CNT))
        Wait.set(AMDGPU::LOAD_CNT, 0);
      if (ScoreBrackets.hasPendingEvent(AMDGPU::SAMPLE_CNT))
        Wait.set(AMDGPU::SAMPLE_CNT, 0);
      if (ScoreBrackets.hasPendingEvent(AMDGPU::BVH_CNT))
        Wait.set(AMDGPU::BVH_CNT, 0);
    }
    if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
      Wait.set(AMDGPU::DS_CNT, 0);
  }

  // Combine or remove any redundant waitcnts at the end of the block.
  Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
                              OldWaitcntInstr);

  LLVM_DEBUG({
    dbgs() << "*** End Block: ";
    Block.printName(dbgs());
    ScoreBrackets.dump();
  });

  return Modified;
}
3412
3413bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
3414 if (Block.size() <= 1)
3415 return false;
3416 // The Memory Legalizer conservatively inserts a soft xcnt before each
3417 // atomic RMW operation. However, for sequences of back-to-back atomic
3418 // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
3419 // the redundant soft xcnts.
3420 bool Modified = false;
3421 // Remember the last atomic with a soft xcnt right before it.
3422 MachineInstr *LastAtomicWithSoftXcnt = nullptr;
3423
3424 for (MachineInstr &MI : drop_begin(Block)) {
3425 // Ignore last atomic if non-LDS VMEM and SMEM.
3426 bool IsLDS =
3427 TII.isDS(MI) || (TII.isFLAT(MI) && TII.mayAccessLDSThroughFlat(MI));
3428 if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
3429 LastAtomicWithSoftXcnt = nullptr;
3430
3431 bool IsAtomicRMW = (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) &&
3432 MI.mayLoad() && MI.mayStore();
3433 MachineInstr &PrevMI = *MI.getPrevNode();
3434 // This is an atomic with a soft xcnt.
3435 if (PrevMI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3436 // If we have already found an atomic with a soft xcnt, remove this soft
3437 // xcnt as it's redundant.
3438 if (LastAtomicWithSoftXcnt) {
3439 PrevMI.eraseFromParent();
3440 Modified = true;
3441 }
3442 LastAtomicWithSoftXcnt = &MI;
3443 }
3444 }
3445 return Modified;
3446}
3447
3448// Return flags indicating which counters should be flushed in the preheader.
3449PreheaderFlushFlags
3450SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
3451 const WaitcntBrackets &ScoreBrackets) {
3452 auto [Iterator, IsInserted] =
3453 PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags());
3454 if (!IsInserted)
3455 return Iterator->second;
3456
3457 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3458 if (!Succ)
3459 return PreheaderFlushFlags();
3460
3461 MachineLoop *Loop = MLI.getLoopFor(Succ);
3462 if (!Loop)
3463 return PreheaderFlushFlags();
3464
3465 if (Loop->getLoopPreheader() == &MBB) {
3466 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3467 return Iterator->second;
3468 }
3469
3470 return PreheaderFlushFlags();
3471}
3472
// Returns whether MI is a VMEM access, treating FLAT instructions as VMEM
// when they may access VMEM.
// NOTE(review): the extraction appears to have dropped a guard line before
// the first return (presumably `if (SIInstrInfo::isFLAT(MI))`) -- as written
// the second return is unreachable. Confirm against the original source.
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
    return TII.mayAccessVMEMThroughFlat(MI);
  return SIInstrInfo::isVMEM(MI);
}
3478
3479bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
3480 return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
3481}
3482
3483// Check if instruction is a store to LDS that is counted via DSCNT
3484// (where that counter exists).
3485bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3486 return MI.mayStore() && SIInstrInfo::isDS(MI);
3487}
3488
// Return flags indicating which counters should be flushed in the preheader of
// the given loop. We currently decide to flush in the following situations:
// For VMEM (FlushVmCnt):
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
//    vgpr containing a value that is loaded outside of the loop. (Only on
//    targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the
//    loop, and at least one use of a vgpr containing a value that is loaded
//    outside of the loop.
// For DS (FlushDsCnt, GFX12+ only):
// 3. The loop contains no DS reads, and at least one use of a vgpr containing
//    a value that is DS read outside of the loop.
// 4. The loop contains DS read(s), loaded values are not used in the same
//    iteration but in the next iteration (prefetch pattern), and at least one
//    use of a vgpr containing a value that is DS read outside of the loop.
//    Flushing in preheader reduces wait overhead if the wait requirement in
//    iteration 1 would otherwise be more strict (but unfortunately preheader
//    flush decision is taken before knowing that).
// 5. (Single-block loops only) The loop has DS prefetch reads with flush point
//    tracking. Some DS reads may be used in the same iteration (creating
//    "flush points"), but others remain unflushed at the backedge. When a DS
//    read is consumed in the same iteration, it and all prior reads are
//    "flushed" (FIFO order). No DS writes are allowed in the loop.
// TODO: Find a way to extend to multi-block loops.
PreheaderFlushFlags
SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
                                         const WaitcntBrackets &Brackets) {
  PreheaderFlushFlags Flags;
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  bool UsesVgprVMEMLoadedOutside = false;
  bool UsesVgprDSReadOutside = false;
  bool VMemInvalidated = false;
  // DS optimization only applies to GFX12+ where DS_CNT is separate.
  // Tracking status for "no DS read in loop" or "pure DS prefetch
  // (use only in next iteration)".
  bool TrackSimpleDSOpt = ST.hasExtendedWaitCounts();
  DenseSet<MCRegUnit> VgprUse;
  DenseSet<MCRegUnit> VgprDefVMEM;
  DenseSet<MCRegUnit> VgprDefDS;

  // Track DS reads for prefetch pattern with flush points (single-block only).
  // Keeps track of the last DS read (position counted from the top of the loop)
  // to each VGPR. Read is considered consumed (and thus needs flushing) if
  // the dest register has a use or is overwritten (by any later operations).
  DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
  unsigned DSReadPosition = 0;
  bool IsSingleBlock = ML->getNumBlocks() == 1;
  bool TrackDSFlushPoint = ST.hasExtendedWaitCounts() && IsSingleBlock;
  unsigned LastDSFlushPosition = 0;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
        HasVMemLoad |= MI.mayLoad();
        HasVMemStore |= MI.mayStore();
      }
      // TODO: Can we relax DSStore check? There may be cases where
      // these DS stores are drained prior to the end of MBB (or loop).
      if (mayStoreIncrementingDSCNT(MI)) {
        // Early exit if none of the optimizations are feasible.
        // Otherwise, set tracking status appropriately and continue.
        if (VMemInvalidated)
          return Flags;
        TrackSimpleDSOpt = false;
        TrackDSFlushPoint = false;
      }
      bool IsDSRead = isDSRead(MI);
      if (IsDSRead)
        ++DSReadPosition;

      // Helper: if RU has a pending DS read, update LastDSFlushPosition
      auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
        if (!TrackDSFlushPoint)
          return;
        if (auto It = LastDSReadPositionMap.find(RU);
            It != LastDSReadPositionMap.end()) {
          // RU defined by DSRead is used or overwritten. Need to complete
          // the read, if not already implied by a later DSRead (to any RU)
          // needing to complete in FIFO order.
          LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
        }
      };

      for (const MachineOperand &Op : MI.all_uses()) {
        if (Op.isDebug() || !TRI.isVectorRegister(MRI, Op.getReg()))
          continue;
        // Vgpr use
        for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
          // If we find a register that is loaded inside the loop, 1. and 2.
          // are invalidated.
          if (VgprDefVMEM.contains(RU))
            VMemInvalidated = true;

          // Check for DS reads used inside the loop
          if (VgprDefDS.contains(RU))
            TrackSimpleDSOpt = false;

          // Early exit if all optimizations are invalidated
          if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
            return Flags;

          // Check for flush points (DS read used in same iteration)
          updateDSReadFlushTracking(RU);

          VgprUse.insert(RU);
          // Check if this register has a pending VMEM load from outside the
          // loop (value loaded outside and used inside).
          VMEMID ID = toVMEMID(RU);
          if (Brackets.hasPendingVMEM(ID, AMDGPU::LOAD_CNT) ||
              Brackets.hasPendingVMEM(ID, AMDGPU::SAMPLE_CNT) ||
              Brackets.hasPendingVMEM(ID, AMDGPU::BVH_CNT))
            UsesVgprVMEMLoadedOutside = true;
          // Check if loaded outside the loop via DS (not VMEM/FLAT).
          // Only consider it a DS read if there's no pending VMEM load for
          // this register, since FLAT can set both counters.
          else if (Brackets.hasPendingVMEM(ID, AMDGPU::DS_CNT))
            UsesVgprDSReadOutside = true;
        }
      }

      // VMem load vgpr def
      if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
        for (const MachineOperand &Op : MI.all_defs()) {
          for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated.
            if (VgprUse.contains(RU))
              VMemInvalidated = true;
            VgprDefVMEM.insert(RU);
          }
        }
        // Early exit if all optimizations are invalidated
        if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
          return Flags;
      }

      // DS read vgpr def
      // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RegNo).
      // If USE comes before DEF, it's the prefetch pattern (use value from
      // previous iteration, read for next iteration). We should still flush
      // in preheader so iteration 1 doesn't need to wait inside the loop.
      // Only invalidate when DEF comes before USE (same-iteration consumption,
      // checked above when processing uses).
      if (IsDSRead || TrackDSFlushPoint) {
        for (const MachineOperand &Op : MI.all_defs()) {
          if (!TRI.isVectorRegister(MRI, Op.getReg()))
            continue;
          for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
            // Check for overwrite of pending DS read (flush point) by any
            // instruction
            updateDSReadFlushTracking(RU);
            if (IsDSRead) {
              VgprDefDS.insert(RU);
              if (TrackDSFlushPoint)
                LastDSReadPositionMap[RU] = DSReadPosition;
            }
          }
        }
      }
    }
  }

  // VMEM flush decision
  if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
      ((!ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
       (HasVMemLoad && ST.hasVmemWriteVgprInOrder())))
    Flags.FlushVmCnt = true;

  // DS flush decision:
  // Simple DS Opt: flush if loop uses DS read values from outside
  // and either has no DS reads in the loop, or DS reads whose results
  // are not used in the loop.
  bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
  // Prefetch with flush points: some DS reads used in same iteration,
  // but unflushed reads remain at backedge
  bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
  bool DSFlushPointPrefetch =
      TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;

  if (SimpleDSOpt || DSFlushPointPrefetch)
    Flags.FlushDsCnt = true;

  return Flags;
}
3674
3675bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3676 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3677 auto &PDT =
3678 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3679 AliasAnalysis *AA = nullptr;
3680 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3681 AA = &AAR->getAAResults();
3682
3683 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3684}
3685
// New-PM entry point for the SIInsertWaitcnts pass.
// NOTE(review): the extraction dropped several source lines in this function:
// the run() signature parameters, the beginning of the AliasAnalysis lookup
// expression (before .getManager()), and the beginning of the returned
// PreservedAnalyses expression (before .preserve<AAManager>()). Confirm
// against the original source before relying on this text.
PreservedAnalyses
  auto &MLI = MFAM.getResult<MachineLoopAnalysis>(MF);
  auto &PDT = MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
          .getManager()
          .getCachedResult<AAManager>(MF.getFunction());

  // If the pass made no changes, all analyses remain valid.
  if (!SIInsertWaitcnts(MLI, PDT, AA, MF).run())
    return PreservedAnalyses::all();

      .preserve<AAManager>();
}
3702
// Main driver of wait-count insertion: set up the per-function waitcnt
// generator, insert entry waits for non-kernel functions, iterate the blocks
// in reverse post order to a fixed point inserting/updating waits, then
// perform the scalar-store cache-flush, expert-scheduling-mode toggling, and
// VGPR-deallocation epilogue work.
// NOTE(review): the extraction dropped a number of source lines in this
// function; each elision is marked below. Confirm against the original source.
bool SIInsertWaitcnts::run() {
  // [source line elided in extraction]
  // [source line elided in extraction]

  // Initialize hardware limits first, as they're needed by the generators.
  Limits = AMDGPU::HardwareLimits(IV);

  if (ST.hasExtendedWaitCounts()) {
    // Expert mode comes from the command-line flag when given, otherwise from
    // the function attribute.
    IsExpertMode = ST.hasExpertSchedulingMode() &&
                   (ExpertSchedulingModeFlag.getNumOccurrences()
                        // [source line elided in extraction]
                        : MF.getFunction()
                              .getFnAttribute("amdgpu-expert-scheduling-mode")
                              .getValueAsBool());
    MaxCounter = IsExpertMode ? AMDGPU::NUM_EXPERT_INST_CNTS
                 // [source line elided in extraction]
    // Initialize WCG per MF. It contains state that depends on MF attributes.
    WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,
                                                      IsExpertMode);
  } else {
    MaxCounter = AMDGPU::NUM_NORMAL_INST_CNTS;
    // Initialize WCG per MF. It contains state that depends on MF attributes.
    WCG = std::make_unique<WaitcntGeneratorPreGFX12>(
        MF, AMDGPU::NUM_NORMAL_INST_CNTS, Limits);
  }

  SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);

  bool Modified = false;

  MachineBasicBlock &EntryBB = MF.front();

  if (!MFI->isEntryFunction() &&
      !MF.getFunction().hasFnAttribute(Attribute::Naked)) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    // [source line elided in extraction: iterator initialization]
    while (I != EntryBB.end() && I->isMetaInstruction())
      ++I;

    if (ST.hasExtendedWaitCounts()) {
      BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
          .addImm(0);
      // [source line elided in extraction: loop header over counter types]
        if (CT == AMDGPU::LOAD_CNT || CT == AMDGPU::DS_CNT ||
            CT == AMDGPU::STORE_CNT || CT == AMDGPU::X_CNT ||
            CT == AMDGPU::ASYNC_CNT)
          continue;

        if (!ST.hasImageInsts() &&
            (CT == AMDGPU::EXP_CNT || CT == AMDGPU::SAMPLE_CNT ||
             CT == AMDGPU::BVH_CNT))
          continue;

        BuildMI(EntryBB, I, DebugLoc(),
                TII.get(instrsForExtendedCounterTypes[CT]))
            .addImm(0);
      }
      if (IsExpertMode) {
        unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, ST);
        // [source line elided in extraction]
        BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR))
            .addImm(Enc);
      }
    } else {
      BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT)).addImm(0);
    }

    // Seed the entry block with a state assuming everything may be pending
    // on function entry.
    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
    NonKernelInitialState->setStateOnFunctionEntryOrReturn();
    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);

    Modified = true;
  }

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fix point is reached.
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.try_emplace(MBB);

  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      MachineBasicBlock *MBB = BII->first;
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets) {
          Brackets = std::make_unique<WaitcntBrackets>(this);
        } else {
          // Reinitialize in-place. N.B. do not do this by assigning from a
          // temporary because the WaitcntBrackets class is large and it could
          // cause this function to use an unreasonable amount of stack space.
          Brackets->~WaitcntBrackets();
          new (Brackets.get()) WaitcntBrackets(this);
        }
      }

      if (ST.hasWaitXcnt())
        Modified |= removeRedundantSoftXcnts(*MBB);
      Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPendingEvent()) {
        // Propagate the outgoing state to every successor; move (rather than
        // copy) the brackets into the first successor without incoming state.
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : MBB->successors()) {
          auto *SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII) {
              LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");
              Repeat = true;
            }
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else {
            LLVM_DEBUG({
              dbgs() << "Try to merge ";
              MBB->printName(dbgs());
              dbgs() << " into ";
              Succ->printName(dbgs());
              dbgs() << '\n';
            });
            if (SuccBI.Incoming->merge(*Brackets)) {
              SuccBI.Dirty = true;
              if (SuccBII <= BII) {
                LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");
                Repeat = true;
              }
            }
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);

  if (ST.hasScalarStores()) {
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII.isScalarStore(MI))
          HaveScalarStores = true;

        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(&MBB);
      }
    }

    if (HaveScalarStores) {
      // If scalar writes are used, the cache must be flushed or else the next
      // wave to reuse the same scratch memory can be clobbered.
      //
      // Insert s_dcache_wb at wave termination points if there were any scalar
      // stores, and only if the cache hasn't already been flushed. This could
      // be improved by looking across blocks for flushes in postdominating
      // blocks from the stores but an explicitly requested flush is probably
      // very rare.
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;

        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII.isScalarStore(*I))
            SeenDCacheWB = false;

          // FIXME: It would be better to insert this before a waitcnt if any.
          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            Modified = true;
            BuildMI(*MBB, I, I->getDebugLoc(), TII.get(AMDGPU::S_DCACHE_WB));
          }
        }
      }
    }
  }

  if (IsExpertMode) {
    // Enable expert scheduling on function entry. To satisfy ABI requirements
    // and to allow calls between function with different expert scheduling
    // settings, disable it around calls and before returns.

    // [source line elided in extraction: iterator initialization]
    while (I != EntryBB.end() && I->isMetaInstruction())
      ++I;
    setSchedulingMode(EntryBB, I, true);

    for (MachineInstr *MI : CallInsts) {
      MachineBasicBlock &MBB = *MI->getParent();
      setSchedulingMode(MBB, MI, false);
      setSchedulingMode(MBB, std::next(MI->getIterator()), true);
    }

    for (MachineInstr *MI : ReturnInsts)
      setSchedulingMode(*MI->getParent(), MI, false);

    Modified = true;
  }

  // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
  // This is done in different ways depending on how the VGPRs were allocated
  // (i.e. whether we're in dynamic VGPR mode or not).
  // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
  // waveslot limited kernel runs slower with the deallocation.
  if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
    for (auto [MI, _] : EndPgmInsts) {
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
              TII.get(AMDGPU::S_ALLOC_VGPR))
          .addImm(0);
      Modified = true;
    }
  } else if (!WCG->isOptNone() &&
             ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
             (MF.getFrameInfo().hasCalls() ||
              ST.getOccupancyWithNumVGPRs(
                  TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),
                  /*IsDynamicVGPR=*/false) <
                  // [source line elided in extraction: comparison operand]
    for (auto [MI, Flag] : EndPgmInsts) {
      if (Flag) {
        if (ST.requiresNopBeforeDeallocVGPRs()) {
          BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                  TII.get(AMDGPU::S_NOP))
              .addImm(0);
        }
        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                TII.get(AMDGPU::S_SENDMSG))
        // [source line elided in extraction: sendmsg operand]
        Modified = true;
      }
    }
  }

  return Modified;
}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
#define _
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
#define T
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
#define AMDGPU_EVENT_NAME(Name)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static std::optional< AMDGPU::InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static bool isWaitInstr(MachineInstr &Inst)
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
Provides some synthesis utilities to produce sequences of values.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
Represents the counter values to wait for in an s_waitcnt instruction.
unsigned get(InstCounterType T) const
void set(InstCounterType T, unsigned Val)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
static bool shouldExecute(CounterInfo &Counter)
static bool isCounterSet(CounterInfo &Info)
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:728
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as:
MachineInstrBundleIterator< MachineInstr > iterator
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
mop_range operands()
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
iterator end()
Definition MapVector.h:67
iterator find(const KeyT &Key)
Definition MapVector.h:154
iterator begin()
Definition MapVector.h:65
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
Definition Pass.cpp:140
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isGFX12CacheInvOrWBInst(unsigned Opc)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool usesASYNC_CNT(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void push_back(const T &Elt)
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:882
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isDPMACCInstruction(unsigned Opc)
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
bool empty() const
Definition BasicBlock.h:101
LLVM_ABI std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
Definition Sequence.h:325
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
APInt operator&(APInt a, const APInt &b)
Definition APInt.h:2152
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
Definition Sequence.h:337
@ Wait
Definition Threading.h:60
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2142
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2312
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ Async
"Asynchronous" unwind tables (instr precise)
Definition CodeGen.h:157
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool operator|=(SparseBitVector< ElementSize > &LHS, const SparseBitVector< ElementSize > *RHS)
APInt operator|(APInt a, const APInt &b)
Definition APInt.h:2172
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
#define N
static constexpr ValueType Default
static constexpr uint64_t encode(Fields... Values)
Represents the hardware counter limits for different wait count types.
Instruction set architecture version.