LLVM 23.0.0git
SIInsertWaitcnts.cpp
Go to the documentation of this file.
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "AMDGPUWaitcntUtils.h"
28#include "GCNSubtarget.h"
32#include "llvm/ADT/MapVector.h"
34#include "llvm/ADT/Sequence.h"
40#include "llvm/IR/Dominators.h"
44
45using namespace llvm;
46
47#define DEBUG_TYPE "si-insert-waitcnts"
48
49DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
50 "Force emit s_waitcnt expcnt(0) instrs");
51DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
52 "Force emit s_waitcnt lgkmcnt(0) instrs");
53DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
54 "Force emit s_waitcnt vmcnt(0) instrs");
55
56static cl::opt<bool>
57 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
58 cl::desc("Force all waitcnt instrs to be emitted as "
59 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
60 cl::init(false), cl::Hidden);
61
63 "amdgpu-waitcnt-load-forcezero",
64 cl::desc("Force all waitcnt load counters to wait until 0"),
65 cl::init(false), cl::Hidden);
66
68 "amdgpu-expert-scheduling-mode",
69 cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
70 cl::init(false), cl::Hidden);
71
72namespace {
73// Get the maximum wait count value for a given counter type.
74static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
76 switch (T) {
78 return Limits.LoadcntMax;
79 case AMDGPU::DS_CNT:
80 return Limits.DscntMax;
81 case AMDGPU::EXP_CNT:
82 return Limits.ExpcntMax;
84 return Limits.StorecntMax;
86 return Limits.SamplecntMax;
87 case AMDGPU::BVH_CNT:
88 return Limits.BvhcntMax;
89 case AMDGPU::KM_CNT:
90 return Limits.KmcntMax;
91 case AMDGPU::X_CNT:
92 return Limits.XcntMax;
93 case AMDGPU::VA_VDST:
94 return Limits.VaVdstMax;
95 case AMDGPU::VM_VSRC:
96 return Limits.VmVsrcMax;
97 default:
98 return 0;
99 }
100}
101
/// Calls \p EmitWaitcnt with every value from (Outstanding - 1) down to
/// (Target + 1), in decreasing order, and finally with \p Target itself.
/// \p Target is always emitted, even when \p Outstanding is zero or does not
/// exceed \p Target.
template <typename EmitWaitcntFn>
static void EmitExpandedWaitcnt(unsigned Outstanding, unsigned Target,
                                EmitWaitcntFn &&EmitWaitcnt) {
  // With nothing outstanding there are no intermediate counts to emit;
  // skipping the countdown also keeps (Outstanding - 1) from wrapping to ~0u.
  if (Outstanding != 0) {
    unsigned Count = Outstanding - 1;
    while (Count > Target) {
      EmitWaitcnt(Count);
      --Count;
    }
  }
  EmitWaitcnt(Target);
}
110
111/// Integer IDs used to track vector memory locations we may have to wait on.
112/// Encoded as u16 chunks:
113///
114/// [0, REGUNITS_END ): MCRegUnit
115/// [LDSDMA_BEGIN, LDSDMA_END ) : LDS DMA IDs
116///
117/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
118/// It gives (1 << 16) entries per category which is more than enough
119/// for all register units. MCPhysReg is u16 so we don't even support >u16
120/// physical register numbers at this time, let alone >u16 register units.
121/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
122/// is enough for all register units.
using VMEMID = uint32_t;

enum : VMEMID {
  // Number of IDs reserved for each tracking category.
  TRACKINGID_RANGE_LEN = 1u << 16,

  // Register units occupy the first range. They must start at 0 so that a
  // MCRegUnit and a VMEMID can be converted into one another freely.
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,

  // LDS DMA IDs occupy the range directly after the register units.
  // LDSDMA_BEGIN itself is the "common" entry, which is updated for all LDS
  // DMA operations encountered; specific LDS DMA IDs start at
  // LDSDMA_BEGIN + 1.
  LDSDMA_BEGIN = REGUNITS_END,
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};
140
141/// Convert a MCRegUnit to a VMEMID.
142static constexpr VMEMID toVMEMID(MCRegUnit RU) {
143 return static_cast<unsigned>(RU);
144}
145
// X-macro that lists every wait event type. It expands to EVENT(Name) once
// per event, in the order given below, so a caller can generate parallel
// tables from this single list by supplying its own EVENT macro.
#define AMDGPU_DECLARE_WAIT_EVENTS(EVENT)                                      \
  /* vmem read & write (pre-gfx10), vmem read (gfx10+) */                      \
  EVENT(VMEM_ACCESS)                                                           \
  /* vmem SAMPLER read (gfx12+ only) */                                        \
  EVENT(VMEM_SAMPLER_READ_ACCESS)                                              \
  /* vmem BVH read (gfx12+ only) */                                            \
  EVENT(VMEM_BVH_READ_ACCESS)                                                  \
  /* GLOBAL_INV (gfx12+ only) */                                               \
  EVENT(GLOBAL_INV_ACCESS)                                                     \
  /* vmem write that is not scratch */                                         \
  EVENT(VMEM_WRITE_ACCESS)                                                     \
  /* vmem write that may be scratch */                                         \
  EVENT(SCRATCH_WRITE_ACCESS)                                                  \
  /* vmem group */                                                             \
  EVENT(VMEM_GROUP)                                                            \
  /* lds read & write */                                                       \
  EVENT(LDS_ACCESS)                                                            \
  /* gds read & write */                                                       \
  EVENT(GDS_ACCESS)                                                            \
  /* send message */                                                           \
  EVENT(SQ_MESSAGE)                                                            \
  /* write to SCC from barrier */                                              \
  EVENT(SCC_WRITE)                                                             \
  /* scalar-memory read & write */                                             \
  EVENT(SMEM_ACCESS)                                                           \
  /* scalar-memory group */                                                    \
  EVENT(SMEM_GROUP)                                                            \
  /* export holding on its data src */                                         \
  EVENT(EXP_GPR_LOCK)                                                          \
  /* GDS holding on its data and addr src */                                   \
  EVENT(GDS_GPR_LOCK)                                                          \
  /* write to export position */                                               \
  EVENT(EXP_POS_ACCESS)                                                        \
  /* write to export parameter */                                              \
  EVENT(EXP_PARAM_ACCESS)                                                      \
  /* vmem write holding on its data src */                                     \
  EVENT(VMW_GPR_LOCK)                                                          \
  /* read by ldsdir counting as export */                                      \
  EVENT(EXP_LDS_ACCESS)                                                        \
  /* write VGPR dest in Core/Side-MACC VALU */                                 \
  EVENT(VGPR_CSMACC_WRITE)                                                     \
  /* write VGPR dest in DPMACC VALU */                                         \
  EVENT(VGPR_DPMACC_WRITE)                                                     \
  /* write VGPR dest in TRANS VALU */                                          \
  EVENT(VGPR_TRANS_WRITE)                                                      \
  /* write VGPR dest in XDL VALU */                                            \
  EVENT(VGPR_XDL_WRITE)                                                        \
  /* read VGPR source in LDS */                                                \
  EVENT(VGPR_LDS_READ)                                                         \
  /* read VGPR source in FLAT */                                               \
  EVENT(VGPR_FLAT_READ)                                                        \
  /* read VGPR source in other VMEM */                                         \
  EVENT(VGPR_VMEM_READ)                                                        \
  /* access that uses ASYNC_CNT */                                             \
  EVENT(ASYNC_ACCESS)                                                          \
  /* access that uses TENSOR_CNT */                                            \
  EVENT(TENSOR_ACCESS)
175
176// clang-format off
177#define AMDGPU_EVENT_ENUM(Name) Name,
178enum WaitEventType {
180 NUM_WAIT_EVENTS
181};
182#undef AMDGPU_EVENT_ENUM
183} // namespace
184
185namespace llvm {
186template <> struct enum_iteration_traits<WaitEventType> {
187 static constexpr bool is_iterable = true;
188};
189} // namespace llvm
190
191namespace {
192
193/// Return an iterator over all events between VMEM_ACCESS (the first event)
194/// and \c MaxEvent (exclusive, default value yields an enumeration over
195/// all counters).
196auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
197 return enum_seq(VMEM_ACCESS, MaxEvent);
198}
199
200#define AMDGPU_EVENT_NAME(Name) #Name,
201static constexpr StringLiteral WaitEventTypeName[] = {
203};
204#undef AMDGPU_EVENT_NAME
205static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
206 return WaitEventTypeName[Event];
207}
208// clang-format on
209
// Kinds of result-returning VMEM operations. s_waitcnt orders all of them
// with a single vmcnt counter, but without an s_waitcnt only instructions of
// the same VmemType are guaranteed to write their results in order. Hence no
// s_waitcnt is needed between two instructions of the same type that write
// the same vgpr.
enum VmemType {
  VMEM_NOSAMPLER = 0, // BUF instructions and MIMG instructions without a
                      // sampler.
  VMEM_SAMPLER,       // MIMG instructions with a sampler.
  VMEM_BVH,           // BVH instructions.
  NUM_VMEM_TYPES      // Number of VmemType values; not a type itself.
};
224
225// Maps values of InstCounterType to the instruction that waits on that
226// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
227// returns true, and does not cover VA_VDST or VM_VSRC.
228static const unsigned
229 instrsForExtendedCounterTypes[AMDGPU::NUM_EXTENDED_INST_CNTS] = {
230 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT,
231 AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT,
232 AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
233 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT,
234 AMDGPU::S_WAIT_ASYNCCNT, AMDGPU::S_WAIT_TENSORCNT};
235
236// ASYNCMARK and WAIT_ASYNCMARK are meta instructions that emit no hardware
237// code but still need to be processed by this pass for async vmcnt tracking.
238static bool isNonWaitcntMetaInst(const MachineInstr &MI) {
239 switch (MI.getOpcode()) {
240 case AMDGPU::ASYNCMARK:
241 case AMDGPU::WAIT_ASYNCMARK:
242 return false;
243 default:
244 return MI.isMetaInstruction();
245 }
246}
247
248static bool updateVMCntOnly(const MachineInstr &Inst) {
249 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
251}
252
253#ifndef NDEBUG
254static bool isNormalMode(AMDGPU::InstCounterType MaxCounter) {
255 return MaxCounter == AMDGPU::NUM_NORMAL_INST_CNTS;
256}
257#endif // NDEBUG
258
259VmemType getVmemType(const MachineInstr &Inst) {
260 assert(updateVMCntOnly(Inst));
261 if (!SIInstrInfo::isImage(Inst))
262 return VMEM_NOSAMPLER;
263 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
264 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
265 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
266
267 if (BaseInfo->BVH)
268 return VMEM_BVH;
269
270 // We have to make an additional check for isVSAMPLE here since some
271 // instructions don't have a sampler, but are still classified as sampler
272 // instructions for the purposes of e.g. waitcnt.
273 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
274 return VMEM_SAMPLER;
275
276 return VMEM_NOSAMPLER;
277}
278
279void addWait(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T, unsigned Count) {
280 Wait.set(T, std::min(Wait.get(T), Count));
281}
282
284 Wait.set(T, ~0u);
285}
286
287/// A small set of events.
288class WaitEventSet {
289 unsigned Mask = 0;
290
291public:
292 WaitEventSet() = default;
293 explicit constexpr WaitEventSet(WaitEventType Event) {
294 static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8,
295 "Not enough bits in Mask for all the events");
296 Mask |= 1 << Event;
297 }
298 constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
299 for (auto &E : Events) {
300 Mask |= 1 << E;
301 }
302 }
303 void insert(const WaitEventType &Event) { Mask |= 1 << Event; }
304 void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); }
305 void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; }
306 bool contains(const WaitEventType &Event) const {
307 return Mask & (1 << Event);
308 }
309 /// \Returns true if this set contains all elements of \p Other.
310 bool contains(const WaitEventSet &Other) const {
311 return (~Mask & Other.Mask) == 0;
312 }
313 /// \Returns the intersection of this and \p Other.
314 WaitEventSet operator&(const WaitEventSet &Other) const {
315 auto Copy = *this;
316 Copy.Mask &= Other.Mask;
317 return Copy;
318 }
319 /// \Returns the union of this and \p Other.
320 WaitEventSet operator|(const WaitEventSet &Other) const {
321 auto Copy = *this;
322 Copy.Mask |= Other.Mask;
323 return Copy;
324 }
325 /// This set becomes the union of this and \p Other.
326 WaitEventSet &operator|=(const WaitEventSet &Other) {
327 Mask |= Other.Mask;
328 return *this;
329 }
330 /// This set becomes the intersection of this and \p Other.
331 WaitEventSet &operator&=(const WaitEventSet &Other) {
332 Mask &= Other.Mask;
333 return *this;
334 }
335 bool operator==(const WaitEventSet &Other) const {
336 return Mask == Other.Mask;
337 }
338 bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); }
339 bool empty() const { return Mask == 0; }
340 /// \Returns true if the set contains more than one element.
341 bool twoOrMore() const { return Mask & (Mask - 1); }
342 operator bool() const { return !empty(); }
343 void print(raw_ostream &OS) const {
344 ListSeparator LS(", ");
345 for (WaitEventType Event : wait_events()) {
346 if (contains(Event))
347 OS << LS << getWaitEventTypeName(Event);
348 }
349 }
350 LLVM_DUMP_METHOD void dump() const;
351};
352
353void WaitEventSet::dump() const {
354 print(dbgs());
355 dbgs() << "\n";
356}
357
358class WaitcntBrackets;
359
360// This abstracts the logic for generating and updating S_WAIT* instructions
361// away from the analysis that determines where they are needed. This was
362// done because the set of counters and instructions for waiting on them
363// underwent a major shift with gfx12, sufficiently so that having this
364// abstraction allows the main analysis logic to be simpler than it would
365// otherwise have had to become.
366class WaitcntGenerator {
367protected:
368 const GCNSubtarget &ST;
369 const SIInstrInfo &TII;
370 AMDGPU::IsaVersion IV;
371 AMDGPU::InstCounterType MaxCounter;
372 bool OptNone;
373 bool ExpandWaitcntProfiling = false;
374 const AMDGPU::HardwareLimits &Limits;
375
376public:
377 WaitcntGenerator() = delete;
378 WaitcntGenerator(const WaitcntGenerator &) = delete;
379 WaitcntGenerator(const MachineFunction &MF,
380 AMDGPU::InstCounterType MaxCounter,
381 const AMDGPU::HardwareLimits &Limits)
382 : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
383 IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter),
384 OptNone(MF.getFunction().hasOptNone() ||
385 MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
386 ExpandWaitcntProfiling(
387 MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
388 Limits(Limits) {}
389
390 // Return true if the current function should be compiled with no
391 // optimization.
392 bool isOptNone() const { return OptNone; }
393
394 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
395
396 // Edits an existing sequence of wait count instructions according
397 // to an incoming Waitcnt value, which is itself updated to reflect
398 // any new wait count instructions which may need to be generated by
399 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
400 // were made.
401 //
402 // This editing will usually be merely updated operands, but it may also
403 // delete instructions if the incoming Wait value indicates they are not
404 // needed. It may also remove existing instructions for which a wait
405 // is needed if it can be determined that it is better to generate new
406 // instructions later, as can happen on gfx12.
407 virtual bool
408 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
409 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
411
412 // Transform a soft waitcnt into a normal one.
413 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
414
415 // Generates new wait count instructions according to the value of
416 // Wait, returning true if any new instructions were created.
417 // ScoreBrackets is used for profiling expansion.
418 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
420 AMDGPU::Waitcnt Wait,
421 const WaitcntBrackets &ScoreBrackets) = 0;
422
423 // Returns the WaitEventSet that corresponds to counter \p T.
424 virtual const WaitEventSet &
425 getWaitEvents(AMDGPU::InstCounterType T) const = 0;
426
427 /// \returns the counter that corresponds to event \p E.
428 AMDGPU::InstCounterType getCounterFromEvent(WaitEventType E) const {
429 for (auto T : AMDGPU::inst_counter_types()) {
430 if (getWaitEvents(T).contains(E))
431 return T;
432 }
433 llvm_unreachable("event type has no associated counter");
434 }
435
436 // Returns a new waitcnt with all counters except VScnt set to 0. If
437 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
438 // AsyncCnt and TensorCnt always default to ~0u (don't wait for it). They
439 // are only updated when a call to @llvm.amdgcn.wait.asyncmark() is
440 // processed.
441 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
442
443 virtual ~WaitcntGenerator() = default;
444};
445
446class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
447 static constexpr const WaitEventSet
448 WaitEventMaskForInstPreGFX12[AMDGPU::NUM_INST_CNTS] = {
449 WaitEventSet(
450 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
451 WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
452 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
453 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
454 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
455 WaitEventSet(),
456 WaitEventSet(),
457 WaitEventSet(),
458 WaitEventSet(),
459 WaitEventSet(),
460 WaitEventSet(),
461 WaitEventSet(),
462 WaitEventSet()};
463
464public:
465 using WaitcntGenerator::WaitcntGenerator;
466 bool
467 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
468 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
469 MachineBasicBlock::instr_iterator It) const override;
470
471 bool createNewWaitcnt(MachineBasicBlock &Block,
473 AMDGPU::Waitcnt Wait,
474 const WaitcntBrackets &ScoreBrackets) override;
475
476 const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
477 return WaitEventMaskForInstPreGFX12[T];
478 }
479
480 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
481};
482
483class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
484protected:
485 bool IsExpertMode;
486 static constexpr const WaitEventSet
487 WaitEventMaskForInstGFX12Plus[AMDGPU::NUM_INST_CNTS] = {
488 WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
489 WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
490 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
491 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
492 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
493 WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
494 WaitEventSet({VMEM_BVH_READ_ACCESS}),
495 WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
496 WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
497 WaitEventSet({ASYNC_ACCESS}),
498 WaitEventSet({TENSOR_ACCESS}),
499 WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
500 VGPR_XDL_WRITE}),
501 WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
502
503public:
504 WaitcntGeneratorGFX12Plus() = delete;
505 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
506 AMDGPU::InstCounterType MaxCounter,
507 const AMDGPU::HardwareLimits &Limits,
508 bool IsExpertMode)
509 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
510
511 bool
512 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
513 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
514 MachineBasicBlock::instr_iterator It) const override;
515
516 bool createNewWaitcnt(MachineBasicBlock &Block,
518 AMDGPU::Waitcnt Wait,
519 const WaitcntBrackets &ScoreBrackets) override;
520
521 const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
522 return WaitEventMaskForInstGFX12Plus[T];
523 }
524
525 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
526};
527
// Records which counters should be flushed in a loop preheader. Both flags
// default to false, i.e. nothing is flushed.
struct PreheaderFlushFlags {
  bool FlushVmCnt{false};
  bool FlushDsCnt{false};
};
533
534class SIInsertWaitcnts {
535 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
536 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
537 MachineLoopInfo &MLI;
538 MachinePostDominatorTree &PDT;
539 AliasAnalysis *AA = nullptr;
540 MachineFunction &MF;
541
542 struct BlockInfo {
543 std::unique_ptr<WaitcntBrackets> Incoming;
544 bool Dirty = true;
545 BlockInfo() = default;
546 BlockInfo(BlockInfo &&) = default;
547 BlockInfo &operator=(BlockInfo &&) = default;
548 ~BlockInfo();
549 };
550
551 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
552
553 bool ForceEmitWaitcnt[AMDGPU::NUM_INST_CNTS] = {};
554
555 std::unique_ptr<WaitcntGenerator> WCG;
556
557 // Remember call and return instructions in the function.
558 DenseSet<MachineInstr *> CallInsts;
559 DenseSet<MachineInstr *> ReturnInsts;
560
561 // Remember all S_ENDPGM instructions. The boolean flag is true if there might
562 // be outstanding stores but definitely no outstanding scratch stores, to help
563 // with insertion of DEALLOC_VGPRS messages.
564 DenseMap<MachineInstr *, bool> EndPgmInsts;
565
566 AMDGPU::HardwareLimits Limits;
567
568public:
569 const GCNSubtarget &ST;
570 const SIInstrInfo &TII;
571 const SIRegisterInfo &TRI;
572 const MachineRegisterInfo &MRI;
573 AMDGPU::InstCounterType SmemAccessCounter;
574 AMDGPU::InstCounterType MaxCounter;
575 bool IsExpertMode = false;
576
577 SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
578 AliasAnalysis *AA, MachineFunction &MF)
579 : MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
580 TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
581 MRI(MF.getRegInfo()) {
582 (void)ForceExpCounter;
583 (void)ForceLgkmCounter;
584 (void)ForceVMCounter;
585 }
586
587 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
588
589 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
590 const WaitcntBrackets &Brackets);
591 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
592 const WaitcntBrackets &ScoreBrackets);
593 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
594 bool isDSRead(const MachineInstr &MI) const;
595 bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
596 bool run();
597
598 void setForceEmitWaitcnt() {
599// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
600// For debug builds, get the debug counter info and adjust if need be
601#ifndef NDEBUG
602 if (DebugCounter::isCounterSet(ForceExpCounter) &&
603 DebugCounter::shouldExecute(ForceExpCounter)) {
604 ForceEmitWaitcnt[AMDGPU::EXP_CNT] = true;
605 } else {
606 ForceEmitWaitcnt[AMDGPU::EXP_CNT] = false;
607 }
608
609 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
610 DebugCounter::shouldExecute(ForceLgkmCounter)) {
611 ForceEmitWaitcnt[AMDGPU::DS_CNT] = true;
612 ForceEmitWaitcnt[AMDGPU::KM_CNT] = true;
613 } else {
614 ForceEmitWaitcnt[AMDGPU::DS_CNT] = false;
615 ForceEmitWaitcnt[AMDGPU::KM_CNT] = false;
616 }
617
618 if (DebugCounter::isCounterSet(ForceVMCounter) &&
619 DebugCounter::shouldExecute(ForceVMCounter)) {
620 ForceEmitWaitcnt[AMDGPU::LOAD_CNT] = true;
621 ForceEmitWaitcnt[AMDGPU::SAMPLE_CNT] = true;
622 ForceEmitWaitcnt[AMDGPU::BVH_CNT] = true;
623 } else {
624 ForceEmitWaitcnt[AMDGPU::LOAD_CNT] = false;
625 ForceEmitWaitcnt[AMDGPU::SAMPLE_CNT] = false;
626 ForceEmitWaitcnt[AMDGPU::BVH_CNT] = false;
627 }
628
629 ForceEmitWaitcnt[AMDGPU::VA_VDST] = false;
630 ForceEmitWaitcnt[AMDGPU::VM_VSRC] = false;
631#endif // NDEBUG
632 }
633
634 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
635 // instruction.
636 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
637 switch (Inst.getOpcode()) {
638 // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
639 case AMDGPU::GLOBAL_INV:
640 return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
641 // VGPRs
642 case AMDGPU::GLOBAL_WB:
643 case AMDGPU::GLOBAL_WBINV:
644 return VMEM_WRITE_ACCESS; // tracked using storecnt
645 default:
646 break;
647 }
648
649 // Maps VMEM access types to their corresponding WaitEventType.
650 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
651 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
652
654 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
655 // these should use VM_CNT.
656 if (!ST.hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
657 return VMEM_ACCESS;
658 if (Inst.mayStore() &&
659 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
660 if (TII.mayAccessScratch(Inst))
661 return SCRATCH_WRITE_ACCESS;
662 return VMEM_WRITE_ACCESS;
663 }
664 if (!ST.hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
665 return VMEM_ACCESS;
666 return VmemReadMapping[getVmemType(Inst)];
667 }
668
669 std::optional<WaitEventType>
670 getExpertSchedulingEventType(const MachineInstr &Inst) const;
671
672 bool isAsync(const MachineInstr &MI) const {
674 return false;
676 return true;
677 const MachineOperand *Async =
678 TII.getNamedOperand(MI, AMDGPU::OpName::IsAsync);
679 return Async && (Async->getImm());
680 }
681
682 bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
683 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
684 }
685
686 bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
687 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
688 }
689
690 bool shouldUpdateAsyncMark(const MachineInstr &MI,
693 return T == AMDGPU::TENSOR_CNT;
694 if (!isAsyncLdsDmaWrite(MI))
695 return false;
697 return T == AMDGPU::ASYNC_CNT;
698 return T == AMDGPU::LOAD_CNT;
699 }
700
701 bool isVmemAccess(const MachineInstr &MI) const;
702 bool generateWaitcntInstBefore(MachineInstr &MI,
703 WaitcntBrackets &ScoreBrackets,
704 MachineInstr *OldWaitcntInstr,
705 PreheaderFlushFlags FlushFlags);
706 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
708 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
709 MachineInstr *OldWaitcntInstr);
710 /// \returns all events that correspond to \p Inst.
711 WaitEventSet getEventsFor(const MachineInstr &Inst) const;
712 void updateEventWaitcntAfter(MachineInstr &Inst,
713 WaitcntBrackets *ScoreBrackets);
714 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
715 MachineBasicBlock *Block) const;
716 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
717 WaitcntBrackets &ScoreBrackets);
718 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
719 WaitcntBrackets &ScoreBrackets);
720 /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
721 /// Legalizer. Returns true if block was modified.
722 bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
723 void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
724 bool ExpertMode) const;
725 const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const {
726 return WCG->getWaitEvents(T);
727 }
728 AMDGPU::InstCounterType getCounterFromEvent(WaitEventType E) const {
729 return WCG->getCounterFromEvent(E);
730 }
731};
732
733// This object maintains the current score brackets of each wait counter, and
734// a per-register scoreboard for each wait counter.
735//
736// We also maintain the latest score for every event type that can change the
737// waitcnt in order to know if there are multiple types of events within
738// the brackets. When multiple types of event happen in the bracket,
739// wait count may get decreased out of order, therefore we need to put in
740// "s_waitcnt 0" before use.
741class WaitcntBrackets {
742public:
743 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
744 assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
745 }
746
747#ifndef NDEBUG
748 ~WaitcntBrackets() {
749 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
750 for (auto &[ID, Val] : VMem) {
751 if (Val.empty())
752 ++NumUnusedVmem;
753 }
754 for (auto &[ID, Val] : SGPRs) {
755 if (Val.empty())
756 ++NumUnusedSGPRs;
757 }
758
759 if (NumUnusedVmem || NumUnusedSGPRs) {
760 errs() << "WaitcntBracket had unused entries at destruction time: "
761 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
762 << " SGPR unused entries\n";
763 std::abort();
764 }
765 }
766#endif
767
768 bool isSmemCounter(AMDGPU::InstCounterType T) const {
769 return T == Context->SmemAccessCounter || T == AMDGPU::X_CNT;
770 }
771
772 unsigned getOutstanding(AMDGPU::InstCounterType T) const {
773 return ScoreUBs[T] - ScoreLBs[T];
774 }
775
776 bool hasPendingVMEM(VMEMID ID, AMDGPU::InstCounterType T) const {
777 return getVMemScore(ID, T) > getScoreLB(T);
778 }
779
780 /// \Return true if we have no score entries for counter \p T.
781 bool empty(AMDGPU::InstCounterType T) const { return getScoreRange(T) == 0; }
782
783private:
784 unsigned getScoreLB(AMDGPU::InstCounterType T) const {
786 return ScoreLBs[T];
787 }
788
789 unsigned getScoreUB(AMDGPU::InstCounterType T) const {
791 return ScoreUBs[T];
792 }
793
794 unsigned getScoreRange(AMDGPU::InstCounterType T) const {
795 return getScoreUB(T) - getScoreLB(T);
796 }
797
798 unsigned getSGPRScore(MCRegUnit RU, AMDGPU::InstCounterType T) const {
799 auto It = SGPRs.find(RU);
800 return It != SGPRs.end() ? It->second.get(T) : 0;
801 }
802
803 unsigned getVMemScore(VMEMID TID, AMDGPU::InstCounterType T) const {
804 auto It = VMem.find(TID);
805 return It != VMem.end() ? It->second.Scores[T] : 0;
806 }
807
808public:
809 bool merge(const WaitcntBrackets &Other);
810
811 bool counterOutOfOrder(AMDGPU::InstCounterType T) const;
812 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
813 simplifyWaitcnt(Wait, Wait);
814 }
815 void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
816 AMDGPU::Waitcnt &UpdateWait) const;
817 void simplifyWaitcnt(AMDGPU::InstCounterType T, unsigned &Count) const;
818 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T) const;
819 void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
820 AMDGPU::Waitcnt &UpdateWait) const;
821 void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
822 AMDGPU::Waitcnt &UpdateWait) const;
823
824 void determineWaitForPhysReg(AMDGPU::InstCounterType T, MCPhysReg Reg,
825 AMDGPU::Waitcnt &Wait,
826 const MachineInstr &MI) const;
827 MCPhysReg determineVGPR16Dependency(const MachineInstr &MI,
829 MCPhysReg Reg) const;
830 void determineWaitForLDSDMA(AMDGPU::InstCounterType T, VMEMID TID,
831 AMDGPU::Waitcnt &Wait) const;
832 AMDGPU::Waitcnt determineAsyncWait(unsigned N);
833 void tryClearSCCWriteEvent(MachineInstr *Inst);
834
835 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
836 void applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count);
837 void applyWaitcnt(const AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T);
838 void updateByEvent(WaitEventType E, MachineInstr &MI);
839 void recordAsyncMark(MachineInstr &MI);
840
841 bool hasPendingEvent() const { return !PendingEvents.empty(); }
842 bool hasPendingEvent(WaitEventType E) const {
843 return PendingEvents.contains(E);
844 }
845 bool hasPendingEvent(AMDGPU::InstCounterType T) const {
846 bool HasPending = PendingEvents & Context->getWaitEvents(T);
847 assert(HasPending == !empty(T) &&
848 "Expected pending events iff scoreboard is not empty");
849 return HasPending;
850 }
851
852 bool hasMixedPendingEvents(AMDGPU::InstCounterType T) const {
853 WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
854 // Return true if more than one bit is set in Events.
855 return Events.twoOrMore();
856 }
857
858 bool hasPendingFlat() const {
859 return ((LastFlatDsCnt > ScoreLBs[AMDGPU::DS_CNT] &&
860 LastFlatDsCnt <= ScoreUBs[AMDGPU::DS_CNT]) ||
861 (LastFlatLoadCnt > ScoreLBs[AMDGPU::LOAD_CNT] &&
862 LastFlatLoadCnt <= ScoreUBs[AMDGPU::LOAD_CNT]));
863 }
864
865 void setPendingFlat() {
866 LastFlatLoadCnt = ScoreUBs[AMDGPU::LOAD_CNT];
867 LastFlatDsCnt = ScoreUBs[AMDGPU::DS_CNT];
868 }
869
870 bool hasPendingGDS() const {
871 return LastGDS > ScoreLBs[AMDGPU::DS_CNT] &&
872 LastGDS <= ScoreUBs[AMDGPU::DS_CNT];
873 }
874
875 unsigned getPendingGDSWait() const {
876 return std::min(getScoreUB(AMDGPU::DS_CNT) - LastGDS,
877 getWaitCountMax(Context->getLimits(), AMDGPU::DS_CNT) - 1);
878 }
879
880 void setPendingGDS() { LastGDS = ScoreUBs[AMDGPU::DS_CNT]; }
881
882 // Return true if there might be pending writes to the vgpr-interval by VMEM
883 // instructions with types different from V.
884 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
885 for (MCRegUnit RU : regunits(Reg)) {
886 auto It = VMem.find(toVMEMID(RU));
887 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
888 return true;
889 }
890 return false;
891 }
892
893 void clearVgprVmemTypes(MCPhysReg Reg) {
894 for (MCRegUnit RU : regunits(Reg)) {
895 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
896 It->second.VMEMTypes = 0;
897 if (It->second.empty())
898 VMem.erase(It);
899 }
900 }
901 }
902
903 void setStateOnFunctionEntryOrReturn() {
904 setScoreUB(AMDGPU::STORE_CNT,
905 getScoreUB(AMDGPU::STORE_CNT) +
906 getWaitCountMax(Context->getLimits(), AMDGPU::STORE_CNT));
907 PendingEvents |= Context->getWaitEvents(AMDGPU::STORE_CNT);
908 }
909
910 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
911 return LDSDMAStores;
912 }
913
914 bool hasPointSampleAccel(const MachineInstr &MI) const;
915 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
916 MCPhysReg RU) const;
917
// Write a human-readable description of the scoreboard state to the stream.
918 void print(raw_ostream &) const;
// Print the scoreboard state to the debug stream.
919 void dump() const { print(dbgs()); }
920
921 // Free up memory by removing empty entries from the DenseMaps that track
922 // event scores.
923 void purgeEmptyTrackingData();
924
925private:
// Per-counter parameters handed to mergeScore()/mergeAsyncMarks().
// NOTE(review): presumably the two lower bounds being merged and the shift
// applied to each side's scores; confirm against the merge code.
struct MergeInfo {
  unsigned OldLB;      // Lower bound of this bracket (assumed: before merging).
  unsigned OtherLB;    // Lower bound of the other bracket.
  unsigned MyShift;    // Adjustment for scores from this bracket.
  unsigned OtherShift; // Adjustment for scores from the other bracket.
};
932
// One unsigned value per instruction counter type.
933 using CounterValueArray = std::array<unsigned, AMDGPU::NUM_INST_CNTS>;
934
// Add to Wait whatever wait on counter T is needed for Score to have
// completed; does nothing when Score lies outside T's current bracket.
935 void determineWaitForScore(AMDGPU::InstCounterType T, unsigned Score,
936 AMDGPU::Waitcnt &Wait) const;
937
// Merge helpers; defined out of line.
// NOTE(review): presumably mergeScore folds OtherScore into Score and reports
// whether Score changed; confirm against the definition.
938 static bool mergeScore(const MergeInfo &M, unsigned &Score,
939 unsigned OtherScore);
940 bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
941 ArrayRef<CounterValueArray> OtherMarks);
942
944 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
945 if (!Context->TRI.isInAllocatableClass(Reg))
946 return {{}, {}};
947 return Context->TRI.regunits(Reg);
948 }
949
950 void setScoreLB(AMDGPU::InstCounterType T, unsigned Val) {
952 ScoreLBs[T] = Val;
953 }
954
955 void setScoreUB(AMDGPU::InstCounterType T, unsigned Val) {
957 ScoreUBs[T] = Val;
958
959 if (T != AMDGPU::EXP_CNT)
960 return;
961
962 if (getScoreRange(AMDGPU::EXP_CNT) >
963 getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT))
964 ScoreLBs[AMDGPU::EXP_CNT] =
965 ScoreUBs[AMDGPU::EXP_CNT] -
966 getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT);
967 }
968
969 void setRegScore(MCPhysReg Reg, AMDGPU::InstCounterType T, unsigned Val) {
970 const SIRegisterInfo &TRI = Context->TRI;
971 if (Reg == AMDGPU::SCC) {
972 SCCScore = Val;
973 } else if (TRI.isVectorRegister(Context->MRI, Reg)) {
974 for (MCRegUnit RU : regunits(Reg))
975 VMem[toVMEMID(RU)].Scores[T] = Val;
976 } else if (TRI.isSGPRReg(Context->MRI, Reg)) {
977 for (MCRegUnit RU : regunits(Reg))
978 SGPRs[RU].get(T) = Val;
979 } else {
980 llvm_unreachable("Register cannot be tracked/unknown register!");
981 }
982 }
983
984 void setVMemScore(VMEMID TID, AMDGPU::InstCounterType T, unsigned Val) {
985 VMem[TID].Scores[T] = Val;
986 }
987
// Record score Val for counter CntTy on the register named by operand Op.
988 void setScoreByOperand(const MachineOperand &Op,
989 AMDGPU::InstCounterType CntTy, unsigned Val);
990
// The owning pass; gives access to the subtarget, register info and limits.
991 const SIInsertWaitcnts *Context;
992
// Per-counter score bracket: scores in (ScoreLBs[T], ScoreUBs[T]] are treated
// as still outstanding for counter T.
993 unsigned ScoreLBs[AMDGPU::NUM_INST_CNTS] = {0};
994 unsigned ScoreUBs[AMDGPU::NUM_INST_CNTS] = {0};
// Set of event types that currently have operations in flight.
995 WaitEventSet PendingEvents;
996 // Remember the last flat memory operation.
997 unsigned LastFlatDsCnt = 0;
998 unsigned LastFlatLoadCnt = 0;
999 // Remember the last GDS operation.
1000 unsigned LastGDS = 0;
1001
1002 // The score tracking logic is fragmented as follows:
1003 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
1004 // - SGPRs: SGPR RegUnits
1005 // - SCC: Non-allocatable and not general purpose: not a SGPR.
1006 //
1007 // For the VMem case, if the key is within the range of LDS DMA IDs,
1008 // then the corresponding index into the `LDSDMAStores` vector below is:
1009 // Key - LDSDMA_BEGIN - 1
1010 // This is because LDSDMA_BEGIN is a generic entry and does not have an
1011 // associated MachineInstr.
1012 //
1013 // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?
1014
1015 struct VMEMInfo {
1016 // Scores for all instruction counters. Zero-initialized.
1017 CounterValueArray Scores{};
1018 // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
1019 unsigned VMEMTypes = 0;
1020
1021 bool empty() const { return all_of(Scores, equal_to(0)) && !VMEMTypes; }
1022 };
1023
1024 /// Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
1025 /// pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
1026 class SGPRInfo {
1027 /// Either DS_CNT or KM_CNT score.
1028 unsigned ScoreDsKmCnt = 0;
1029 unsigned ScoreXCnt = 0;
1030
1031 public:
1032 unsigned get(AMDGPU::InstCounterType T) const {
1033 assert(
1034 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
1035 "Invalid counter");
1036 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
1037 }
1038 unsigned &get(AMDGPU::InstCounterType T) {
1039 assert(
1040 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
1041 "Invalid counter");
1042 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
1043 }
1044
1045 bool empty() const { return !ScoreDsKmCnt && !ScoreXCnt; }
1046 };
1047
1048 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
1049 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
1050
1051 // Reg score for SCC.
1052 unsigned SCCScore = 0;
1053 // The unique instruction that has an SCC write pending, if there is one.
1054 const MachineInstr *PendingSCCWrite = nullptr;
1055
1056 // Store representative LDS DMA operations. The only useful info here is
1057 // alias info. One store is kept per unique AAInfo.
1058 SmallVector<const MachineInstr *> LDSDMAStores;
1059
1060 // State of all counters at each async mark encountered so far.
1062
1063 // But in the rare pathological case, a nest of loops that pushes marks
1064 // without waiting on any mark can cause AsyncMarks to grow very large. We cap
1065 // it to a reasonable limit. We can tune this later or potentially introduce a
1066 // user option to control the value.
1067 static constexpr unsigned MaxAsyncMarks = 16;
1068
1069 // Track the upper bound score for async operations that are not part of a
1070 // mark yet. Initialized to all zeros.
1071 CounterValueArray AsyncScore{};
1072};
1073
// Out-of-line defaulted destructor for SIInsertWaitcnts::BlockInfo.
1074SIInsertWaitcnts::BlockInfo::~BlockInfo() = default;
1075
1076class SIInsertWaitcntsLegacy : public MachineFunctionPass {
1077public:
1078 static char ID;
1079 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
1080
1081 bool runOnMachineFunction(MachineFunction &MF) override;
1082
1083 StringRef getPassName() const override {
1084 return "SI insert wait instructions";
1085 }
1086
1087 void getAnalysisUsage(AnalysisUsage &AU) const override {
1088 AU.setPreservesCFG();
1089 AU.addRequired<MachineLoopInfoWrapperPass>();
1090 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
1091 AU.addUsedIfAvailable<AAResultsWrapperPass>();
1092 AU.addPreserved<AAResultsWrapperPass>();
1094 }
1095};
1096
1097} // end anonymous namespace
1098
1099void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
1101 unsigned Score) {
1102 setRegScore(Op.getReg().asMCReg(), CntTy, Score);
1103}
1104
1105// Return true if the subtarget is one that enables Point Sample Acceleration
1106// and the MachineInstr passed in is one to which it might be applied (the
1107// hardware makes this decision based on several factors, but we can't determine
1108// this at compile time, so we have to assume it might be applied if the
1109// instruction supports it).
1110bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
1111 if (!Context->ST.hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
1112 return false;
1113
1114 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
1115 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
1117 return BaseInfo->PointSampleAccel;
1118}
1119
1120// Return true if the subtarget enables Point Sample Acceleration, the supplied
1121// MachineInstr is one to which it might be applied and the supplied interval is
1122// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
1123// (this is the type that a point sample accelerated instruction effectively
1124// becomes)
1125bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
1126 MCPhysReg Reg) const {
1127 if (!hasPointSampleAccel(MI))
1128 return false;
1129
1130 return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
1131}
1132
// Record that Inst issued event E: advance the upper-bound score of the
// counter that E maps to, mark E pending, and stamp the new score on the
// registers (or LDS DMA slots) selected by the per-counter rules below.
1133void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
1134 AMDGPU::InstCounterType T = Context->getCounterFromEvent(E);
1135 assert(T < Context->MaxCounter);
1136
1137 unsigned UB = getScoreUB(T);
1138 unsigned Increment = 1;
// NOTE(review): the line opening this condition (original line 1139) is
// missing from this listing; restore it from the upstream source.
1140 Context->ST.hasVOP3PX2IncrementsVaVdstTwice()) {
1141 // V_WMMA_SCALE instructions use VOP3PX2 encoding. Hardware treats this as
1142 // two VOP3P instructions and increments VA_VDST twice.
1143 Increment = 2;
1144 }
1145 unsigned CurrScore = UB + Increment;
// NOTE(review): with Increment == 2 a wrapped score can be 1 rather than 0,
// which this check does not catch; confirm whether `CurrScore < UB` is what
// is intended.
1146 if (CurrScore == 0)
1147 report_fatal_error("InsertWaitcnt score wraparound");
1148 // PendingEvents and ScoreUB need to be updated regardless of whether this
1149 // event changes the score of a register or not.
1150 // Examples include vm_cnt for a buffer-store or lgkm_cnt for a send-message.
1151 PendingEvents.insert(E);
1152 setScoreUB(T, CurrScore);
1153
1154 const SIRegisterInfo &TRI = Context->TRI;
1155 const MachineRegisterInfo &MRI = Context->MRI;
1156 const SIInstrInfo &TII = Context->TII;
1157
1158 if (T == AMDGPU::EXP_CNT) {
1159 // Put score on the source vgprs. If this is a store, just use those
1160 // specific register(s).
1161 if (TII.isDS(Inst) && Inst.mayLoadOrStore()) {
1162 // All GDS operations must protect their address register (same as
1163 // export.)
1164 if (const auto *AddrOp = TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
1165 setScoreByOperand(*AddrOp, AMDGPU::EXP_CNT, CurrScore);
1166
1167 if (Inst.mayStore()) {
1168 if (const auto *Data0 =
1169 TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
1170 setScoreByOperand(*Data0, AMDGPU::EXP_CNT, CurrScore);
1171 if (const auto *Data1 =
1172 TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
1173 setScoreByOperand(*Data1, AMDGPU::EXP_CNT, CurrScore);
1174 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
1175 Inst.getOpcode() != AMDGPU::DS_APPEND &&
1176 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
1177 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
1178 for (const MachineOperand &Op : Inst.all_uses()) {
1179 if (TRI.isVectorRegister(MRI, Op.getReg()))
1180 setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
1181 }
1182 }
1183 } else if (TII.isFLAT(Inst)) {
// NOTE(review): both arms below perform the same call; they could be folded
// into a single `mayStore() || isAtomicRet()` condition.
1184 if (Inst.mayStore()) {
1185 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1186 AMDGPU::EXP_CNT, CurrScore);
1187 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1188 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1189 AMDGPU::EXP_CNT, CurrScore);
1190 }
1191 } else if (TII.isMIMG(Inst)) {
1192 if (Inst.mayStore()) {
1193 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
1194 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1195 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1196 AMDGPU::EXP_CNT, CurrScore);
1197 }
1198 } else if (TII.isMTBUF(Inst)) {
1199 if (Inst.mayStore())
1200 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
1201 } else if (TII.isMUBUF(Inst)) {
1202 if (Inst.mayStore()) {
1203 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
1204 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1205 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1206 AMDGPU::EXP_CNT, CurrScore);
1207 }
1208 } else if (TII.isLDSDIR(Inst)) {
1209 // LDSDIR instructions attach the score to the destination.
1210 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
1211 AMDGPU::EXP_CNT, CurrScore);
1212 } else {
1213 if (TII.isEXP(Inst)) {
1214 // For export the destination registers are really temps that
1215 // can be used as the actual source after export patching, so
1216 // we need to treat them like sources and set the EXP_CNT
1217 // score.
1218 for (MachineOperand &DefMO : Inst.all_defs()) {
1219 if (TRI.isVGPR(MRI, DefMO.getReg())) {
1220 setScoreByOperand(DefMO, AMDGPU::EXP_CNT, CurrScore);
1221 }
1222 }
1223 }
1224 for (const MachineOperand &Op : Inst.all_uses()) {
1225 if (TRI.isVectorRegister(MRI, Op.getReg()))
1226 setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
1227 }
1228 }
1229 } else if (T == AMDGPU::X_CNT) {
1230 WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1231 if (PendingEvents.contains(OtherEvent)) {
1232 // Hardware inserts an implicit xcnt between interleaved
1233 // SMEM and VMEM operations. So there will never be
1234 // outstanding address translations for both SMEM and
1235 // VMEM at the same time.
1236 setScoreLB(T, getScoreUB(T) - 1);
1237 PendingEvents.remove(OtherEvent);
1238 }
1239 for (const MachineOperand &Op : Inst.all_uses())
1240 setScoreByOperand(Op, T, CurrScore);
1241 } else if (T == AMDGPU::VA_VDST || T == AMDGPU::VM_VSRC) {
1242 // Match the score to the VGPR destination or source registers as
1243 // appropriate: defs for VA_VDST, uses for VM_VSRC.
1244 for (const MachineOperand &Op : Inst.operands()) {
1245 if (!Op.isReg() || (T == AMDGPU::VA_VDST && Op.isUse()) ||
1246 (T == AMDGPU::VM_VSRC && Op.isDef()))
1247 continue;
1248 if (TRI.isVectorRegister(Context->MRI, Op.getReg()))
1249 setScoreByOperand(Op, T, CurrScore);
1250 }
1251 } else /* every counter not handled by the branches above */ {
1252 // Match the score to the destination registers.
1253 //
1254 // Check only explicit operands. Stores, especially spill stores, include
1255 // implicit uses and defs of their super registers which would create an
1256 // artificial dependency, while these are there only for register liveness
1257 // accounting purposes.
1258 //
1259 // Special cases where implicit register defs exists, such as M0 or VCC,
1260 // but none with memory instructions.
1261 for (const MachineOperand &Op : Inst.defs()) {
1262 if (T == AMDGPU::LOAD_CNT || T == AMDGPU::SAMPLE_CNT ||
1263 T == AMDGPU::BVH_CNT) {
1264 if (!TRI.isVectorRegister(MRI, Op.getReg())) // TODO: add wrapper
1265 continue;
1266 if (updateVMCntOnly(Inst)) {
1267 // updateVMCntOnly should only leave us with VGPRs
1268 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1269 // defs. That's required for a sane index into the per-unit VMEMTypes
// masks updated below.
1270 assert(TRI.isVectorRegister(MRI, Op.getReg()));
1271 VmemType V = getVmemType(Inst);
1272 unsigned char TypesMask = 1 << V;
1273 // If instruction can have Point Sample Accel applied, we have to flag
1274 // this with another potential dependency
1275 if (hasPointSampleAccel(Inst))
1276 TypesMask |= 1 << VMEM_NOSAMPLER;
1277 for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
1278 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1279 }
1280 }
1281 setScoreByOperand(Op, T, CurrScore);
1282 }
1283 if (Inst.mayStore() &&
1284 (TII.isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) {
1285 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1286 // written can be accessed. A load from LDS to VMEM does not need a wait.
1287 //
1288 // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1289 // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1290 // store. The "Slot" is the index into LDSDMAStores + 1.
1291 unsigned Slot = 0;
1292 for (const auto *MemOp : Inst.memoperands()) {
1293 if (!MemOp->isStore() ||
1294 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1295 continue;
1296 // Comparing just AA info does not guarantee memoperands are equal
1297 // in general, but this is so for LDS DMA in practice.
1298 auto AAI = MemOp->getAAInfo();
1299 // Alias scope information gives a way to definitely identify an
1300 // original memory object and practically produced in the module LDS
1301 // lowering pass. If there is no scope available we will not be able
1302 // to disambiguate LDS aliasing as after the module lowering all LDS
1303 // is squashed into a single big object.
1304 if (!AAI || !AAI.Scope)
1305 break;
// Look for an already recorded store with identical AA info.
// NOTE(review): the loop bound `E` shadows the event parameter E, and the
// inner `MemOp` shadows the outer loop variable of the same name.
1306 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1307 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1308 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1309 Slot = I + 1;
1310 break;
1311 }
1312 }
1313 }
1314 if (Slot)
1315 break;
1316 // The slot may not be valid because it can be >= NUM_LDSDMA which
1317 // means the scoreboard cannot track it. We still want to preserve the
1318 // MI in order to check alias information, though.
1319 LDSDMAStores.push_back(&Inst);
1320 Slot = LDSDMAStores.size();
1321 break;
1322 }
// The generic LDSDMA_BEGIN entry is always updated; the specific slot only
// when it is within the trackable range.
1323 setVMemScore(LDSDMA_BEGIN, T, CurrScore);
1324 if (Slot && Slot < NUM_LDSDMA)
1325 setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
1326 }
1327
1328 if (Context->shouldUpdateAsyncMark(Inst, T)) {
1329 AsyncScore[T] = CurrScore;
1330 }
1331
// NOTE(review): the condition opening this block (original line 1332) is
// missing from this listing; restore it from the upstream source.
1333 setRegScore(AMDGPU::SCC, T, CurrScore);
1334 PendingSCCWrite = &Inst;
1335 }
1336 }
1337}
1338
1339void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1340 // In the absence of loops, AsyncMarks can grow linearly with the program
1341 // until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above a
1342 // limit every time we push a new mark, but that seems like unnecessary work
1343 // in practical cases. We do separately truncate the array when processing a
1344 // loop, which should be sufficient.
1345 AsyncMarks.push_back(AsyncScore);
1346 AsyncScore = {};
1347 LLVM_DEBUG({
1348 dbgs() << "recordAsyncMark:\n" << Inst;
1349 for (const auto &Mark : AsyncMarks) {
1350 llvm::interleaveComma(Mark, dbgs());
1351 dbgs() << '\n';
1352 }
1353 });
1354}
1355
1356void WaitcntBrackets::print(raw_ostream &OS) const {
1357 const GCNSubtarget &ST = Context->ST;
1358
1359 for (auto T : inst_counter_types(Context->MaxCounter)) {
1360 unsigned SR = getScoreRange(T);
1361 switch (T) {
1362 case AMDGPU::LOAD_CNT:
1363 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1364 << SR << "):";
1365 break;
1366 case AMDGPU::DS_CNT:
1367 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1368 << SR << "):";
1369 break;
1370 case AMDGPU::EXP_CNT:
1371 OS << " EXP_CNT(" << SR << "):";
1372 break;
1373 case AMDGPU::STORE_CNT:
1374 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1375 << SR << "):";
1376 break;
1377 case AMDGPU::SAMPLE_CNT:
1378 OS << " SAMPLE_CNT(" << SR << "):";
1379 break;
1380 case AMDGPU::BVH_CNT:
1381 OS << " BVH_CNT(" << SR << "):";
1382 break;
1383 case AMDGPU::KM_CNT:
1384 OS << " KM_CNT(" << SR << "):";
1385 break;
1386 case AMDGPU::X_CNT:
1387 OS << " X_CNT(" << SR << "):";
1388 break;
1389 case AMDGPU::ASYNC_CNT:
1390 OS << " ASYNC_CNT(" << SR << "):";
1391 break;
1392 case AMDGPU::VA_VDST:
1393 OS << " VA_VDST(" << SR << "): ";
1394 break;
1395 case AMDGPU::VM_VSRC:
1396 OS << " VM_VSRC(" << SR << "): ";
1397 break;
1398 default:
1399 OS << " UNKNOWN(" << SR << "):";
1400 break;
1401 }
1402
1403 if (SR != 0) {
1404 // Print vgpr scores.
1405 unsigned LB = getScoreLB(T);
1406
1407 SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1408 sort(SortedVMEMIDs);
1409
1410 for (auto ID : SortedVMEMIDs) {
1411 unsigned RegScore = VMem.at(ID).Scores[T];
1412 if (RegScore <= LB)
1413 continue;
1414 unsigned RelScore = RegScore - LB - 1;
1415 if (ID < REGUNITS_END) {
1416 OS << ' ' << RelScore << ":vRU" << ID;
1417 } else {
1418 assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1419 "Unhandled/unexpected ID value!");
1420 OS << ' ' << RelScore << ":LDSDMA" << ID;
1421 }
1422 }
1423
1424 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1425 if (isSmemCounter(T)) {
1426 SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1427 sort(SortedSMEMIDs);
1428 for (auto ID : SortedSMEMIDs) {
1429 unsigned RegScore = SGPRs.at(ID).get(T);
1430 if (RegScore <= LB)
1431 continue;
1432 unsigned RelScore = RegScore - LB - 1;
1433 OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
1434 }
1435 }
1436
1437 if (T == AMDGPU::KM_CNT && SCCScore > 0)
1438 OS << ' ' << SCCScore << ":scc";
1439 }
1440 OS << '\n';
1441 }
1442
1443 OS << "Pending Events: ";
1444 if (hasPendingEvent()) {
1445 ListSeparator LS;
1446 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1447 if (hasPendingEvent((WaitEventType)I)) {
1448 OS << LS << WaitEventTypeName[I];
1449 }
1450 }
1451 } else {
1452 OS << "none";
1453 }
1454 OS << '\n';
1455
1456 OS << "Async score: ";
1457 if (AsyncScore.empty())
1458 OS << "none";
1459 else
1460 llvm::interleaveComma(AsyncScore, OS);
1461 OS << '\n';
1462
1463 OS << "Async marks: " << AsyncMarks.size() << '\n';
1464
1465 for (const auto &Mark : AsyncMarks) {
1466 for (auto T : AMDGPU::inst_counter_types()) {
1467 unsigned MarkedScore = Mark[T];
1468 switch (T) {
1469 case AMDGPU::LOAD_CNT:
1470 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM")
1471 << "_CNT: " << MarkedScore;
1472 break;
1473 case AMDGPU::DS_CNT:
1474 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM")
1475 << "_CNT: " << MarkedScore;
1476 break;
1477 case AMDGPU::EXP_CNT:
1478 OS << " EXP_CNT: " << MarkedScore;
1479 break;
1480 case AMDGPU::STORE_CNT:
1481 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS")
1482 << "_CNT: " << MarkedScore;
1483 break;
1484 case AMDGPU::SAMPLE_CNT:
1485 OS << " SAMPLE_CNT: " << MarkedScore;
1486 break;
1487 case AMDGPU::BVH_CNT:
1488 OS << " BVH_CNT: " << MarkedScore;
1489 break;
1490 case AMDGPU::KM_CNT:
1491 OS << " KM_CNT: " << MarkedScore;
1492 break;
1493 case AMDGPU::X_CNT:
1494 OS << " X_CNT: " << MarkedScore;
1495 break;
1496 case AMDGPU::ASYNC_CNT:
1497 OS << " ASYNC_CNT: " << MarkedScore;
1498 break;
1499 default:
1500 OS << " UNKNOWN: " << MarkedScore;
1501 break;
1502 }
1503 }
1504 OS << '\n';
1505 }
1506 OS << '\n';
1507}
1508
1509/// Simplify \p UpdateWait by removing waits that are redundant based on the
1510/// current WaitcntBrackets and any other waits specified in \p CheckWait.
1511void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
1512 AMDGPU::Waitcnt &UpdateWait) const {
1513 simplifyWaitcnt(UpdateWait, AMDGPU::LOAD_CNT);
1514 simplifyWaitcnt(UpdateWait, AMDGPU::EXP_CNT);
1515 simplifyWaitcnt(UpdateWait, AMDGPU::DS_CNT);
1516 simplifyWaitcnt(UpdateWait, AMDGPU::STORE_CNT);
1517 simplifyWaitcnt(UpdateWait, AMDGPU::SAMPLE_CNT);
1518 simplifyWaitcnt(UpdateWait, AMDGPU::BVH_CNT);
1519 simplifyWaitcnt(UpdateWait, AMDGPU::KM_CNT);
1520 simplifyXcnt(CheckWait, UpdateWait);
1521 simplifyWaitcnt(UpdateWait, AMDGPU::VA_VDST);
1522 simplifyVmVsrc(CheckWait, UpdateWait);
1523 simplifyWaitcnt(UpdateWait, AMDGPU::ASYNC_CNT);
1524}
1525
1526void WaitcntBrackets::simplifyWaitcnt(AMDGPU::InstCounterType T,
1527 unsigned &Count) const {
1528 // The number of outstanding events for this type, T, can be calculated
1529 // as (UB - LB). If the current Count is greater than or equal to the number
1530 // of outstanding events, then the wait for this counter is redundant.
1531 if (Count >= getScoreRange(T))
1532 Count = ~0u;
1533}
1534
1535void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait,
1536 AMDGPU::InstCounterType T) const {
1537 unsigned Cnt = Wait.get(T);
1538 simplifyWaitcnt(T, Cnt);
1539 Wait.set(T, Cnt);
1540}
1541
1542void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
1543 AMDGPU::Waitcnt &UpdateWait) const {
1544 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1545 // optimizations. On entry to a block with multiple predescessors, there may
1546 // be pending SMEM and VMEM events active at the same time.
1547 // In such cases, only clear one active event at a time.
1548 // TODO: Revisit xcnt optimizations for gfx1250.
1549 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1550 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1551 // zero.
1552 if (CheckWait.get(AMDGPU::KM_CNT) == 0 && hasPendingEvent(SMEM_GROUP))
1553 UpdateWait.set(AMDGPU::X_CNT, ~0u);
1554 // If we have pending store we cannot optimize XCnt because we do not wait for
1555 // stores. VMEM loads retun in order, so if we only have loads XCnt is
1556 // decremented to the same number as LOADCnt.
1557 if (CheckWait.get(AMDGPU::LOAD_CNT) != ~0u && hasPendingEvent(VMEM_GROUP) &&
1558 !hasPendingEvent(AMDGPU::STORE_CNT) &&
1559 CheckWait.get(AMDGPU::X_CNT) >= CheckWait.get(AMDGPU::LOAD_CNT))
1560 UpdateWait.set(AMDGPU::X_CNT, ~0u);
1561 simplifyWaitcnt(UpdateWait, AMDGPU::X_CNT);
1562}
1563
1564void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1565 AMDGPU::Waitcnt &UpdateWait) const {
1566 // Waiting for some counters implies waiting for VM_VSRC, since an
1567 // instruction that decrements a counter on completion would have
1568 // decremented VM_VSRC once its VGPR operands had been read.
1569 if (CheckWait.get(AMDGPU::VM_VSRC) >=
1570 std::min({CheckWait.get(AMDGPU::LOAD_CNT),
1571 CheckWait.get(AMDGPU::STORE_CNT),
1572 CheckWait.get(AMDGPU::SAMPLE_CNT),
1573 CheckWait.get(AMDGPU::BVH_CNT), CheckWait.get(AMDGPU::DS_CNT)}))
1574 UpdateWait.set(AMDGPU::VM_VSRC, ~0u);
1575 simplifyWaitcnt(UpdateWait, AMDGPU::VM_VSRC);
1576}
1577
1578void WaitcntBrackets::purgeEmptyTrackingData() {
1579 VMem.remove_if([](const auto &P) { return P.second.empty(); });
1580 SGPRs.remove_if([](const auto &P) { return P.second.empty(); });
1581}
1582
1583void WaitcntBrackets::determineWaitForScore(AMDGPU::InstCounterType T,
1584 unsigned ScoreToWait,
1585 AMDGPU::Waitcnt &Wait) const {
1586 const unsigned LB = getScoreLB(T);
1587 const unsigned UB = getScoreUB(T);
1588
1589 // If the score falls within the bracket, we need a waitcnt.
1590 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1591 if ((T == AMDGPU::LOAD_CNT || T == AMDGPU::DS_CNT) && hasPendingFlat() &&
1592 !Context->ST.hasFlatLgkmVMemCountInOrder()) {
1593 // If there is a pending FLAT operation, and this is a VMem or LGKM
1594 // waitcnt and the target can report early completion, then we need
1595 // to force a waitcnt 0.
1596 addWait(Wait, T, 0);
1597 } else if (counterOutOfOrder(T)) {
1598 // Counter can get decremented out-of-order when there
1599 // are multiple types event in the bracket. Also emit an s_wait counter
1600 // with a conservative value of 0 for the counter.
1601 addWait(Wait, T, 0);
1602 } else {
1603 // If a counter has been maxed out avoid overflow by waiting for
1604 // MAX(CounterType) - 1 instead.
1605 unsigned NeededWait = std::min(
1606 UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
1607 addWait(Wait, T, NeededWait);
1608 }
1609 }
1610}
1611
// Compute the wait needed so that at most N of the recorded async marks remain
// outstanding, and erase the mark that is waited on together with all older
// ones. Returns a default-constructed Waitcnt when N or fewer marks are
// recorded.
1612AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
1613 LLVM_DEBUG({
1614 dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
1615 << ":\n";
1616 for (const auto &Mark : AsyncMarks) {
1617 llvm::interleaveComma(Mark, dbgs());
1618 dbgs() << '\n';
1619 }
1620 });
1621
1622 if (AsyncMarks.size() == MaxAsyncMarks) {
1623 // Enforcing MaxAsyncMarks here is unnecessary work because the size of
1624 // AsyncMarks is linear when traversing straightline code. But we do
1625 // need to check if truncation may have occurred at a merge, and adjust N
1626 // to ensure that a wait is generated.
1627 LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
1628 N = std::min(N, (unsigned)MaxAsyncMarks - 1);
1629 }
1630
1631 AMDGPU::Waitcnt Wait;
1632 if (AsyncMarks.size() <= N) {
1633 LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
1634 return Wait;
1635 }
1636
// The mark to wait for is the one with exactly N newer marks after it.
1637 size_t MarkIndex = AsyncMarks.size() - N - 1;
1638 const auto &RequiredMark = AsyncMarks[MarkIndex];
// NOTE(review): the loop header that introduces T (original line 1639) is
// missing from this listing; restore it from the upstream source.
1640 determineWaitForScore(T, RequiredMark[T], Wait);
1641
1642 // Immediately remove the waited mark and all older ones.
1643 // This happens BEFORE the wait is actually inserted, which is fine
1644 // because we've already extracted the wait requirements.
1645 LLVM_DEBUG({
1646 dbgs() << "Removing " << (MarkIndex + 1)
1647 << " async marks after determining wait\n";
1648 });
1649 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1650
1651 LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
1652 return Wait;
1653}
1654
1655// With D16Write32BitVgpr, D16 inst might be clobbered by events running on the
1656// other half 16bit.
1657//
1658// Replace VGPR16 to VGPR32 for wait check if:
1659// 1. MI is a VALU, and there is a wait event on the other half
1660// 2. MI is a LdSt, and there is a wait event on the other half from different
1661// order group
1662MCPhysReg WaitcntBrackets::determineVGPR16Dependency(const MachineInstr &MI,
1664 MCPhysReg Reg) const {
1665 const TargetRegisterClass *RC = Context->TRI.getPhysRegBaseClass(Reg);
1666 unsigned Size = Context->TRI.getRegSizeInBits(*RC);
1667
1668 if (Size != 16 || !Context->ST.hasD16Writes32BitVgpr())
1669 return Reg;
1670
1671 // With D16Writes32BitVgpr, D16 Inst might clobber the whole vgpr32
1672 // check dependency on the other half
1673 Register Reg32 = Context->TRI.get32BitRegister(Reg);
1674 Register OtherHalf = Context->TRI.getSubReg(
1675 Reg32,
1676 AMDGPU::isHi16Reg(Reg, Context->TRI) ? AMDGPU::lo16 : AMDGPU::hi16);
1677
1678 AMDGPU::Waitcnt Wait;
1679 for (MCRegUnit RU : regunits(OtherHalf))
1680 determineWaitForScore(T, getVMemScore(toVMEMID(RU), T), Wait);
1681
1682 // No wait on otherhalf
1683 if (!Wait.hasWait())
1684 return Reg;
1685
1686 if (Context->TII.isVALU(MI))
1687 return Reg32;
1688
1689 // If hi/lo16 mixed events
1690 WaitEventSet MIEvents = Context->getEventsFor(MI);
1691 WaitEventSet OtherHalfEvents = Context->getWaitEvents(T);
1692 WaitEventSet Events = MIEvents & OtherHalfEvents;
1693 if (Events.twoOrMore())
1694 return Reg32;
1695 return Reg;
1696}
1697
1698void WaitcntBrackets::determineWaitForPhysReg(AMDGPU::InstCounterType T,
1699 MCPhysReg Reg,
1700 AMDGPU::Waitcnt &Wait,
1701 const MachineInstr &MI) const {
1702 if (Reg == AMDGPU::SCC) {
1703 determineWaitForScore(T, SCCScore, Wait);
1704 } else {
1705 bool IsVGPR = Context->TRI.isVectorRegister(Context->MRI, Reg);
1706 if (IsVGPR)
1707 Reg = determineVGPR16Dependency(MI, T, Reg);
1708 for (MCRegUnit RU : regunits(Reg))
1709 determineWaitForScore(
1710 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1711 Wait);
1712 }
1713}
1714
1715void WaitcntBrackets::determineWaitForLDSDMA(AMDGPU::InstCounterType T,
1716 VMEMID TID,
1717 AMDGPU::Waitcnt &Wait) const {
1718 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1719 determineWaitForScore(T, getVMemScore(TID, T), Wait);
1720}
1721
1722void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1723 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1724 // SCC has landed
1725 if (PendingSCCWrite &&
1726 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1727 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1728 WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
1729 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1730 if ((PendingEvents & Context->getWaitEvents(AMDGPU::KM_CNT)) ==
1731 SCC_WRITE_PendingEvent) {
1732 setScoreLB(AMDGPU::KM_CNT, getScoreUB(AMDGPU::KM_CNT));
1733 }
1734
1735 PendingEvents.remove(SCC_WRITE_PendingEvent);
1736 PendingSCCWrite = nullptr;
1737 }
1738}
1739
// Apply the per-counter values of Wait to this scoreboard through
// applyWaitcnt(Wait, T).
// NOTE(review): the loop header that introduces T (original line 1741) is
// missing from this listing; restore it from the upstream source.
1740void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1742 applyWaitcnt(Wait, T);
1743}
1744
1745void WaitcntBrackets::applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count) {
1746 const unsigned UB = getScoreUB(T);
1747 if (Count >= UB)
1748 return;
1749 if (Count != 0) {
1750 if (counterOutOfOrder(T))
1751 return;
1752 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1753 } else {
1754 setScoreLB(T, UB);
1755 PendingEvents.remove(Context->getWaitEvents(T));
1756 }
1757
1758 if (T == AMDGPU::KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
1759 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1760 applyWaitcnt(AMDGPU::X_CNT, 0);
1761 else
1762 PendingEvents.remove(SMEM_GROUP);
1763 }
1764 if (T == AMDGPU::LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
1765 !hasPendingEvent(AMDGPU::STORE_CNT)) {
1766 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1767 applyWaitcnt(AMDGPU::X_CNT, Count);
1768 else if (Count == 0)
1769 PendingEvents.remove(VMEM_GROUP);
1770 }
1771}
1772
// Read the count stored in \p Wait for counter `T` and apply it through the
// (InstCounterType, unsigned) overload of applyWaitcnt().
// NOTE(review): listing line 1774 is absent, so the remainder of the parameter
// list (presumably the declaration of `T`) and the opening brace are missing
// from the visible text -- confirm against upstream.
1773void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait,
1775 unsigned Cnt = Wait.get(T);
1776 applyWaitcnt(T, Cnt);
1777}
1778
1779// Where there are multiple types of event in the bracket of a counter,
1780// the decrement may go out of order.
1781bool WaitcntBrackets::counterOutOfOrder(AMDGPU::InstCounterType T) const {
1782 // Scalar memory read always can go out of order.
1783 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1784 (T == AMDGPU::X_CNT && hasPendingEvent(SMEM_GROUP)))
1785 return true;
1786
1787 // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
1788 // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
1789 // out-of-order completion.
1790 if (T == AMDGPU::LOAD_CNT) {
1791 WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
1792 // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
1793 // events
1794 Events.remove(GLOBAL_INV_ACCESS);
1795 // Return true only if there are still multiple event types after removing
1796 // GLOBAL_INV
1797 return Events.twoOrMore();
1798 }
1799
1800 return hasMixedPendingEvents(T);
1801}
1802
// Legacy pass-manager registration for SIInsertWaitcntsLegacy, the definition
// of its pass ID, the exported reference to that ID, and the tail of a
// factory that returns a newly allocated SIInsertWaitcntsLegacy.
// NOTE(review): listing lines 1805-1806, 1808 and 1814 are absent from the
// visible text. Presumably they hold the pass dependencies, the closing
// arguments of INITIALIZE_PASS_END, and the factory function's signature --
// confirm against upstream.
1803INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1804 false, false)
1807INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1809
1810char SIInsertWaitcntsLegacy::ID = 0;
1811
1812char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1813
1815 return new SIInsertWaitcntsLegacy();
1816}
1817
1818static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1819 unsigned NewEnc) {
1820 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1821 assert(OpIdx >= 0);
1822
1823 MachineOperand &MO = MI.getOperand(OpIdx);
1824
1825 if (NewEnc == MO.getImm())
1826 return false;
1827
1828 MO.setImm(NewEnc);
1829 return true;
1830}
1831
1832/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1833/// and if so, which counter it is waiting on.
1834static std::optional<AMDGPU::InstCounterType>
1835counterTypeForInstr(unsigned Opcode) {
1836 switch (Opcode) {
1837 case AMDGPU::S_WAIT_LOADCNT:
1838 return AMDGPU::LOAD_CNT;
1839 case AMDGPU::S_WAIT_EXPCNT:
1840 return AMDGPU::EXP_CNT;
1841 case AMDGPU::S_WAIT_STORECNT:
1842 return AMDGPU::STORE_CNT;
1843 case AMDGPU::S_WAIT_SAMPLECNT:
1844 return AMDGPU::SAMPLE_CNT;
1845 case AMDGPU::S_WAIT_BVHCNT:
1846 return AMDGPU::BVH_CNT;
1847 case AMDGPU::S_WAIT_DSCNT:
1848 return AMDGPU::DS_CNT;
1849 case AMDGPU::S_WAIT_KMCNT:
1850 return AMDGPU::KM_CNT;
1851 case AMDGPU::S_WAIT_XCNT:
1852 return AMDGPU::X_CNT;
1853 case AMDGPU::S_WAIT_ASYNCCNT:
1854 return AMDGPU::ASYNC_CNT;
1855 case AMDGPU::S_WAIT_TENSORCNT:
1856 return AMDGPU::TENSOR_CNT;
1857 default:
1858 return {};
1859 }
1860}
1861
1862bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1863 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1864 if (Opcode == Waitcnt->getOpcode())
1865 return false;
1866
1867 Waitcnt->setDesc(TII.get(Opcode));
1868 return true;
1869}
1870
1871/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1872/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1873/// from \p Wait that were added by previous passes. Currently this pass
1874/// conservatively assumes that these preexisting waits are required for
1875/// correctness.
1876bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1877 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1878 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1879 assert(isNormalMode(MaxCounter));
1880
1881 bool Modified = false;
1882 MachineInstr *WaitcntInstr = nullptr;
1883 MachineInstr *WaitcntVsCntInstr = nullptr;
1884
1885 LLVM_DEBUG({
1886 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1887 if (It.isEnd())
1888 dbgs() << "end of block\n";
1889 else
1890 dbgs() << *It;
1891 });
1892
1893 for (auto &II :
1894 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1895 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1896 if (isNonWaitcntMetaInst(II)) {
1897 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1898 continue;
1899 }
1900
1901 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1902 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1903
1904 // Update required wait count. If this is a soft waitcnt (= it was added
1905 // by an earlier pass), it may be entirely removed.
1906 if (Opcode == AMDGPU::S_WAITCNT) {
1907 unsigned IEnc = II.getOperand(0).getImm();
1908 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1909 if (TrySimplify)
1910 ScoreBrackets.simplifyWaitcnt(OldWait);
1911 Wait = Wait.combined(OldWait);
1912
1913 // Merge consecutive waitcnt of the same type by erasing multiples.
1914 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1915 II.eraseFromParent();
1916 Modified = true;
1917 } else
1918 WaitcntInstr = &II;
1919 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1920 assert(ST.hasVMemToLDSLoad());
1921 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1922 << "Before: " << Wait << '\n';);
1923 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, LDSDMA_BEGIN,
1924 Wait);
1925 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1926
1927 // It is possible (but unlikely) that this is the only wait instruction,
1928 // in which case, we exit this loop without a WaitcntInstr to consume
1929 // `Wait`. But that works because `Wait` was passed in by reference, and
1930 // the callee eventually calls createNewWaitcnt on it. We test this
1931 // possibility in an articial MIR test since such a situation cannot be
1932 // recreated by running the memory legalizer.
1933 II.eraseFromParent();
1934 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1935 unsigned N = II.getOperand(0).getImm();
1936 LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';);
1937 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
1938 Wait = Wait.combined(OldWait);
1939 } else {
1940 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1941 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1942
1943 unsigned OldVSCnt =
1944 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1945 if (TrySimplify)
1946 ScoreBrackets.simplifyWaitcnt(AMDGPU::STORE_CNT, OldVSCnt);
1948 std::min(Wait.get(AMDGPU::STORE_CNT), OldVSCnt));
1949
1950 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1951 II.eraseFromParent();
1952 Modified = true;
1953 } else
1954 WaitcntVsCntInstr = &II;
1955 }
1956 }
1957
1958 if (WaitcntInstr) {
1959 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1961 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1962
1963 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::LOAD_CNT);
1964 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
1965 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
1966 Wait.set(AMDGPU::LOAD_CNT, ~0u);
1967 Wait.set(AMDGPU::EXP_CNT, ~0u);
1968 Wait.set(AMDGPU::DS_CNT, ~0u);
1969
1970 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1971 << "New Instr at block end: "
1972 << *WaitcntInstr << '\n'
1973 : dbgs() << "applied pre-existing waitcnt\n"
1974 << "Old Instr: " << *It
1975 << "New Instr: " << *WaitcntInstr << '\n');
1976 }
1977
1978 if (WaitcntVsCntInstr) {
1979 Modified |=
1980 updateOperandIfDifferent(*WaitcntVsCntInstr, AMDGPU::OpName::simm16,
1981 Wait.get(AMDGPU::STORE_CNT));
1982 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1983
1984 ScoreBrackets.applyWaitcnt(AMDGPU::STORE_CNT, Wait.get(AMDGPU::STORE_CNT));
1985 Wait.set(AMDGPU::STORE_CNT, ~0u);
1986
1987 LLVM_DEBUG(It.isEnd()
1988 ? dbgs() << "applied pre-existing waitcnt\n"
1989 << "New Instr at block end: " << *WaitcntVsCntInstr
1990 << '\n'
1991 : dbgs() << "applied pre-existing waitcnt\n"
1992 << "Old Instr: " << *It
1993 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1994 }
1995
1996 return Modified;
1997}
1998
1999/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
2000/// required counters in \p Wait
///
/// New instructions are built with BuildMI(Block, It, ...), using the debug
/// location found for \p It. \p Wait is taken by value. \p ScoreBrackets is
/// only queried (counterOutOfOrder / getOutstanding) to decide whether and how
/// far a wait is expanded when ExpandWaitcntProfiling is set.
/// \returns the Modified flag, which is set on every emission path taken.
2001bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
2002 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2003 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
2004 assert(isNormalMode(MaxCounter));
2005
2006 bool Modified = false;
2007 const DebugLoc &DL = Block.findDebugLoc(It);
2008
2009 // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
2010 // single instruction while VScnt has its own instruction.
2011 if (Wait.hasWaitExceptStoreCnt()) {
2012 // If profiling expansion is enabled, emit an expanded sequence
2013 if (ExpandWaitcntProfiling) {
2014 // Check if any of the counters to be waited on are out-of-order.
2015 // If so, fall back to normal (non-expanded) behavior since expansion
2016 // would provide misleading profiling information.
2017 bool AnyOutOfOrder = false;
2018 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
2019 unsigned WaitCnt = Wait.get(CT);
// ~0u marks a counter that does not need to be waited on.
2020 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
2021 AnyOutOfOrder = true;
2022 break;
2023 }
2024 }
2025
2026 if (AnyOutOfOrder) {
2027 // Fall back to non-expanded wait
2028 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
2029 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
2030 Modified = true;
2031 } else {
2032 // All counters are in-order, safe to expand
2033 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
2034 unsigned WaitCnt = Wait.get(CT);
2035 if (WaitCnt == ~0u)
2036 continue;
2037
// The starting point of the expansion is capped at one below the counter's
// maximum value.
2038 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2039 getWaitCountMax(getLimits(), CT) - 1);
2040 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
// Each emitted S_WAITCNT waits on CT only; W starts out default-constructed.
2041 AMDGPU::Waitcnt W;
2042 W.set(CT, Count);
2043 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT))
// NOTE(review): listing line 2044 is absent here; presumably it adds the
// encoded immediate for W to the instruction -- confirm against upstream.
2045 });
2046 Modified = true;
2047 }
2048 }
2049 } else {
2050 // Normal behavior: emit single combined waitcnt
2051 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
2052 [[maybe_unused]] auto SWaitInst =
2053 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
2054 Modified = true;
2055
2056 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
2057 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2058 dbgs() << "New Instr: " << *SWaitInst << '\n');
2059 }
2060 }
2061
2062 if (Wait.hasWaitStoreCnt()) {
2063 assert(ST.hasVscnt());
2064
2065 if (ExpandWaitcntProfiling && Wait.get(AMDGPU::STORE_CNT) != ~0u &&
2066 !ScoreBrackets.counterOutOfOrder(AMDGPU::STORE_CNT)) {
2067 // Only expand if counter is not out-of-order
2068 unsigned Outstanding =
2069 std::min(ScoreBrackets.getOutstanding(AMDGPU::STORE_CNT),
2070 getWaitCountMax(getLimits(), AMDGPU::STORE_CNT) - 1);
2071 EmitExpandedWaitcnt(
2072 Outstanding, Wait.get(AMDGPU::STORE_CNT), [&](unsigned Count) {
2073 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
2074 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2075 .addImm(Count);
2076 });
2077 Modified = true;
2078 } else {
2079 [[maybe_unused]] auto SWaitInst =
2080 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
2081 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
// NOTE(review): listing line 2082 is absent here; presumably it adds the
// STORE_CNT value from Wait as the immediate -- confirm against upstream.
2083 Modified = true;
2084
2085 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
2086 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2087 dbgs() << "New Instr: " << *SWaitInst << '\n');
2088 }
2089 }
2090
2091 return Modified;
2092}
2093
2094AMDGPU::Waitcnt
2095WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
2096 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST.hasVscnt() ? 0 : ~0u);
2097}
2098
2099AMDGPU::Waitcnt
2100WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
2101 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
2102 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
2103 ~0u /* XCNT */, ~0u /* ASYNC_CNT */,
2104 ~0u /* TENSOR_CNT */, ExpertVal, ExpertVal);
2105}
2106
2107/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
2108/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
2109/// were added by previous passes. Currently this pass conservatively
2110/// assumes that these preexisting waits are required for correctness.
///
/// The first instruction of each kind is remembered and duplicates are erased
/// while their requirements are merged. Afterwards every remembered
/// instruction is either rewritten to the merged requirement (and the
/// corresponding counters in \p Wait are reset to ~0u) or erased.
/// \returns true if any instruction was erased or rewritten.
2111bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
2112 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
2113 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
2114 assert(!isNormalMode(MaxCounter));
2115
2116 bool Modified = false;
2117 MachineInstr *CombinedLoadDsCntInstr = nullptr;
2118 MachineInstr *CombinedStoreDsCntInstr = nullptr;
2119 MachineInstr *WaitcntDepctrInstr = nullptr;
// First single-counter wait seen for each counter type, indexed by counter.
2120 MachineInstr *WaitInstrs[AMDGPU::NUM_EXTENDED_INST_CNTS] = {};
2121
2122 LLVM_DEBUG({
2123 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
2124 if (It.isEnd())
2125 dbgs() << "end of block\n";
2126 else
2127 dbgs() << *It;
2128 });
2129
2130 // Accumulate waits that should not be simplified.
2131 AMDGPU::Waitcnt RequiredWait;
2132
// Early-inc iteration is required because instructions are erased while
// walking the range.
2133 for (auto &II :
2134 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
2135 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
2136 if (isNonWaitcntMetaInst(II)) {
2137 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
2138 continue;
2139 }
2140
2141 // Update required wait count. If this is a soft waitcnt (= it was added
2142 // by an earlier pass), it may be entirely removed.
2143
2144 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
2145 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
2146
2147 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
2148 // attempt to do more than that either.
2149 if (Opcode == AMDGPU::S_WAITCNT)
2150 continue;
2151
2152 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
2153 unsigned OldEnc =
2154 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2155 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
// Simplifiable waits go into Wait; all others are kept apart in RequiredWait.
2156 if (TrySimplify)
2157 Wait = Wait.combined(OldWait);
2158 else
2159 RequiredWait = RequiredWait.combined(OldWait);
2160 // Keep the first wait_loadcnt, erase the rest.
2161 if (CombinedLoadDsCntInstr == nullptr) {
2162 CombinedLoadDsCntInstr = &II;
2163 } else {
2164 II.eraseFromParent();
2165 Modified = true;
2166 }
2167 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
2168 unsigned OldEnc =
2169 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2170 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
2171 if (TrySimplify)
2172 Wait = Wait.combined(OldWait);
2173 else
2174 RequiredWait = RequiredWait.combined(OldWait);
2175 // Keep the first wait_storecnt, erase the rest.
2176 if (CombinedStoreDsCntInstr == nullptr) {
2177 CombinedStoreDsCntInstr = &II;
2178 } else {
2179 II.eraseFromParent();
2180 Modified = true;
2181 }
2182 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
2183 unsigned OldEnc =
2184 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2185 AMDGPU::Waitcnt OldWait;
// NOTE(review): listing lines 2186-2187 are absent here; presumably they
// decode the VA_VDST and VM_VSRC fields of OldEnc into OldWait -- confirm
// against upstream.
2188 if (TrySimplify)
2189 ScoreBrackets.simplifyWaitcnt(OldWait);
2190 Wait = Wait.combined(OldWait);
2191 if (WaitcntDepctrInstr == nullptr) {
2192 WaitcntDepctrInstr = &II;
2193 } else {
2194 // S_WAITCNT_DEPCTR requires special care. Don't remove a
2195 // duplicate if it is waiting on things other than VA_VDST or
2196 // VM_VSRC. If that is the case, just make sure the VA_VDST and
2197 // VM_VSRC subfields of the operand are set to the "no wait"
2198 // values.
2199
2200 unsigned Enc =
2201 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2202 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
2203 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
2204
// If the encoding still differs from the default once those two fields are
// cleared, the duplicate is kept with the cleared encoding.
2205 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
2206 Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
2207 Modified |= promoteSoftWaitCnt(&II);
2208 } else {
2209 II.eraseFromParent();
2210 Modified = true;
2211 }
2212 }
2213 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
2214 // Architectures higher than GFX10 do not have direct loads to
2215 // LDS, so no work required here yet.
2216 II.eraseFromParent();
2217 Modified = true;
2218 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
2219 // Update the Waitcnt, but don't erase the wait.asyncmark() itself. It
2220 // shows up in the assembly as a comment with the original parameter N.
2221 unsigned N = II.getOperand(0).getImm();
2222 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
2223 Wait = Wait.combined(OldWait);
2224 } else {
// Every remaining opcode must be one of the single-counter S_WAIT_*CNT
// instructions recognised by counterTypeForInstr().
2225 std::optional<AMDGPU::InstCounterType> CT = counterTypeForInstr(Opcode);
2226 assert(CT.has_value());
2227 unsigned OldCnt =
2228 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2229 if (TrySimplify)
2230 addWait(Wait, CT.value(), OldCnt);
2231 else
2232 addWait(RequiredWait, CT.value(), OldCnt);
2233 // Keep the first wait of its kind, erase the rest.
2234 if (WaitInstrs[CT.value()] == nullptr) {
2235 WaitInstrs[CT.value()] = &II;
2236 } else {
2237 II.eraseFromParent();
2238 Modified = true;
2239 }
2240 }
2241 }
2242
// Simplify Wait with the combined requirement as the first argument, then
// fold the non-simplifiable waits back in.
// NOTE(review): the exact contract of the two-argument simplifyWaitcnt() is
// not visible here -- confirm against its definition.
2243 ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
2244 Wait = Wait.combined(RequiredWait);
2245
2246 if (CombinedLoadDsCntInstr) {
2247 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
2248 // to be waited for. Otherwise, let the instruction be deleted so
2249 // the appropriate single counter wait instruction can be inserted
2250 // instead, when new S_WAIT_*CNT instructions are inserted by
2251 // createNewWaitcnt(). As a side effect, resetting the wait counts will
2252 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
2253 // the loop below that deals with single counter instructions.
2254 //
2255 // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
2256 // instructions that have decremented LOAD_CNT or DS_CNT on completion
2257 // will have needed to wait for their register sources to be available
2258 // first.
2259 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
2260 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2261 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
2262 AMDGPU::OpName::simm16, NewEnc);
2263 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
2264 ScoreBrackets.applyWaitcnt(AMDGPU::LOAD_CNT, Wait.get(AMDGPU::LOAD_CNT));
2265 ScoreBrackets.applyWaitcnt(AMDGPU::DS_CNT, Wait.get(AMDGPU::DS_CNT));
2266 Wait.set(AMDGPU::LOAD_CNT, ~0u);
2267 Wait.set(AMDGPU::DS_CNT, ~0u);
2268
2269 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2270 << "New Instr at block end: "
2271 << *CombinedLoadDsCntInstr << '\n'
2272 : dbgs() << "applied pre-existing waitcnt\n"
2273 << "Old Instr: " << *It << "New Instr: "
2274 << *CombinedLoadDsCntInstr << '\n');
2275 } else {
2276 CombinedLoadDsCntInstr->eraseFromParent();
2277 Modified = true;
2278 }
2279 }
2280
2281 if (CombinedStoreDsCntInstr) {
2282 // Similarly for S_WAIT_STORECNT_DSCNT.
2283 if (Wait.get(AMDGPU::STORE_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
2284 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2285 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
2286 AMDGPU::OpName::simm16, NewEnc);
2287 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2288 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::STORE_CNT);
2289 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
2290 Wait.set(AMDGPU::STORE_CNT, ~0u);
2291 Wait.set(AMDGPU::DS_CNT, ~0u);
2292
2293 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2294 << "New Instr at block end: "
2295 << *CombinedStoreDsCntInstr << '\n'
2296 : dbgs() << "applied pre-existing waitcnt\n"
2297 << "Old Instr: " << *It << "New Instr: "
2298 << *CombinedStoreDsCntInstr << '\n');
2299 } else {
2300 CombinedStoreDsCntInstr->eraseFromParent();
2301 Modified = true;
2302 }
2303 }
2304
2305 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
2306 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
2307 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
2308 // instructions so that createNewWaitcnt() will create new combined
2309 // instructions to replace them.
2310
2311 if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
2312 // This is a vector of addresses in WaitInstrs pointing to instructions
2313 // that should be removed if they are present.
// NOTE(review): listing line 2314 is absent here; presumably it declares
// WaitsToErase, a container of MachineInstr** -- confirm against upstream.
2315
2316 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
2317 // both) need to be waited for, ensure that there are no existing
2318 // individual wait count instructions for these.
2319
2320 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
2321 WaitsToErase.push_back(&WaitInstrs[AMDGPU::LOAD_CNT]);
2322 WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
2323 } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
2324 WaitsToErase.push_back(&WaitInstrs[AMDGPU::STORE_CNT]);
2325 WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
2326 }
2327
2328 for (MachineInstr **WI : WaitsToErase) {
2329 if (!*WI)
2330 continue;
2331
2332 (*WI)->eraseFromParent();
// Null the slot so the single-counter pass below skips the erased instruction.
2333 *WI = nullptr;
2334 Modified = true;
2335 }
2336 }
2337
// NOTE(review): listing line 2338 is absent here; presumably it opens a loop
// over the counter types `CT` that index WaitInstrs -- confirm against
// upstream.
2339 if (!WaitInstrs[CT])
2340 continue;
2341
2342 unsigned NewCnt = Wait.get(CT);
2343 if (NewCnt != ~0u) {
2344 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
2345 AMDGPU::OpName::simm16, NewCnt);
2346 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2347
2348 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2349 setNoWait(Wait, CT);
2350
2351 LLVM_DEBUG(It.isEnd()
2352 ? dbgs() << "applied pre-existing waitcnt\n"
2353 << "New Instr at block end: " << *WaitInstrs[CT]
2354 << '\n'
2355 : dbgs() << "applied pre-existing waitcnt\n"
2356 << "Old Instr: " << *It
2357 << "New Instr: " << *WaitInstrs[CT] << '\n');
2358 } else {
// No wait is needed on this counter any more, so the instruction is removed.
2359 WaitInstrs[CT]->eraseFromParent();
2360 Modified = true;
2361 }
2362 }
2363
2364 if (WaitcntDepctrInstr) {
2365 // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
2366 // subfields with the new required values.
2367 unsigned Enc =
2368 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2369 ->getImm();
// NOTE(review): listing lines 2370-2371 are absent here; presumably they
// re-encode the VM_VSRC and VA_VDST fields of Enc from Wait -- confirm
// against upstream.
2372
2373 ScoreBrackets.applyWaitcnt(AMDGPU::VA_VDST, Wait.get(AMDGPU::VA_VDST));
2374 ScoreBrackets.applyWaitcnt(AMDGPU::VM_VSRC, Wait.get(AMDGPU::VM_VSRC));
2375 Wait.set(AMDGPU::VA_VDST, ~0u);
2376 Wait.set(AMDGPU::VM_VSRC, ~0u);
2377
2378 // If that new encoded Depctr immediate would actually still wait
2379 // for anything, update the instruction's operand. Otherwise it can
2380 // just be deleted.
// NOTE(review): unlike the duplicate-DEPCTR path in the loop above, the
// surviving instruction is not passed through promoteSoftWaitCnt() here --
// confirm this is intentional.
2381 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
2382 Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
2383 AMDGPU::OpName::simm16, Enc);
2384 LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
2385 << "New Instr at block end: "
2386 << *WaitcntDepctrInstr << '\n'
2387 : dbgs() << "applyPreexistingWaitcnt\n"
2388 << "Old Instr: " << *It << "New Instr: "
2389 << *WaitcntDepctrInstr << '\n');
2390 } else {
2391 WaitcntDepctrInstr->eraseFromParent();
2392 Modified = true;
2393 }
2394 }
2395
2396 return Modified;
2397}
2398
2399/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
///
/// New instructions are built with BuildMI(Block, It, ...), using the debug
/// location found for \p It. \p Wait is taken by value, so the resets to ~0u
/// below only affect the local copy. \p ScoreBrackets is only queried
/// (counterOutOfOrder / getOutstanding) on the ExpandWaitcntProfiling path.
/// \returns the Modified flag, which is set on every emission path taken.
2400bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2401 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2402 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
2403 assert(!isNormalMode(MaxCounter));
2404
2405 bool Modified = false;
2406 const DebugLoc &DL = Block.findDebugLoc(It);
2407
2408 // For GFX12+, we use separate wait instructions, which makes expansion
2409 // simpler
2410 if (ExpandWaitcntProfiling) {
// NOTE(review): listing line 2411 is absent here; presumably it opens a loop
// over the counter types `CT` -- confirm against upstream.
2412 unsigned Count = Wait.get(CT);
// ~0u marks a counter that does not need to be waited on.
2413 if (Count == ~0u)
2414 continue;
2415
2416 // Skip expansion for out-of-order counters - emit normal wait instead
2417 if (ScoreBrackets.counterOutOfOrder(CT)) {
2418 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2419 .addImm(Count);
2420 Modified = true;
2421 continue;
2422 }
2423
// The starting point of the expansion is capped at one below the counter's
// maximum value.
2424 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2425 getWaitCountMax(getLimits(), CT) - 1);
2426 EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
2427 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2428 .addImm(Val);
2429 });
2430 Modified = true;
2431 }
// The expansion path returns here, so the combined-instruction and DEPCTR
// handling below is not reached when ExpandWaitcntProfiling is set.
2432 return Modified;
2433 }
2434
2435 // Normal behavior (no expansion)
2436 // Check for opportunities to use combined wait instructions.
2437 if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
2438 MachineInstr *SWaitInst = nullptr;
2439
// LOAD_CNT takes precedence over STORE_CNT when pairing with DS_CNT; a
// combined instruction clears both of its counters from the local Wait.
2440 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
2441 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2442
2443 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2444 .addImm(Enc);
2445
2446 Wait.set(AMDGPU::LOAD_CNT, ~0u);
2447 Wait.set(AMDGPU::DS_CNT, ~0u);
2448 } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
2449 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2450
2451 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_STORECNT_DSCNT))
2452 .addImm(Enc);
2453
2454 Wait.set(AMDGPU::STORE_CNT, ~0u);
2455 Wait.set(AMDGPU::DS_CNT, ~0u);
2456 }
2457
2458 if (SWaitInst) {
2459 Modified = true;
2460
2461 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2462 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2463 dbgs() << "New Instr: " << *SWaitInst << '\n');
2464 }
2465 }
2466
2467 // Generate an instruction for any remaining counter that needs
2468 // waiting for.
2469
// NOTE(review): listing line 2470 is absent here; presumably it opens a loop
// over the counter types `CT` -- confirm against upstream.
2471 unsigned Count = Wait.get(CT);
2472 if (Count == ~0u)
2473 continue;
2474
2475 [[maybe_unused]] auto SWaitInst =
2476 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2477 .addImm(Count);
2478
2479 Modified = true;
2480
2481 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2482 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2483 dbgs() << "New Instr: " << *SWaitInst << '\n');
2484 }
2485
2486 if (Wait.hasWaitDepctr()) {
// A DEPCTR wait is only expected in expert mode.
2487 assert(IsExpertMode);
2488 unsigned Enc =
// NOTE(review): listing lines 2489-2490 are absent here; presumably they
// compute the DEPCTR encoding from the VA_VDST and VM_VSRC values in Wait --
// confirm against upstream.
2491
2492 [[maybe_unused]] auto SWaitInst =
2493 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);
2494
2495 Modified = true;
2496
2497 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
2498 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2499 dbgs() << "New Instr: " << *SWaitInst << '\n');
2500 }
2501
2502 return Modified;
2503}
2504
2505/// Generate s_waitcnt instruction to be placed before cur_Inst.
2506/// Instructions of a given type are returned in order,
2507/// but instructions of different types can complete out of order.
2508/// We rely on this in-order completion
2509/// and simply assign a score to the memory access instructions.
2510/// We keep track of the active "score bracket" to determine
2511/// if an access of a memory read requires an s_waitcnt
2512/// and if so what the value of each counter is.
2513/// The "score bracket" is bound by the lower bound and upper bound
2514/// scores (*_score_LB and *_score_ub respectively).
2515/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
2516/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
2517/// (GFX12+ only, where DS_CNT is a separate counter).
2518bool SIInsertWaitcnts::generateWaitcntInstBefore(
2519 MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
2520 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2521 LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
2522 setForceEmitWaitcnt();
2523
2524 assert(!isNonWaitcntMetaInst(MI));
2525
2526 AMDGPU::Waitcnt Wait;
2527 const unsigned Opc = MI.getOpcode();
2528
2529 switch (Opc) {
2530 case AMDGPU::BUFFER_WBINVL1:
2531 case AMDGPU::BUFFER_WBINVL1_SC:
2532 case AMDGPU::BUFFER_WBINVL1_VOL:
2533 case AMDGPU::BUFFER_GL0_INV:
2534 case AMDGPU::BUFFER_GL1_INV: {
2535 // FIXME: This should have already been handled by the memory legalizer.
2536 // Removing this currently doesn't affect any lit tests, but we need to
2537 // verify that nothing was relying on this. The number of buffer invalidates
2538 // being handled here should not be expanded.
2539 Wait.set(AMDGPU::LOAD_CNT, 0);
2540 break;
2541 }
2542 case AMDGPU::SI_RETURN_TO_EPILOG:
2543 case AMDGPU::SI_RETURN:
2544 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2545 case AMDGPU::S_SETPC_B64_return: {
2546 // All waits must be resolved at call return.
2547 // NOTE: this could be improved with knowledge of all call sites or
2548 // with knowledge of the called routines.
2549 ReturnInsts.insert(&MI);
2550 AMDGPU::Waitcnt AllZeroWait =
2551 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2552 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2553 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2554 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2555 // no need to wait for it at function boundaries.
2556 if (ST.hasExtendedWaitCounts() &&
2557 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2558 AllZeroWait.set(AMDGPU::LOAD_CNT, ~0u);
2559 Wait = AllZeroWait;
2560 break;
2561 }
2562 case AMDGPU::S_ENDPGM:
2563 case AMDGPU::S_ENDPGM_SAVED: {
2564 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2565 // Technically the hardware will do this on its own if we don't, but that
2566 // might cost extra cycles compared to doing it explicitly.
2567 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2568 // have to wait for outstanding VMEM stores. In this case it can be useful
2569 // to send a message to explicitly release all VGPRs before the stores have
2570 // completed, but it is only safe to do this if there are no outstanding
2571 // scratch stores.
2572 EndPgmInsts[&MI] = !ScoreBrackets.empty(AMDGPU::STORE_CNT) &&
2573 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
2574 break;
2575 }
2576 case AMDGPU::S_SENDMSG:
2577 case AMDGPU::S_SENDMSGHALT: {
2578 if (ST.hasLegacyGeometry() &&
2579 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
2581 // Resolve vm waits before gs-done.
2582 Wait.set(AMDGPU::LOAD_CNT, 0);
2583 break;
2584 }
2585 [[fallthrough]];
2586 }
2587 default: {
2588
2589 // Export & GDS instructions do not read the EXEC mask until after the
2590 // export is granted (which can occur well after the instruction is issued).
2591 // The shader program must flush all EXP operations on the export-count
2592 // before overwriting the EXEC mask.
2593 if (MI.modifiesRegister(AMDGPU::EXEC, &TRI)) {
2594 // Export and GDS are tracked individually, either may trigger a waitcnt
2595 // for EXEC.
2596 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2597 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2598 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2599 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2600 Wait.set(AMDGPU::EXP_CNT, 0);
2601 }
2602 }
2603
2604 // Wait for any pending GDS instruction to complete before any
2605 // "Always GDS" instruction.
2606 if (TII.isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
2607 addWait(Wait, AMDGPU::DS_CNT, ScoreBrackets.getPendingGDSWait());
2608
2609 if (MI.isCall()) {
2610 // The function is going to insert a wait on everything in its prolog.
2611 // This still needs to be careful if the call target is a load (e.g. a GOT
2612 // load). We also need to check WAW dependency with saved PC.
2613 CallInsts.insert(&MI);
2614 Wait = AMDGPU::Waitcnt();
2615
2616 const MachineOperand &CallAddrOp = TII.getCalleeOperand(MI);
2617 if (CallAddrOp.isReg()) {
2618 ScoreBrackets.determineWaitForPhysReg(
2619 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait, MI);
2620
2621 if (const auto *RtnAddrOp =
2622 TII.getNamedOperand(MI, AMDGPU::OpName::dst)) {
2623 ScoreBrackets.determineWaitForPhysReg(
2624 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait, MI);
2625 }
2626 }
2627 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2628 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2629 } else {
2630 // FIXME: Should not be relying on memoperands.
2631 // Look at the source operands of every instruction to see if
2632 // any of them results from a previous memory operation that affects
2633 // its current usage. If so, an s_waitcnt instruction needs to be
2634 // emitted.
2635 // If the source operand was defined by a load, add the s_waitcnt
2636 // instruction.
2637 //
2638 // Two cases are handled for destination operands:
2639 // 1) If the destination operand was defined by a load, add the s_waitcnt
2640 // instruction to guarantee the right WAW order.
2641 // 2) If a destination operand that was used by a recent export/store ins,
2642 // add s_waitcnt on exp_cnt to guarantee the WAR order.
2643
2644 for (const MachineMemOperand *Memop : MI.memoperands()) {
2645 const Value *Ptr = Memop->getValue();
2646 if (Memop->isStore()) {
2647 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2648 addWait(Wait, SmemAccessCounter, 0);
2649 if (PDT.dominates(MI.getParent(), It->second))
2650 SLoadAddresses.erase(It);
2651 }
2652 }
2653 unsigned AS = Memop->getAddrSpace();
2655 continue;
2656 // No need to wait before load from VMEM to LDS.
2657 if (TII.mayWriteLDSThroughDMA(MI))
2658 continue;
2659
2660 // LOAD_CNT is only relevant to vgpr or LDS.
2661 unsigned TID = LDSDMA_BEGIN;
2662 if (Ptr && Memop->getAAInfo()) {
2663 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2664 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2665 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2666 if ((I + 1) >= NUM_LDSDMA) {
2667 // We didn't have enough slot to track this LDS DMA store, it
2668 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2669 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID,
2670 Wait);
2671 break;
2672 }
2673
2674 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT,
2675 TID + I + 1, Wait);
2676 }
2677 }
2678 } else {
2679 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID, Wait);
2680 }
2681 if (Memop->isStore()) {
2682 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::EXP_CNT, TID, Wait);
2683 }
2684 }
2685
2686 // Loop over use and def operands.
2687 for (const MachineOperand &Op : MI.operands()) {
2688 if (!Op.isReg())
2689 continue;
2690
2691 // If the instruction does not read tied source, skip the operand.
2692 if (Op.isTied() && Op.isUse() && TII.doesNotReadTiedSource(MI))
2693 continue;
2694
2695 MCPhysReg Reg = Op.getReg().asMCReg();
2696
2697 const bool IsVGPR = TRI.isVectorRegister(MRI, Op.getReg());
2698 if (IsVGPR) {
2699 // Implicit VGPR defs and uses are never a part of the memory
2700 // instructions description and usually present to account for
2701 // super-register liveness.
2702 // TODO: Most of the other instructions also have implicit uses
2703 // for the liveness accounting only.
2704 if (Op.isImplicit() && MI.mayLoadOrStore())
2705 continue;
2706
2707 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VA_VDST, Reg, Wait, MI);
2708 if (Op.isDef())
2709 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VM_VSRC, Reg, Wait,
2710 MI);
2711 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2712 // previous write and this write are the same type of VMEM
2713 // instruction, in which case they are (in some architectures)
2714 // guaranteed to write their results in order anyway.
2715 // Additionally check instructions where Point Sample Acceleration
2716 // might be applied.
2717 if (Op.isUse() || !updateVMCntOnly(MI) ||
2718 ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
2719 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2720 !ST.hasVmemWriteVgprInOrder()) {
2721 ScoreBrackets.determineWaitForPhysReg(AMDGPU::LOAD_CNT, Reg, Wait,
2722 MI);
2723 ScoreBrackets.determineWaitForPhysReg(AMDGPU::SAMPLE_CNT, Reg, Wait,
2724 MI);
2725 ScoreBrackets.determineWaitForPhysReg(AMDGPU::BVH_CNT, Reg, Wait,
2726 MI);
2727 ScoreBrackets.clearVgprVmemTypes(Reg);
2728 }
2729
2730 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2731 ScoreBrackets.determineWaitForPhysReg(AMDGPU::EXP_CNT, Reg, Wait,
2732 MI);
2733 }
2734 ScoreBrackets.determineWaitForPhysReg(AMDGPU::DS_CNT, Reg, Wait, MI);
2735 } else if (Op.getReg() == AMDGPU::SCC) {
2736 ScoreBrackets.determineWaitForPhysReg(AMDGPU::KM_CNT, Reg, Wait, MI);
2737 } else {
2738 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait,
2739 MI);
2740 }
2741
2742 if (ST.hasWaitXcnt() && Op.isDef())
2743 ScoreBrackets.determineWaitForPhysReg(AMDGPU::X_CNT, Reg, Wait, MI);
2744 }
2745 }
2746 }
2747 }
2748
2749 // Ensure safety against exceptions from outstanding memory operations while
2750 // waiting for a barrier:
2751 //
2752 // * Some subtargets safely handle backing off the barrier in hardware
2753 // when an exception occurs.
2754 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2755 // there can be no outstanding memory operations during the wait.
2756 // * Subtargets with split barriers don't need to back off the barrier; it
2757 // is up to the trap handler to preserve the user barrier state correctly.
2758 //
2759 // In all other cases, ensure safety by ensuring that there are no outstanding
2760 // memory operations.
2761 if (Opc == AMDGPU::S_BARRIER && !ST.hasAutoWaitcntBeforeBarrier() &&
2762 !ST.hasBackOffBarrier()) {
2763 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2764 }
2765
2766 // TODO: Remove this work-around, enable the assert for Bug 457939
2767 // after fixing the scheduler. Also, the Shader Compiler code is
2768 // independent of target.
2769 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST.hasReadVCCZBug() &&
2770 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2771 Wait.set(AMDGPU::DS_CNT, 0);
2772 }
2773
2774 // Verify that the wait is actually needed.
2775 ScoreBrackets.simplifyWaitcnt(Wait);
2776
2777 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2778 // waits on VA_VDST if the instruction it would precede is not a VALU
2779 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2780 // expert scheduling mode.
2781 if (TII.isVALU(MI))
2782 Wait.set(AMDGPU::VA_VDST, ~0u);
2783
2784 // Since the translation for VMEM addresses occur in-order, we can apply the
2785 // XCnt if the current instruction is of VMEM type and has a memory
2786 // dependency with another VMEM instruction in flight.
2787 if (Wait.get(AMDGPU::X_CNT) != ~0u && isVmemAccess(MI)) {
2788 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::X_CNT);
2789 Wait.set(AMDGPU::X_CNT, ~0u);
2790 }
2791
2792 // When forcing emit, we need to skip terminators because that would break the
2793 // terminators of the MBB if we emit a waitcnt between terminators.
2794 if (ForceEmitZeroFlag && !MI.isTerminator())
2795 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2796
2797 // If we force waitcnt then update Wait accordingly.
2799 if (!ForceEmitWaitcnt[T])
2800 continue;
2801 Wait.set(T, 0);
2802 }
2803
2804 if (FlushFlags.FlushVmCnt) {
2807 Wait.set(T, 0);
2808 }
2809
2810 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
2811 Wait.set(AMDGPU::DS_CNT, 0);
2812
2813 if (ForceEmitZeroLoadFlag && Wait.get(AMDGPU::LOAD_CNT) != ~0u)
2814 Wait.set(AMDGPU::LOAD_CNT, 0);
2815
2816 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2817 OldWaitcntInstr);
2818}
2819
/// Emit or update the wait described by \p Wait at the insertion point and
/// record the result in \p ScoreBrackets.
///
/// Steps, in order:
///  1. If \p OldWaitcntInstr is non-null, hand the request to
///     WCG->applyPreexistingWaitcnt().
///  2. When an EXP_CNT wait is requested and the guarded condition holds,
///     lower the `waitexp` operand of the instruction at `It` if needed and
///     drop EXP_CNT from \p Wait.
///  3. Ask WCG->createNewWaitcnt() to materialize the remaining wait.
///  4. Apply \p Wait to \p ScoreBrackets.
///
/// \returns true if any of the steps reported a modification.
///
/// NOTE(review): this listing jumps from line 2820 to 2822 and from 2834 to
/// 2836. A parameter declaration (presumably the iterator `It` used below)
/// and the last clause of the EXP_CNT condition appear to be missing; confirm
/// against the upstream file.
2820bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2822 MachineBasicBlock &Block,
2823 WaitcntBrackets &ScoreBrackets,
2824 MachineInstr *OldWaitcntInstr) {
2825 bool Modified = false;
2826
2827 if (OldWaitcntInstr)
2828 // Try to merge the required wait with preexisting waitcnt instructions.
2829 // Also erase redundant waitcnt.
2830 Modified =
2831 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2832
2833 // ExpCnt can be merged into VINTERP.
2834 if (Wait.get(AMDGPU::EXP_CNT) != ~0u && It != Block.instr_end() &&
2836 MachineOperand *WaitExp = TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
// Only ever tighten the existing waitexp immediate, never loosen it.
2837 if (Wait.get(AMDGPU::EXP_CNT) < WaitExp->getImm()) {
2838 WaitExp->setImm(Wait.get(AMDGPU::EXP_CNT));
2839 Modified = true;
2840 }
2841 // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
2842 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
// ~0u is the "no wait requested" value tested at the top of this branch.
2843 Wait.set(AMDGPU::EXP_CNT, ~0u);
2844
2845 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2846 << "Update Instr: " << *It);
2847 }
2848
2849 if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
2850 Modified = true;
2851
2852 // Any counts that could have been applied to any existing waitcnt
2853 // instructions will have been done so, now deal with any remaining.
2854 ScoreBrackets.applyWaitcnt(Wait);
2855
2856 return Modified;
2857}
2858
/// Classify \p Inst into the wait event tracked for it in expert scheduling
/// mode.
///
/// VALU instructions always yield one of the VGPR_*_WRITE events; FLAT, DS
/// and VMEM/VIMAGE/VSAMPLE instructions yield a VGPR_*_READ event. The
/// checks are ordered, so the first matching class wins.
///
/// \returns the event type, or an empty optional when none of the classes
/// match.
///
/// NOTE(review): this listing jumps from line 2871 to 2873, so the condition
/// guarding `return VGPR_DPMACC_WRITE;` is not visible here; confirm against
/// the upstream file.
2859std::optional<WaitEventType>
2860SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
2861 if (TII.isVALU(Inst)) {
2862 // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
2863 // out-of-order with respect to each other, so each of these classes
2864 // has its own event.
2865
2866 if (TII.isXDL(Inst))
2867 return VGPR_XDL_WRITE;
2868
2869 if (TII.isTRANS(Inst))
2870 return VGPR_TRANS_WRITE;
2871
2873 return VGPR_DPMACC_WRITE;
2874
// Fallback for every VALU instruction not matched above.
2875 return VGPR_CSMACC_WRITE;
2876 }
2877
2878 // FLAT and LDS instructions may read their VGPR sources out-of-order
2879 // with respect to each other and all other VMEM instructions, so
2880 // each of these also has a separate event.
2881
2882 if (TII.isFLAT(Inst))
2883 return VGPR_FLAT_READ;
2884
2885 if (TII.isDS(Inst))
2886 return VGPR_LDS_READ;
2887
2888 if (TII.isVMEM(Inst) || TII.isVIMAGE(Inst) || TII.isVSAMPLE(Inst))
2889 return VGPR_VMEM_READ;
2890
2891 // Otherwise, no hazard.
2892
2893 return {};
2894}
2895
2896bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2897 return (TII.isFLAT(MI) && TII.mayAccessVMEMThroughFlat(MI)) ||
2898 (TII.isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2899}
2900
2901// Return true if the next instruction is S_ENDPGM, following fallthrough
2902// blocks if necessary.
2903bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2904 MachineBasicBlock *Block) const {
2905 auto BlockEnd = Block->getParent()->end();
2906 auto BlockIter = Block->getIterator();
2907
2908 while (true) {
2909 if (It.isEnd()) {
2910 if (++BlockIter != BlockEnd) {
2911 It = BlockIter->instr_begin();
2912 continue;
2913 }
2914
2915 return false;
2916 }
2917
2918 if (!It->isMetaInstruction())
2919 break;
2920
2921 It++;
2922 }
2923
2924 assert(!It.isEnd());
2925
2926 return It->getOpcode() == AMDGPU::S_ENDPGM;
2927}
2928
2929// Add a wait after an instruction if architecture requirements mandate one.
//
// Two conditions request a wait here: precise-memory mode for an instruction
// that may load or store, and "always GDS" opcodes, which force DS_CNT to 0.
// The request is simplified against \p ScoreBrackets and emitted in front of
// the instruction that follows \p Inst. For the "always GDS" case an S_NOP 0
// is additionally inserted when a wait was generated and the next real
// instruction is S_ENDPGM.
//
// Returns the result of generateWaitcnt() for the requested wait.
//
// NOTE(review): this listing jumps from line 2937 to 2940, so the remainder
// of the getAllZeroWaitcnt(...) argument is not visible here; confirm against
// the upstream file.
2930bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2931 MachineBasicBlock &Block,
2932 WaitcntBrackets &ScoreBrackets) {
2933 AMDGPU::Waitcnt Wait;
2934 bool NeedsEndPGMCheck = false;
2935
2936 if (ST.isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2937 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2939
2940 if (TII.isAlwaysGDS(Inst.getOpcode())) {
2941 Wait.set(AMDGPU::DS_CNT, 0);
2942 NeedsEndPGMCheck = true;
2943 }
2944
2945 ScoreBrackets.simplifyWaitcnt(Wait);
2946
// The wait goes in front of whatever currently follows Inst.
2947 auto SuccessorIt = std::next(Inst.getIterator());
2948 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2949 /*OldWaitcntInstr=*/nullptr);
2950
2951 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2952 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII.get(AMDGPU::S_NOP))
2953 .addImm(0);
2954 }
2955
2956 return Result;
2957}
2958
/// Compute the set of wait events that issuing \p Inst generates.
///
/// In expert mode the expert-scheduling event (if any) is added first. After
/// that exactly one branch of the instruction-class chain below contributes
/// events; the chain is ordered, so the first matching class wins.
/// Instructions matching no branch and none of the listed opcodes contribute
/// nothing further.
///
/// NOTE(review): this listing skips lines 2975, 2988 and 3008, so the
/// condition opening the FLAT branch body, part of the VMEM branch condition
/// and the condition guarding EXP_PARAM_ACCESS are not visible here; confirm
/// against the upstream file.
2959WaitEventSet SIInsertWaitcnts::getEventsFor(const MachineInstr &Inst) const {
2960 WaitEventSet Events;
2961 if (IsExpertMode) {
2962 if (const auto ET = getExpertSchedulingEventType(Inst))
2963 Events.insert(*ET);
2964 }
2965
2966 if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
// A DS instruction counts as GDS if the opcode is "always GDS" or its gds
// modifier is set; everything else is an LDS access.
2967 if (TII.isAlwaysGDS(Inst.getOpcode()) ||
2968 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2969 Events.insert(GDS_ACCESS);
2970 Events.insert(GDS_GPR_LOCK);
2971 } else {
2972 Events.insert(LDS_ACCESS);
2973 }
2974 } else if (TII.isFLAT(Inst)) {
2976 Events.insert(getVmemWaitEventType(Inst));
2977 } else {
2978 assert(Inst.mayLoadOrStore());
// A flat access may hit VMEM, LDS, or both, so both kinds of event can be
// recorded for the same instruction.
2979 if (TII.mayAccessVMEMThroughFlat(Inst)) {
2980 if (ST.hasWaitXcnt())
2981 Events.insert(VMEM_GROUP);
2982 Events.insert(getVmemWaitEventType(Inst));
2983 }
2984 if (TII.mayAccessLDSThroughFlat(Inst))
2985 Events.insert(LDS_ACCESS);
2986 }
2987 } else if (SIInstrInfo::isVMEM(Inst) &&
2989 Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) {
2990 // BUFFER_WBL2 is included here because, unlike invalidates, it has to be
2991 // followed by an "S_WAITCNT vmcnt(0)" to ensure the writeback has
2992 // completed.
2993 if (ST.hasWaitXcnt())
2994 Events.insert(VMEM_GROUP);
2995 Events.insert(getVmemWaitEventType(Inst));
2996 if (ST.vmemWriteNeedsExpWaitcnt() &&
2997 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2998 Events.insert(VMW_GPR_LOCK);
2999 }
3000 } else if (TII.isSMRD(Inst)) {
3001 if (ST.hasWaitXcnt())
3002 Events.insert(SMEM_GROUP);
3003 Events.insert(SMEM_ACCESS);
3004 } else if (SIInstrInfo::isLDSDIR(Inst)) {
3005 Events.insert(EXP_LDS_ACCESS);
3006 } else if (SIInstrInfo::isEXP(Inst)) {
// The export target selects the event: parameter, position, or (for any
// other target) a GPR lock.
3007 unsigned Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
3009 Events.insert(EXP_PARAM_ACCESS);
3010 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
3011 Events.insert(EXP_POS_ACCESS);
3012 else
3013 Events.insert(EXP_GPR_LOCK);
3014 } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
3015 Events.insert(SCC_WRITE);
3016 } else {
// Remaining scalar opcodes that are tracked by opcode rather than by class.
3017 switch (Inst.getOpcode()) {
3018 case AMDGPU::S_SENDMSG:
3019 case AMDGPU::S_SENDMSG_RTN_B32:
3020 case AMDGPU::S_SENDMSG_RTN_B64:
3021 case AMDGPU::S_SENDMSGHALT:
3022 Events.insert(SQ_MESSAGE);
3023 break;
3024 case AMDGPU::S_MEMTIME:
3025 case AMDGPU::S_MEMREALTIME:
3026 case AMDGPU::S_GET_BARRIER_STATE_M0:
3027 case AMDGPU::S_GET_BARRIER_STATE_IMM:
3028 Events.insert(SMEM_ACCESS);
3029 break;
3030 }
3031 }
3032 return Events;
3033}
3034
3035void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
3036 WaitcntBrackets *ScoreBrackets) {
3037
3038 WaitEventSet InstEvents = getEventsFor(Inst);
3039 for (WaitEventType E : wait_events()) {
3040 if (InstEvents.contains(E))
3041 ScoreBrackets->updateByEvent(E, Inst);
3042 }
3043
3044 if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
3045 if (TII.isAlwaysGDS(Inst.getOpcode()) ||
3046 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
3047 ScoreBrackets->setPendingGDS();
3048 }
3049 } else if (TII.isFLAT(Inst)) {
3050 if (Inst.mayLoadOrStore() && TII.mayAccessVMEMThroughFlat(Inst) &&
3051 TII.mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst)) {
3052 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
3053 // pointers. They do have two operands that each access global and LDS,
3054 // thus making it appear at this point that they are using a flat pointer.
3055 // Filter them out, and for the rest, generate a dependency on flat
3056 // pointers so that both VM and LGKM counters are flushed.
3057 ScoreBrackets->setPendingFlat();
3058 }
3059 if (SIInstrInfo::usesASYNC_CNT(Inst)) {
3060 ScoreBrackets->updateByEvent(ASYNC_ACCESS, Inst);
3061 }
3062 } else if (SIInstrInfo::usesTENSOR_CNT(Inst)) {
3063 ScoreBrackets->updateByEvent(TENSOR_ACCESS, Inst);
3064 } else if (Inst.isCall()) {
3065 // Act as a wait on everything, but AsyncCnt and TensorCnt are never
3066 // included in such blanket waits.
3067 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
3068 ScoreBrackets->setStateOnFunctionEntryOrReturn();
3069 } else if (TII.isVINTERP(Inst)) {
3070 int64_t Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
3071 ScoreBrackets->applyWaitcnt(AMDGPU::EXP_CNT, Imm);
3072 }
3073
3074 // Set XCNT to zero in the bracket for instructions that implicitly drain
3075 // XCNT.
3076 if (ST.hasWaitXcnt() && SIInstrInfo::isXcntDrain(Inst))
3077 ScoreBrackets->applyWaitcnt(AMDGPU::X_CNT, 0);
3078}
3079
3080bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
3081 unsigned OtherScore) {
3082 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
3083 unsigned OtherShifted =
3084 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
3085 Score = std::max(MyShifted, OtherShifted);
3086 return OtherShifted > MyShifted;
3087}
3088
3089bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
3090 ArrayRef<CounterValueArray> OtherMarks) {
3091 bool StrictDom = false;
3092
3093 LLVM_DEBUG(dbgs() << "Merging async marks ...");
3094 // Early exit: nothing to merge when both sides are empty.
3095 if (AsyncMarks.empty() && OtherMarks.empty()) {
3096 LLVM_DEBUG(dbgs() << " nothing to merge\n");
3097 return false;
3098 }
3099 LLVM_DEBUG(dbgs() << '\n');
3100
3101 // Determine maximum length needed after merging
3102 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size());
3103 MaxSize = std::min(MaxSize, MaxAsyncMarks);
3104
3105 // Keep only the most recent marks within our limit.
3106 if (AsyncMarks.size() > MaxSize)
3107 AsyncMarks.erase(AsyncMarks.begin(),
3108 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
3109
3110 // Pad with zero-filled marks if our list is shorter. Zero represents "no
3111 // pending async operations at this checkpoint" and acts as the identity
3112 // element for max() during merging. We pad at the beginning since the marks
3113 // need to be aligned in most-recent order.
3114 constexpr CounterValueArray ZeroMark{};
3115 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
3116
3117 LLVM_DEBUG({
3118 dbgs() << "Before merge:\n";
3119 for (const auto &Mark : AsyncMarks) {
3120 llvm::interleaveComma(Mark, dbgs());
3121 dbgs() << '\n';
3122 }
3123 dbgs() << "Other marks:\n";
3124 for (const auto &Mark : OtherMarks) {
3125 llvm::interleaveComma(Mark, dbgs());
3126 dbgs() << '\n';
3127 }
3128 });
3129
3130 // Merge element-wise using the existing mergeScore function and the
3131 // appropriate MergeInfo for each counter type. Iterate only while we have
3132 // elements in both vectors.
3133 unsigned OtherSize = OtherMarks.size();
3134 unsigned OurSize = AsyncMarks.size();
3135 unsigned MergeCount = std::min(OtherSize, OurSize);
3136 // OtherMarks is empty -> OtherSize == 0 -> MergeCount == 0.
3137 // Our existing marks are the conservative result; return early to avoid
3138 // passing MergeCount == 0 to seq_inclusive which asserts Begin <= End.
3139 if (MergeCount == 0)
3140 return StrictDom;
3141 for (auto Idx : seq_inclusive<unsigned>(1, MergeCount)) {
3142 for (auto T : inst_counter_types(Context->MaxCounter)) {
3143 StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
3144 OtherMarks[OtherSize - Idx][T]);
3145 }
3146 }
3147
3148 LLVM_DEBUG({
3149 dbgs() << "After merge:\n";
3150 for (const auto &Mark : AsyncMarks) {
3151 llvm::interleaveComma(Mark, dbgs());
3152 dbgs() << '\n';
3153 }
3154 });
3155
3156 return StrictDom;
3157}
3158
3159/// Merge the pending events and associater score brackets of \p Other into
3160/// this brackets status.
3161///
3162/// Returns whether the merge resulted in a change that requires tighter waits
3163/// (i.e. the merged brackets strictly dominate the original brackets).
3164bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
3165 bool StrictDom = false;
3166
3167 // Check if "other" has keys we don't have, and create default entries for
3168 // those. If they remain empty after merging, we will clean it up after.
3169 for (auto K : Other.VMem.keys())
3170 VMem.try_emplace(K);
3171 for (auto K : Other.SGPRs.keys())
3172 SGPRs.try_emplace(K);
3173
3174 // Array to store MergeInfo for each counter type
3175 MergeInfo MergeInfos[AMDGPU::NUM_INST_CNTS];
3176
3177 for (auto T : inst_counter_types(Context->MaxCounter)) {
3178 // Merge event flags for this counter
3179 const WaitEventSet &EventsForT = Context->getWaitEvents(T);
3180 const WaitEventSet OldEvents = PendingEvents & EventsForT;
3181 const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
3182 if (!OldEvents.contains(OtherEvents))
3183 StrictDom = true;
3184 PendingEvents |= OtherEvents;
3185
3186 // Merge scores for this counter
3187 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
3188 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
3189 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
3190 if (NewUB < ScoreLBs[T])
3191 report_fatal_error("waitcnt score overflow");
3192
3193 MergeInfo &M = MergeInfos[T];
3194 M.OldLB = ScoreLBs[T];
3195 M.OtherLB = Other.ScoreLBs[T];
3196 M.MyShift = NewUB - ScoreUBs[T];
3197 M.OtherShift = NewUB - Other.ScoreUBs[T];
3198
3199 ScoreUBs[T] = NewUB;
3200
3201 if (T == AMDGPU::LOAD_CNT)
3202 StrictDom |= mergeScore(M, LastFlatLoadCnt, Other.LastFlatLoadCnt);
3203
3204 if (T == AMDGPU::DS_CNT) {
3205 StrictDom |= mergeScore(M, LastFlatDsCnt, Other.LastFlatDsCnt);
3206 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
3207 }
3208
3209 if (T == AMDGPU::KM_CNT) {
3210 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
3211 if (Other.hasPendingEvent(SCC_WRITE)) {
3212 if (!OldEvents.contains(SCC_WRITE)) {
3213 PendingSCCWrite = Other.PendingSCCWrite;
3214 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
3215 PendingSCCWrite = nullptr;
3216 }
3217 }
3218 }
3219
3220 for (auto &[RegID, Info] : VMem)
3221 StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
3222
3223 if (isSmemCounter(T)) {
3224 for (auto &[RegID, Info] : SGPRs) {
3225 auto It = Other.SGPRs.find(RegID);
3226 unsigned OtherScore = (It != Other.SGPRs.end()) ? It->second.get(T) : 0;
3227 StrictDom |= mergeScore(M, Info.get(T), OtherScore);
3228 }
3229 }
3230 }
3231
3232 for (auto &[TID, Info] : VMem) {
3233 if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
3234 unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
3235 StrictDom |= NewVmemTypes != Info.VMEMTypes;
3236 Info.VMEMTypes = NewVmemTypes;
3237 }
3238 }
3239
3240 StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks);
3241 for (auto T : inst_counter_types(Context->MaxCounter))
3242 StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]);
3243
3244 purgeEmptyTrackingData();
3245 return StrictDom;
3246}
3247
3248static bool isWaitInstr(MachineInstr &Inst) {
3249 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
3250 return Opcode == AMDGPU::S_WAITCNT ||
3251 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
3252 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
3253 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
3254 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
3255 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
3256 Opcode == AMDGPU::WAIT_ASYNCMARK ||
3257 counterTypeForInstr(Opcode).has_value();
3258}
3259
/// Insert an S_SETREG_IMM32_B32 into \p MBB that writes 2 when \p ExpertMode
/// is true and 0 otherwise to the hardware register field described by
/// EncodedReg. The instruction is built at `I` with an empty DebugLoc.
///
/// NOTE(review): this listing skips lines 3261 and 3264, so the declaration
/// of the insertion point `I` and the arguments of HwregEncoding::encode()
/// are not visible here; confirm against the upstream file.
3260void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
3262 bool ExpertMode) const {
3263 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
3265 BuildMI(MBB, I, DebugLoc(), TII.get(AMDGPU::S_SETREG_IMM32_B32))
3266 .addImm(ExpertMode ? 2 : 0)
3267 .addImm(EncodedReg);
3268}
3269
3270namespace {
3271// TODO: Remove this work-around after fixing the scheduler.
3272// There are two reasons why vccz might be incorrect; see ST.hasReadVCCZBug()
3273// and ST.partialVCCWritesUpdateVCCZ().
3274// i. VCCZBug: There is a hardware bug on CI/SI where SMRD instruction may
3275// corrupt vccz bit, so when we detect that an instruction may read from
3276// a corrupt vccz bit, we need to:
3277// 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
3278// operations to complete.
3279// 2. Recompute the correct value of vccz by writing the current value
3280// of vcc back to vcc.
3281// ii. Partial writes to vcc don't update vccz, so we need to recompute the
3282// correct value of vccz by reading vcc and writing it back to vcc.
3283// No waitcnt is needed in this case.
3284class VCCZWorkaround {
3285 const WaitcntBrackets &ScoreBrackets;
3286 const GCNSubtarget &ST;
3287 const SIInstrInfo &TII;
3288 const SIRegisterInfo &TRI;
3289 bool VCCZCorruptionBug = false;
3290 bool VCCZNotUpdatedByPartialWrites = false;
3291 /// vccz could be incorrect at a basic block boundary if a predecessor wrote
3292 /// to vcc and then issued an smem load, so initialize to true.
3293 bool MustRecomputeVCCZ = true;
3294
3295public:
3296 VCCZWorkaround(const WaitcntBrackets &ScoreBrackets, const GCNSubtarget &ST,
3297 const SIInstrInfo &TII, const SIRegisterInfo &TRI)
3298 : ScoreBrackets(ScoreBrackets), ST(ST), TII(TII), TRI(TRI) {
3299 VCCZCorruptionBug = ST.hasReadVCCZBug();
3300 VCCZNotUpdatedByPartialWrites = !ST.partialVCCWritesUpdateVCCZ();
3301 }
3302 /// If \p MI reads vccz and we must recompute it based on MustRecomputeVCCZ,
3303 /// then emit a vccz recompute instruction before \p MI. This needs to be
3304 /// called on every instruction in the basic block because it also tracks the
3305 /// state and updates MustRecomputeVCCZ accordingly. Returns true if it
3306 /// modified the IR.
3307 bool tryRecomputeVCCZ(MachineInstr &MI) {
3308 // No need to run this if neither bug is present.
3309 if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
3310 return false;
3311
3312 // If MI is an SMEM and it can corrupt vccz on this target, then we need
3313 // both to emit a waitcnt and to recompute vccz.
3314 // But we don't actually emit a waitcnt here. This is done in
3315 // generateWaitcntInstBefore() because it tracks all the necessary waitcnt
3316 // state, and can either skip emitting a waitcnt if there is already one in
3317 // the IR, or emit an "optimized" combined waitcnt.
3318 // If this is an smem read, it could complete and clobber vccz at any time.
3319 MustRecomputeVCCZ |= VCCZCorruptionBug && TII.isSMRD(MI);
3320
3321 // If the target partial vcc writes don't update vccz, and MI is such an
3322 // instruction then we must recompute vccz.
3323 // Note: We are using PartiallyWritesToVCCOpt optional to avoid calling
3324 // `definesRegister()` more than needed, because it's not very cheap.
3325 std::optional<bool> PartiallyWritesToVCCOpt;
3326 auto PartiallyWritesToVCC = [](MachineInstr &MI) {
3327 return MI.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
3328 MI.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr);
3329 };
3330 if (VCCZNotUpdatedByPartialWrites) {
3331 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
3332 // If this is a partial VCC write but won't update vccz, then we must
3333 // recompute vccz.
3334 MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
3335 }
3336
3337 // If MI is a vcc write with no pending smem, or there is a pending smem
3338 // but the target does not suffer from the vccz corruption bug, then we
3339 // don't need to recompute vccz as this write will recompute it anyway.
3340 if (!ScoreBrackets.hasPendingEvent(SMEM_ACCESS) || !VCCZCorruptionBug) {
3341 // Compute PartiallyWritesToVCCOpt if we haven't done so already.
3342 if (!PartiallyWritesToVCCOpt)
3343 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
3344 bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
3345 MI.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr);
3346 // If we write to the full vcc or we write partially and the target
3347 // updates vccz on partial writes, then vccz will be updated correctly.
3348 bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
3349 *PartiallyWritesToVCCOpt);
3350 if (UpdatesVCCZ)
3351 MustRecomputeVCCZ = false;
3352 }
3353
3354 // If MI is a branch that reads VCCZ then emit a waitcnt and a vccz
3355 // restore instruction if either is needed.
3356 if (SIInstrInfo::isCBranchVCCZRead(MI) && MustRecomputeVCCZ) {
3357 // Recompute the vccz bit. Any time a value is written to vcc, the vccz
3358 // bit is updated, so we can restore the bit by reading the value of vcc
3359 // and then writing it back to the register.
3360 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
3361 TII.get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
3362 TRI.getVCC())
3363 .addReg(TRI.getVCC());
3364 MustRecomputeVCCZ = false;
3365 return true;
3366 }
3367 return false;
3368 }
3369};
3370
3371} // namespace
3372
3373// Generate s_waitcnt instructions where needed.
3374bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
3375 MachineBasicBlock &Block,
3376 WaitcntBrackets &ScoreBrackets) {
3377 bool Modified = false;
3378
3379 LLVM_DEBUG({
3380 dbgs() << "*** Begin Block: ";
3381 Block.printName(dbgs());
3382 ScoreBrackets.dump();
3383 });
3384 VCCZWorkaround VCCZW(ScoreBrackets, ST, TII, TRI);
3385
3386 // Walk over the instructions.
3387 MachineInstr *OldWaitcntInstr = nullptr;
3388
3389 // NOTE: We may append instrs after Inst while iterating.
3390 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
3391 E = Block.instr_end();
3392 Iter != E; ++Iter) {
3393 MachineInstr &Inst = *Iter;
3394 if (isNonWaitcntMetaInst(Inst))
3395 continue;
3396 // Track pre-existing waitcnts that were added in earlier iterations or by
3397 // the memory legalizer.
3398 if (isWaitInstr(Inst) ||
3399 (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
3400 if (!OldWaitcntInstr)
3401 OldWaitcntInstr = &Inst;
3402 continue;
3403 }
3404
3405 PreheaderFlushFlags FlushFlags;
3406 if (Block.getFirstTerminator() == Inst)
3407 FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3408
3409 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
3410 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3411 FlushFlags);
3412 OldWaitcntInstr = nullptr;
3413
3414 if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
3415 // Asyncmarks record the current wait state and so should not allow
3416 // waitcnts that occur after them to be merged into waitcnts that occur
3417 // before.
3418 ScoreBrackets.recordAsyncMark(Inst);
3419 continue;
3420 }
3421
3422 if (TII.isSMRD(Inst)) {
3423 for (const MachineMemOperand *Memop : Inst.memoperands()) {
3424 // No need to handle invariant loads when avoiding WAR conflicts, as
3425 // there cannot be a vector store to the same memory location.
3426 if (!Memop->isInvariant()) {
3427 const Value *Ptr = Memop->getValue();
3428 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
3429 }
3430 }
3431 }
3432
3433 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3434
3435 // Note: insertForcedWaitAfter() may add instrs after Iter that need to be
3436 // visited by the loop.
3437 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
3438
3439 LLVM_DEBUG({
3440 Inst.print(dbgs());
3441 ScoreBrackets.dump();
3442 });
3443
3444 // If the target suffers from the vccz bugs, this may emit the necessary
3445 // vccz recompute instruction before \p Inst if needed.
3446 Modified |= VCCZW.tryRecomputeVCCZ(Inst);
3447 }
3448
3449 // Flush counters at the end of the block if needed (for preheaders with no
3450 // terminator).
3451 AMDGPU::Waitcnt Wait;
3452 if (Block.getFirstTerminator() == Block.end()) {
3453 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3454 if (FlushFlags.FlushVmCnt) {
3455 if (ScoreBrackets.hasPendingEvent(AMDGPU::LOAD_CNT))
3456 Wait.set(AMDGPU::LOAD_CNT, 0);
3457 if (ScoreBrackets.hasPendingEvent(AMDGPU::SAMPLE_CNT))
3458 Wait.set(AMDGPU::SAMPLE_CNT, 0);
3459 if (ScoreBrackets.hasPendingEvent(AMDGPU::BVH_CNT))
3460 Wait.set(AMDGPU::BVH_CNT, 0);
3461 }
3462 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
3463 Wait.set(AMDGPU::DS_CNT, 0);
3464 }
3465
3466 // Combine or remove any redundant waitcnts at the end of the block.
3467 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
3468 OldWaitcntInstr);
3469
3470 LLVM_DEBUG({
3471 dbgs() << "*** End Block: ";
3472 Block.printName(dbgs());
3473 ScoreBrackets.dump();
3474 });
3475
3476 return Modified;
3477}
3478
3479bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
3480 if (Block.size() <= 1)
3481 return false;
3482 // The Memory Legalizer conservatively inserts a soft xcnt before each
3483 // atomic RMW operation. However, for sequences of back-to-back atomic
3484 // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
3485 // the redundant soft xcnts.
3486 bool Modified = false;
3487 // Remember the last atomic with a soft xcnt right before it.
3488 MachineInstr *LastAtomicWithSoftXcnt = nullptr;
3489
3490 for (MachineInstr &MI : drop_begin(Block)) {
3491 // Ignore last atomic if non-LDS VMEM and SMEM.
3492 bool IsLDS =
3493 TII.isDS(MI) || (TII.isFLAT(MI) && TII.mayAccessLDSThroughFlat(MI));
3494 if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
3495 LastAtomicWithSoftXcnt = nullptr;
3496
3497 bool IsAtomicRMW = (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) &&
3498 MI.mayLoad() && MI.mayStore();
3499 MachineInstr &PrevMI = *MI.getPrevNode();
3500 // This is an atomic with a soft xcnt.
3501 if (PrevMI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3502 // If we have already found an atomic with a soft xcnt, remove this soft
3503 // xcnt as it's redundant.
3504 if (LastAtomicWithSoftXcnt) {
3505 PrevMI.eraseFromParent();
3506 Modified = true;
3507 }
3508 LastAtomicWithSoftXcnt = &MI;
3509 }
3510 }
3511 return Modified;
3512}
3513
3514// Return flags indicating which counters should be flushed in the preheader.
3515PreheaderFlushFlags
3516SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
3517 const WaitcntBrackets &ScoreBrackets) {
3518 auto [Iterator, IsInserted] =
3519 PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags());
3520 if (!IsInserted)
3521 return Iterator->second;
3522
3523 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3524 if (!Succ)
3525 return PreheaderFlushFlags();
3526
3527 MachineLoop *Loop = MLI.getLoopFor(Succ);
3528 if (!Loop)
3529 return PreheaderFlushFlags();
3530
3531 if (Loop->getLoopPreheader() == &MBB) {
3532 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3533 return Iterator->second;
3534 }
3535
3536 return PreheaderFlushFlags();
3537}
3538
3539bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
3541 return TII.mayAccessVMEMThroughFlat(MI);
3542 return SIInstrInfo::isVMEM(MI);
3543}
3544
3545bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
3546 return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
3547}
3548
3549// Check if instruction is a store to LDS that is counted via DSCNT
3550// (where that counter exists).
3551bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3552 return MI.mayStore() && SIInstrInfo::isDS(MI);
3553}
3554
3555// Return flags indicating which counters should be flushed in the preheader of
3556// the given loop. We currently decide to flush in the following situations:
3557// For VMEM (FlushVmCnt):
3558// 1. The loop contains vmem store(s), no vmem load and at least one use of a
3559// vgpr containing a value that is loaded outside of the loop. (Only on
3560// targets with no vscnt counter).
3561// 2. The loop contains vmem load(s), but the loaded values are not used in the
3562// loop, and at least one use of a vgpr containing a value that is loaded
3563// outside of the loop.
3564// For DS (FlushDsCnt, GFX12+ only):
3565// 3. The loop contains no DS reads, and at least one use of a vgpr containing
3566// a value that is DS read outside of the loop.
3567// 4. The loop contains DS read(s), loaded values are not used in the same
3568// iteration but in the next iteration (prefetch pattern), and at least one
3569// use of a vgpr containing a value that is DS read outside of the loop.
3570// Flushing in preheader reduces wait overhead if the wait requirement in
3571// iteration 1 would otherwise be more strict (but unfortunately preheader
3572// flush decision is taken before knowing that).
3573// 5. (Single-block loops only) The loop has DS prefetch reads with flush point
3574// tracking. Some DS reads may be used in the same iteration (creating
3575// "flush points"), but others remain unflushed at the backedge. When a DS
3576// read is consumed in the same iteration, it and all prior reads are
3577// "flushed" (FIFO order). No DS writes are allowed in the loop.
3578// TODO: Find a way to extend to multi-block loops.
//
// \p ML is the loop whose blocks are scanned instruction by instruction.
// \p Brackets is only queried (never modified) for events still pending on the
// register units of vector registers used inside the loop.
// The early exits return the default-constructed flags, i.e. the flags are
// only ever set by the two decisions at the end of the function.
3579PreheaderFlushFlags
3580SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
3581 const WaitcntBrackets &Brackets) {
3582 PreheaderFlushFlags Flags;
3583 bool HasVMemLoad = false;
3584 bool HasVMemStore = false;
3585 bool UsesVgprVMEMLoadedOutside = false;
3586 bool UsesVgprDSReadOutside = false;
// Set once cases 1. and 2. can no longer apply.
3587 bool VMemInvalidated = false;
3588 // DS optimization only applies to GFX12+ where DS_CNT is separate.
3589 // Tracking status for "no DS read in loop" or "pure DS prefetch
3590 // (use only in next iteration)".
3591 bool TrackSimpleDSOpt = ST.hasExtendedWaitCounts();
// Register units of vector registers used so far, and register units defined
// so far by VMEM loads / DS reads inside the loop.
3592 DenseSet<MCRegUnit> VgprUse;
3593 DenseSet<MCRegUnit> VgprDefVMEM;
3594 DenseSet<MCRegUnit> VgprDefDS;
3595
3596 // Track DS reads for prefetch pattern with flush points (single-block only).
3597 // Keeps track of the last DS read (position counted from the top of the loop)
3598 // to each VGPR. Read is considered consumed (and thus needs flushing) if
3599 // the dest register has a use or is overwritten (by any later operations).
3600 DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
3601 unsigned DSReadPosition = 0;
3602 bool IsSingleBlock = ML->getNumBlocks() == 1;
3603 bool TrackDSFlushPoint = ST.hasExtendedWaitCounts() && IsSingleBlock;
// Position of the latest DS read known to be consumed inside the loop; 0 means
// none (positions start at 1 because DSReadPosition is pre-incremented).
3604 unsigned LastDSFlushPosition = 0;
3605
3606 for (MachineBasicBlock *MBB : ML->blocks()) {
3607 for (MachineInstr &MI : *MBB) {
3608 if (isVMEMOrFlatVMEM(MI)) {
3609 HasVMemLoad |= MI.mayLoad();
3610 HasVMemStore |= MI.mayStore();
3611 }
3612 // TODO: Can we relax DSStore check? There may be cases where
3613 // these DS stores are drained prior to the end of MBB (or loop).
3614 if (mayStoreIncrementingDSCNT(MI)) {
3615 // Early exit if none of the optimizations are feasible.
3616 // Otherwise, set tracking status appropriately and continue.
3617 if (VMemInvalidated)
3618 return Flags;
3619 TrackSimpleDSOpt = false;
3620 TrackDSFlushPoint = false;
3621 }
3622 bool IsDSRead = isDSRead(MI);
3623 if (IsDSRead)
3624 ++DSReadPosition;
3625
3626 // Helper: if RU has a pending DS read, update LastDSFlushPosition
3627 auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3628 if (!TrackDSFlushPoint)
3629 return;
3630 if (auto It = LastDSReadPositionMap.find(RU);
3631 It != LastDSReadPositionMap.end()) {
3632 // RU defined by DSRead is used or overwritten. Need to complete
3633 // the read, if not already implied by a later DSRead (to any RU)
3634 // needing to complete in FIFO order.
3635 LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
3636 }
3637 };
3638
// Uses are processed before this instruction's own defs, so a register that
// MI both reads and (re)loads is checked against earlier defs only.
3639 for (const MachineOperand &Op : MI.all_uses()) {
3640 if (Op.isDebug() || !TRI.isVectorRegister(MRI, Op.getReg()))
3641 continue;
3642 // Vgpr use
3643 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3644 // If we find a register that is loaded inside the loop, 1. and 2.
3645 // are invalidated.
3646 if (VgprDefVMEM.contains(RU))
3647 VMemInvalidated = true;
3648
3649 // Check for DS reads used inside the loop
3650 if (VgprDefDS.contains(RU))
3651 TrackSimpleDSOpt = false;
3652
3653 // Early exit if all optimizations are invalidated
3654 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3655 return Flags;
3656
3657 // Check for flush points (DS read used in same iteration)
3658 updateDSReadFlushTracking(RU);
3659
3660 VgprUse.insert(RU);
3661 // Check if this register has a pending VMEM load from outside the
3662 // loop (value loaded outside and used inside).
3663 VMEMID ID = toVMEMID(RU);
3664 if (Brackets.hasPendingVMEM(ID, AMDGPU::LOAD_CNT) ||
3665 Brackets.hasPendingVMEM(ID, AMDGPU::SAMPLE_CNT) ||
3666 Brackets.hasPendingVMEM(ID, AMDGPU::BVH_CNT))
3667 UsesVgprVMEMLoadedOutside = true;
3668 // Check if loaded outside the loop via DS (not VMEM/FLAT).
3669 // Only consider it a DS read if there's no pending VMEM load for
3670 // this register, since FLAT can set both counters.
3671 else if (Brackets.hasPendingVMEM(ID, AMDGPU::DS_CNT))
3672 UsesVgprDSReadOutside = true;
3673 }
3674 }
3675
3676 // VMem load vgpr def
3677 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
3678 for (const MachineOperand &Op : MI.all_defs()) {
3679 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3680 // If we find a register that is loaded inside the loop, 1. and 2.
3681 // are invalidated.
3682 if (VgprUse.contains(RU))
3683 VMemInvalidated = true;
3684 VgprDefVMEM.insert(RU);
3685 }
3686 }
3687 // Early exit if all optimizations are invalidated
3688 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3689 return Flags;
3690 }
3691
3692 // DS read vgpr def
3693 // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RU).
3694 // If USE comes before DEF, it's the prefetch pattern (use value from
3695 // previous iteration, read for next iteration). We should still flush
3696 // in preheader so iteration 1 doesn't need to wait inside the loop.
3697 // Only invalidate when DEF comes before USE (same-iteration consumption,
3698 // checked above when processing uses).
// While flush-point tracking is on, the defs of every instruction (not only
// DS reads) are visited so that overwrites of a DS-read destination count as
// flush points.
3699 if (IsDSRead || TrackDSFlushPoint) {
3700 for (const MachineOperand &Op : MI.all_defs()) {
3701 if (!TRI.isVectorRegister(MRI, Op.getReg()))
3702 continue;
3703 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3704 // Check for overwrite of pending DS read (flush point) by any
3705 // instruction
3706 updateDSReadFlushTracking(RU);
3707 if (IsDSRead) {
3708 VgprDefDS.insert(RU);
3709 if (TrackDSFlushPoint)
3710 LastDSReadPositionMap[RU] = DSReadPosition;
3711 }
3712 }
3713 }
3714 }
3715 }
3716 }
3717
3718 // VMEM flush decision
// First alternative is case 1. (stores only, target without vscnt); second is
// case 2., additionally gated on ST.hasVmemWriteVgprInOrder().
3719 if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3720 ((!ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3721 (HasVMemLoad && ST.hasVmemWriteVgprInOrder())))
3722 Flags.FlushVmCnt = true;
3723
3724 // DS flush decision:
3725 // Simple DS Opt: flush if loop uses DS read values from outside
3726 // and either has no DS reads in the loop, or DS reads whose results
3727 // are not used in the loop.
3728 bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3729 // Prefetch with flush points: some DS reads used in same iteration,
3730 // but unflushed reads remain at backedge
// DSReadPosition is the total number of DS reads seen, so this is true when
// at least one DS read comes after the last flush point.
3731 bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3732 bool DSFlushPointPrefetch =
3733 TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3734
3735 if (SimpleDSOpt || DSFlushPointPrefetch)
3736 Flags.FlushDsCnt = true;
3737
3738 return Flags;
3739}
3740
3741bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3742 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3743 auto &PDT =
3744 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3745 AliasAnalysis *AA = nullptr;
3746 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3747 AA = &AAR->getAAResults();
3748
3749 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3750}
3751
3752PreservedAnalyses
3755 auto &MLI = MFAM.getResult<MachineLoopAnalysis>(MF);
3756 auto &PDT = MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
3758 .getManager()
3759 .getCachedResult<AAManager>(MF.getFunction());
3760
3761 if (!SIInsertWaitcnts(MLI, PDT, AA, MF).run())
3762 return PreservedAnalyses::all();
3763
3766 .preserve<AAManager>();
3767}
3768
// Driver for the pass on one machine function. In order, the visible code:
//  - sets up hardware limits and the target-specific waitcnt generator (WCG),
//  - for non-entry, non-naked functions, emits zero waits at function entry,
//  - iterates insertWaitcntInBlock() over the blocks until a fix point,
//  - adds S_DCACHE_WB before wave termination when scalar stores are present,
//  - toggles expert scheduling mode around calls and returns when enabled,
//  - emits VGPR deallocation before the recorded S_ENDPGM instructions.
// Returns true if the function was modified.
// NOTE(review): several source lines of this definition are not present in
// this text (each gap is marked below); restore them from upstream before
// relying on this copy.
3769bool SIInsertWaitcnts::run() {
// NOTE(review): IV (and possibly MFI) are used below without a visible
// declaration; presumably they were declared on the two lines missing here.
3771
3773
3774 // Initialize hardware limits first, as they're needed by the generators.
3775 Limits = AMDGPU::HardwareLimits(IV);
3776
3777 if (ST.hasExtendedWaitCounts()) {
// The command-line flag, when given explicitly, takes precedence over the
// "amdgpu-expert-scheduling-mode" function attribute.
3778 IsExpertMode = ST.hasExpertSchedulingMode() &&
3779 (ExpertSchedulingModeFlag.getNumOccurrences()
// NOTE(review): the "? ..." arm of this conditional is not visible here.
3781 : MF.getFunction()
3782 .getFnAttribute("amdgpu-expert-scheduling-mode")
3783 .getValueAsBool());
3784 MaxCounter = IsExpertMode ? AMDGPU::NUM_EXPERT_INST_CNTS
// NOTE(review): the ": ..." arm of this conditional is not visible here.
3786 // Initialize WCG per MF. It contains state that depends on MF attributes.
3787 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,
3788 IsExpertMode);
3789 } else {
3790 MaxCounter = AMDGPU::NUM_NORMAL_INST_CNTS;
3791 // Initialize WCG per MF. It contains state that depends on MF attributes.
3792 WCG = std::make_unique<WaitcntGeneratorPreGFX12>(
3793 MF, AMDGPU::NUM_NORMAL_INST_CNTS, Limits);
3794 }
3795
3796 SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);
3797
3798 bool Modified = false;
3799
3800 MachineBasicBlock &EntryBB = MF.front();
3801
3802 if (!MFI->isEntryFunction() &&
3803 !MF.getFunction().hasFnAttribute(Attribute::Naked)) {
3804 // Wait for any outstanding memory operations that the input registers may
3805 // depend on. We can't track them and it's better to do the wait after the
3806 // costly call sequence.
3807
3808 // TODO: Could insert earlier and schedule more liberally with operations
3809 // that only use caller preserved registers.
// NOTE(review): the declaration of the iterator I is not visible here;
// presumably it starts at EntryBB.begin().
3811 while (I != EntryBB.end() && I->isMetaInstruction())
3812 ++I;
3813
3814 if (ST.hasExtendedWaitCounts()) {
3815 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
3816 .addImm(0);
// NOTE(review): the loop header that declares CT is not visible here.
3818 if (CT == AMDGPU::LOAD_CNT || CT == AMDGPU::DS_CNT ||
3819 CT == AMDGPU::STORE_CNT || CT == AMDGPU::X_CNT ||
// NOTE(review): the final operand of this condition is not visible here.
3821 continue;
3822
3823 if (!ST.hasImageInsts() &&
3824 (CT == AMDGPU::EXP_CNT || CT == AMDGPU::SAMPLE_CNT ||
3825 CT == AMDGPU::BVH_CNT))
3826 continue;
3827
3828 BuildMI(EntryBB, I, DebugLoc(),
3829 TII.get(instrsForExtendedCounterTypes[CT]))
3830 .addImm(0);
3831 }
3832 if (IsExpertMode) {
3833 unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, ST);
// NOTE(review): one source line is not visible here; Enc may be modified
// further before it is used.
3835 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3836 .addImm(Enc);
3837 }
3838 } else {
3839 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT)).addImm(0);
3840 }
3841
3842 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
3843 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3844 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3845
3846 Modified = true;
3847 }
3848
3849 // Keep iterating over the blocks in reverse post order, inserting and
3850 // updating s_waitcnt where needed, until a fix point is reached.
3851 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3852 BlockInfos.try_emplace(MBB);
3853
// Scratch state reused across blocks; it may be moved into a successor's
// Incoming slot below, in which case it is null on the next iteration.
3854 std::unique_ptr<WaitcntBrackets> Brackets;
3855 bool Repeat;
3856 do {
3857 Repeat = false;
3858
3859 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
3860 ++BII) {
3861 MachineBasicBlock *MBB = BII->first;
3862 BlockInfo &BI = BII->second;
3863 if (!BI.Dirty)
3864 continue;
3865
3866 if (BI.Incoming) {
3867 if (!Brackets)
3868 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3869 else
3870 *Brackets = *BI.Incoming;
3871 } else {
3872 if (!Brackets) {
3873 Brackets = std::make_unique<WaitcntBrackets>(this);
3874 } else {
3875 // Reinitialize in-place. N.B. do not do this by assigning from a
3876 // temporary because the WaitcntBrackets class is large and it could
3877 // cause this function to use an unreasonable amount of stack space.
3878 Brackets->~WaitcntBrackets();
3879 new (Brackets.get()) WaitcntBrackets(this);
3880 }
3881 }
3882
3883 if (ST.hasWaitXcnt())
3884 Modified |= removeRedundantSoftXcnts(*MBB);
3885 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
3886 BI.Dirty = false;
3887
3888 if (Brackets->hasPendingEvent()) {
// The first successor without incoming state receives Brackets by move (after
// the loop, since later successors still read it); other such successors get
// a copy, and successors that already have state are merged into.
3889 BlockInfo *MoveBracketsToSucc = nullptr;
3890 for (MachineBasicBlock *Succ : MBB->successors()) {
3891 auto *SuccBII = BlockInfos.find(Succ);
3892 BlockInfo &SuccBI = SuccBII->second;
3893 if (!SuccBI.Incoming) {
3894 SuccBI.Dirty = true;
// A successor at or before the current position has already been passed in
// this sweep, so another sweep is required.
3895 if (SuccBII <= BII) {
3896 LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");
3897 Repeat = true;
3898 }
3899 if (!MoveBracketsToSucc) {
3900 MoveBracketsToSucc = &SuccBI;
3901 } else {
3902 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3903 }
3904 } else {
3905 LLVM_DEBUG({
3906 dbgs() << "Try to merge ";
3907 MBB->printName(dbgs());
3908 dbgs() << " into ";
3909 Succ->printName(dbgs());
3910 dbgs() << '\n';
3911 });
3912 if (SuccBI.Incoming->merge(*Brackets)) {
3913 SuccBI.Dirty = true;
3914 if (SuccBII <= BII) {
3915 LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");
3916 Repeat = true;
3917 }
3918 }
3919 }
3920 }
3921 if (MoveBracketsToSucc)
3922 MoveBracketsToSucc->Incoming = std::move(Brackets);
3923 }
3924 }
3925 } while (Repeat);
3926
3927 if (ST.hasScalarStores()) {
3928 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3929 bool HaveScalarStores = false;
3930
3931 for (MachineBasicBlock &MBB : MF) {
3932 for (MachineInstr &MI : MBB) {
3933 if (!HaveScalarStores && TII.isScalarStore(MI))
3934 HaveScalarStores = true;
3935
3936 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
3937 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3938 EndPgmBlocks.push_back(&MBB);
3939 }
3940 }
3941
3942 if (HaveScalarStores) {
3943 // If scalar writes are used, the cache must be flushed or else the next
3944 // wave to reuse the same scratch memory can be clobbered.
3945 //
3946 // Insert s_dcache_wb at wave termination points if there were any scalar
3947 // stores, and only if the cache hasn't already been flushed. This could
3948 // be improved by looking across blocks for flushes in postdominating
3949 // blocks from the stores but an explicitly requested flush is probably
3950 // very rare.
3951 for (MachineBasicBlock *MBB : EndPgmBlocks) {
3952 bool SeenDCacheWB = false;
3953
3954 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
3955 I != E; ++I) {
// A scalar store after a write-back makes that write-back stale again.
3956 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
3957 SeenDCacheWB = true;
3958 else if (TII.isScalarStore(*I))
3959 SeenDCacheWB = false;
3960
3961 // FIXME: It would be better to insert this before a waitcnt if any.
3962 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
3963 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3964 !SeenDCacheWB) {
3965 Modified = true;
3966 BuildMI(*MBB, I, I->getDebugLoc(), TII.get(AMDGPU::S_DCACHE_WB));
3967 }
3968 }
3969 }
3970 }
3971 }
3972
3973 if (IsExpertMode) {
3974 // Enable expert scheduling on function entry. To satisfy ABI requirements
3975 // and to allow calls between function with different expert scheduling
3976 // settings, disable it around calls and before returns.
3977
// NOTE(review): the declaration of the iterator I is not visible here;
// presumably it starts at EntryBB.begin().
3979 while (I != EntryBB.end() && I->isMetaInstruction())
3980 ++I;
3981 setSchedulingMode(EntryBB, I, true);
3982
3983 for (MachineInstr *MI : CallInsts) {
3984 MachineBasicBlock &MBB = *MI->getParent();
3985 setSchedulingMode(MBB, MI, false);
3986 setSchedulingMode(MBB, std::next(MI->getIterator()), true);
3987 }
3988
3989 for (MachineInstr *MI : ReturnInsts)
3990 setSchedulingMode(*MI->getParent(), MI, false);
3991
3992 Modified = true;
3993 }
3994
3995 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
3996 // This is done in different ways depending on how the VGPRs were allocated
3997 // (i.e. whether we're in dynamic VGPR mode or not).
3998 // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
3999 // waveslot limited kernel runs slower with the deallocation.
4000 if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
4001 for (auto [MI, _] : EndPgmInsts) {
4002 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
4003 TII.get(AMDGPU::S_ALLOC_VGPR))
4004 .addImm(0);
4005 Modified = true;
4006 }
4007 } else if (!WCG->isOptNone() &&
4008 ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
4009 (MF.getFrameInfo().hasCalls() ||
4010 ST.getOccupancyWithNumVGPRs(
4011 TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),
4012 /*IsDynamicVGPR=*/false) <
// NOTE(review): the right-hand side of this comparison and the end of the
// condition are not visible here.
4014 for (auto [MI, Flag] : EndPgmInsts) {
4015 if (Flag) {
4016 if (ST.requiresNopBeforeDeallocVGPRs()) {
4017 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
4018 TII.get(AMDGPU::S_NOP))
4019 .addImm(0);
4020 }
4021 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
4022 TII.get(AMDGPU::S_SENDMSG))
// NOTE(review): the operand added to this S_SENDMSG is not visible here.
4024 Modified = true;
4025 }
4026 }
4027 }
4028
4029 return Modified;
4030}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
#define _
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
#define AMDGPU_EVENT_NAME(Name)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static std::optional< AMDGPU::InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static bool isWaitInstr(MachineInstr &Inst)
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
Provides some synthesis utilities to produce sequences of values.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
Represents the counter values to wait for in an s_waitcnt instruction.
unsigned get(InstCounterType T) const
void set(InstCounterType T, unsigned Val)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:275
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
static bool shouldExecute(CounterInfo &Counter)
static bool isCounterSet(CounterInfo &Info)
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301
bool erase(const KeyT &Val)
Definition DenseMap.h:379
iterator end()
Definition DenseMap.h:143
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:759
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:724
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as:
MachineInstrBundleIterator< MachineInstr > iterator
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
mop_range operands()
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
iterator begin()
Definition MapVector.h:67
iterator find(const KeyT &Key)
Definition MapVector.h:156
iterator end()
Definition MapVector.h:69
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
Definition Pass.cpp:140
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool isXcntDrain(const MachineInstr &MI)
True if MI implicitly drains XCNT.
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool usesTENSOR_CNT(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isGFX12CacheInvOrWBInst(unsigned Opc)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool usesASYNC_CNT(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void push_back(const T &Elt)
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:882
Target - Wrapper for Target specific information.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:185
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned getMaxWavesPerEU(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isDPMACCInstruction(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getHasMatrixScale(unsigned Opc)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
bool empty() const
Definition BasicBlock.h:101
LLVM_ABI std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
Definition Sequence.h:325
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
APInt operator&(APInt a, const APInt &b)
Definition APInt.h:2152
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
Definition Sequence.h:337
@ Wait
Definition Threading.h:60
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2142
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2312
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition STLExtras.h:633
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ Async
"Asynchronous" unwind tables (instr precise)
Definition CodeGen.h:157
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool operator|=(SparseBitVector< ElementSize > &LHS, const SparseBitVector< ElementSize > *RHS)
APInt operator|(APInt a, const APInt &b)
Definition APInt.h:2172
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
#define N
static constexpr ValueType Default
static constexpr uint64_t encode(Fields... Values)
Represents the hardware counter limits for different wait count types.
Instruction set architecture version.