LLVM 23.0.0git
SIInsertWaitcnts.cpp
Go to the documentation of this file.
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "AMDGPUHWEvents.h"
28#include "AMDGPUWaitcntUtils.h"
29#include "GCNSubtarget.h"
33#include "llvm/ADT/MapVector.h"
35#include "llvm/ADT/Sequence.h"
41#include "llvm/IR/Dominators.h"
44
45using namespace llvm;
46
48
49#define DEBUG_TYPE "si-insert-waitcnts"
50
51static cl::opt<bool>
52 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
53 cl::desc("Force all waitcnt instrs to be emitted as "
54 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
55 cl::init(false), cl::Hidden);
56
58 "amdgpu-waitcnt-load-forcezero",
59 cl::desc("Force all waitcnt load counters to wait until 0"),
60 cl::init(false), cl::Hidden);
61
63 "amdgpu-expert-scheduling-mode",
64 cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
65 cl::init(false), cl::Hidden);
66
67namespace {
68
/// Invokes \p EmitWaitcnt with a descending run of counter values.
///
/// The callback is called once for every value from (Outstanding - 1) down
/// to, but not including, \p Target, and then exactly once more with
/// \p Target itself. If \p Outstanding is zero, or (Outstanding - 1) does not
/// exceed \p Target, only the final call with \p Target is made.
template <typename EmitWaitcntFn>
static void EmitExpandedWaitcnt(unsigned Outstanding, unsigned Target,
                                EmitWaitcntFn &&EmitWaitcnt) {
  // Handle the "nothing outstanding" case up front so that Outstanding - 1
  // can never wrap around below.
  if (Outstanding != 0) {
    unsigned Count = Outstanding - 1;
    while (Count > Target) {
      EmitWaitcnt(Count);
      --Count;
    }
  }
  EmitWaitcnt(Target);
}
77
78/// Integer IDs used to track vector memory locations we may have to wait on.
79/// Encoded as u16 chunks:
80///
81/// [0, REGUNITS_END ): MCRegUnit
82/// [LDSDMA_BEGIN, LDSDMA_END ) : LDS DMA IDs
83///
84/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
85/// It gives (1 << 16) entries per category which is more than enough
86/// for all register units. MCPhysReg is u16 so we don't even support >u16
87/// physical register numbers at this time, let alone >u16 register units.
88/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
89/// is enough for all register units.
/// Integer type of the IDs used to track vector memory locations.
using VMEMID = uint32_t;

/// Layout of the VMEMID space: two adjacent ranges of equal length.
enum : VMEMID {
  // Number of IDs reserved for each tracked category.
  TRACKINGID_RANGE_LEN = 0x10000,

  // Register units occupy the first range. Important: they must always start
  // at 0, because MCRegUnits and VMEMIDs are converted to one another freely.
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,

  // LDS DMA IDs follow directly after the register units. LDSDMA_BEGIN itself
  // is the "common" entry, which is updated for all LDS DMA operations
  // encountered; specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_BEGIN = REGUNITS_END,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};
107
108/// Convert a MCRegUnit to a VMEMID.
109static constexpr VMEMID toVMEMID(MCRegUnit RU) {
110 return static_cast<unsigned>(RU);
111}
112
113} // namespace
114
115namespace {
116
117// Maps values of InstCounterType to the instruction that waits on that
118// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
119// returns true, and does not cover VA_VDST or VM_VSRC.
120static const unsigned
121 instrsForExtendedCounterTypes[AMDGPU::NUM_EXTENDED_INST_CNTS] = {
122 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT,
123 AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT,
124 AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
125 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT,
126 AMDGPU::S_WAIT_ASYNCCNT, AMDGPU::S_WAIT_TENSORCNT};
127
128// ASYNCMARK and WAIT_ASYNCMARK are meta instructions that emit no hardware
129// code but still need to be processed by this pass for async vmcnt tracking.
130static bool isNonWaitcntMetaInst(const MachineInstr &MI) {
131 switch (MI.getOpcode()) {
132 case AMDGPU::ASYNCMARK:
133 case AMDGPU::WAIT_ASYNCMARK:
134 return false;
135 default:
136 return MI.isMetaInstruction();
137 }
138}
139
140static bool updateVMCntOnly(const MachineInstr &Inst) {
141 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
143}
144
145#ifndef NDEBUG
146static bool isNormalMode(AMDGPU::InstCounterType MaxCounter) {
147 return MaxCounter == AMDGPU::NUM_NORMAL_INST_CNTS;
148}
149#endif // NDEBUG
150
151class WaitcntBrackets;
152
153// This abstracts the logic for generating and updating S_WAIT* instructions
154// away from the analysis that determines where they are needed. This was
155// done because the set of counters and instructions for waiting on them
156// underwent a major shift with gfx12, sufficiently so that having this
157// abstraction allows the main analysis logic to be simpler than it would
158// otherwise have had to become.
159class WaitcntGenerator {
160protected:
161 const GCNSubtarget &ST;
162 const SIInstrInfo &TII;
163 AMDGPU::IsaVersion IV;
164 AMDGPU::InstCounterType MaxCounter;
165 bool OptNone;
166 bool ExpandWaitcntProfiling = false;
167 const AMDGPU::HardwareLimits &Limits;
168
169public:
170 WaitcntGenerator() = delete;
171 WaitcntGenerator(const WaitcntGenerator &) = delete;
172 WaitcntGenerator(const MachineFunction &MF,
173 AMDGPU::InstCounterType MaxCounter,
174 const AMDGPU::HardwareLimits &Limits)
175 : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
176 IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter),
177 OptNone(MF.getFunction().hasOptNone() ||
178 MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
179 ExpandWaitcntProfiling(
180 MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
181 Limits(Limits) {}
182
183 // Return true if the current function should be compiled with no
184 // optimization.
185 bool isOptNone() const { return OptNone; }
186
187 unsigned getLimit(AMDGPU::InstCounterType E) const { return Limits.get(E); }
188
189 // Edits an existing sequence of wait count instructions according
190 // to an incoming Waitcnt value, which is itself updated to reflect
191 // any new wait count instructions which may need to be generated by
192 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
193 // were made.
194 //
195 // This editing will usually be merely updated operands, but it may also
196 // delete instructions if the incoming Wait value indicates they are not
197 // needed. It may also remove existing instructions for which a wait
198 // is needed if it can be determined that it is better to generate new
199 // instructions later, as can happen on gfx12.
200 virtual bool
201 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
202 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
204
205 // Transform a soft waitcnt into a normal one.
206 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
207
208 // Generates new wait count instructions according to the value of
209 // Wait, returning true if any new instructions were created.
210 // ScoreBrackets is used for profiling expansion.
211 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
213 AMDGPU::Waitcnt Wait,
214 const WaitcntBrackets &ScoreBrackets) = 0;
215
216 // Returns the set of HWEvents that corresponds to counter \p T.
217 virtual HWEvents getWaitEvents(AMDGPU::InstCounterType T) const = 0;
218
219 /// \returns the counter that corresponds to event \p E.
220 AMDGPU::InstCounterType getCounterFromEvent(HWEvents E) const {
221 assert(E.size() == 1 && "Cannot handle a mask of events!");
222 for (auto T : AMDGPU::inst_counter_types()) {
223 if (getWaitEvents(T) & E)
224 return T;
225 }
226 llvm_unreachable("event type has no associated counter");
227 }
228
229 // Returns a new waitcnt with all counters except VScnt set to 0. If
230 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
231 // AsyncCnt and TensorCnt always default to ~0u (don't wait for it). They
232 // are only updated when a call to @llvm.amdgcn.wait.asyncmark() is
233 // processed.
234 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
235
236 virtual ~WaitcntGenerator() = default;
237};
238
239class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
240 static constexpr const HWEvents
241 WaitEventMaskForInstPreGFX12[AMDGPU::NUM_INST_CNTS] = {
242 HWEvents::VMEM_READ_ACCESS | HWEvents::VMEM_SAMPLER_READ_ACCESS |
243 HWEvents::VMEM_BVH_READ_ACCESS,
244 HWEvents::SMEM_ACCESS | HWEvents::LDS_ACCESS | HWEvents::GDS_ACCESS |
245 HWEvents::SQ_MESSAGE,
246 HWEvents::EXP_GPR_LOCK | HWEvents::GDS_GPR_LOCK |
247 HWEvents::VMW_GPR_LOCK | HWEvents::EXP_PARAM_ACCESS |
248 HWEvents::EXP_POS_ACCESS | HWEvents::EXP_LDS_ACCESS,
249 HWEvents::VMEM_WRITE_ACCESS | HWEvents::SCRATCH_WRITE_ACCESS,
258
259public:
260 using WaitcntGenerator::WaitcntGenerator;
261 bool
262 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
263 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
264 MachineBasicBlock::instr_iterator It) const override;
265
266 bool createNewWaitcnt(MachineBasicBlock &Block,
268 AMDGPU::Waitcnt Wait,
269 const WaitcntBrackets &ScoreBrackets) override;
270
271 HWEvents getWaitEvents(AMDGPU::InstCounterType T) const override {
272 HWEvents EVs = WaitEventMaskForInstPreGFX12[T];
273 if (T == AMDGPU::LOAD_CNT && !ST.hasVscnt())
274 EVs |= WaitEventMaskForInstPreGFX12[AMDGPU::STORE_CNT];
275 return EVs;
276 }
277
278 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
279};
280
281class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
282protected:
283 bool IsExpertMode;
284 static constexpr const HWEvents
285 WaitEventMaskForInstGFX12Plus[AMDGPU::NUM_INST_CNTS] = {
286 HWEvents::VMEM_READ_ACCESS | HWEvents::GLOBAL_INV_ACCESS,
287 HWEvents::LDS_ACCESS | HWEvents::GDS_ACCESS,
288 HWEvents::EXP_GPR_LOCK | HWEvents::GDS_GPR_LOCK |
289 HWEvents::VMW_GPR_LOCK | HWEvents::EXP_PARAM_ACCESS |
290 HWEvents::EXP_POS_ACCESS | HWEvents::EXP_LDS_ACCESS,
291
292 HWEvents::VMEM_WRITE_ACCESS | HWEvents::SCRATCH_WRITE_ACCESS,
293 HWEvents::VMEM_SAMPLER_READ_ACCESS,
294 HWEvents::VMEM_BVH_READ_ACCESS,
295
296 HWEvents::SMEM_ACCESS | HWEvents::SQ_MESSAGE | HWEvents::SCC_WRITE,
297 HWEvents::VMEM_GROUP | HWEvents::SMEM_GROUP,
298 HWEvents::ASYNC_ACCESS,
299 HWEvents::TENSOR_ACCESS,
300 HWEvents::VGPR_CSMACC_WRITE | HWEvents::VGPR_DPMACC_WRITE |
301 HWEvents::VGPR_TRANS_WRITE | HWEvents::VGPR_XDL_WRITE,
302 HWEvents::VGPR_LDS_READ | HWEvents::VGPR_FLAT_READ |
303 HWEvents::VGPR_VMEM_READ};
304
305public:
306 WaitcntGeneratorGFX12Plus() = delete;
307 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
308 AMDGPU::InstCounterType MaxCounter,
309 const AMDGPU::HardwareLimits &Limits,
310 bool IsExpertMode)
311 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
312
313 bool
314 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
315 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
316 MachineBasicBlock::instr_iterator It) const override;
317
318 bool createNewWaitcnt(MachineBasicBlock &Block,
320 AMDGPU::Waitcnt Wait,
321 const WaitcntBrackets &ScoreBrackets) override;
322
323 HWEvents getWaitEvents(AMDGPU::InstCounterType T) const override {
324 return WaitEventMaskForInstGFX12Plus[T];
325 }
326
327 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
328};
329
// Per-counter flags saying which counters should be flushed in a loop
// preheader. Both flags default to false.
struct PreheaderFlushFlags {
  bool FlushVmCnt{false};
  bool FlushDsCnt{false};
};
335
336class SIInsertWaitcnts {
337 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
338 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
339 MachineLoopInfo &MLI;
340 MachinePostDominatorTree &PDT;
341 AliasAnalysis *AA = nullptr;
342 MachineFunction &MF;
343
344 struct BlockInfo {
345 std::unique_ptr<WaitcntBrackets> Incoming;
346 bool Dirty = true;
347 BlockInfo() = default;
348 BlockInfo(BlockInfo &&) = default;
349 BlockInfo &operator=(BlockInfo &&) = default;
350 ~BlockInfo();
351 };
352
353 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
354
355 bool ForceEmitWaitcnt[AMDGPU::NUM_INST_CNTS] = {};
356
357 std::unique_ptr<WaitcntGenerator> WCG;
358
359 // Remember call and return instructions in the function.
360 DenseSet<MachineInstr *> CallInsts;
361 DenseSet<MachineInstr *> ReturnInsts;
362
363 // Remember all S_ENDPGM instructions. The boolean flag is true if there might
364 // be outstanding stores but definitely no outstanding scratch stores, to help
365 // with insertion of DEALLOC_VGPRS messages.
366 DenseMap<MachineInstr *, bool> EndPgmInsts;
367
368 AMDGPU::HardwareLimits Limits;
369
370public:
371 const GCNSubtarget &ST;
372 const SIInstrInfo &TII;
373 const SIRegisterInfo &TRI;
374 const MachineRegisterInfo &MRI;
375 AMDGPU::InstCounterType SmemAccessCounter;
376 AMDGPU::InstCounterType MaxCounter;
377 bool IsExpertMode = false;
378
379 SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
380 AliasAnalysis *AA, MachineFunction &MF)
381 : MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
382 TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
383 MRI(MF.getRegInfo()) {}
384
385 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
386
387 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
388 const WaitcntBrackets &Brackets);
389 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
390 const WaitcntBrackets &ScoreBrackets);
391 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
392 bool isDSRead(const MachineInstr &MI) const;
393 bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
394 bool run();
395
396 bool isAsync(const MachineInstr &MI) const {
398 return false;
400 return true;
401 const MachineOperand *Async =
402 TII.getNamedOperand(MI, AMDGPU::OpName::IsAsync);
403 return Async && (Async->getImm());
404 }
405
406 bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
407 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
408 }
409
410 bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
411 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
412 }
413
414 bool shouldUpdateAsyncMark(const MachineInstr &MI,
417 return T == AMDGPU::TENSOR_CNT;
418 if (!isAsyncLdsDmaWrite(MI))
419 return false;
421 return T == AMDGPU::ASYNC_CNT;
422 return T == AMDGPU::LOAD_CNT;
423 }
424
425 bool isVmemAccess(const MachineInstr &MI) const;
426 bool generateWaitcntInstBefore(MachineInstr &MI,
427 WaitcntBrackets &ScoreBrackets,
428 MachineInstr *OldWaitcntInstr,
429 PreheaderFlushFlags FlushFlags);
430 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
432 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
433 MachineInstr *OldWaitcntInstr);
434 void updateEventWaitcntAfter(MachineInstr &Inst,
435 WaitcntBrackets *ScoreBrackets);
436 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
437 MachineBasicBlock *Block) const;
438 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
439 WaitcntBrackets &ScoreBrackets);
440 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
441 WaitcntBrackets &ScoreBrackets);
442 /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
443 /// Legalizer. Returns true if block was modified.
444 bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
445 void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
446 bool ExpertMode) const;
447 HWEvents getWaitEvents(AMDGPU::InstCounterType T) const {
448 return WCG->getWaitEvents(T);
449 }
450 AMDGPU::InstCounterType getCounterFromEvent(HWEvents E) const {
451 return WCG->getCounterFromEvent(E);
452 }
453};
454
455// This object maintains the current score brackets of each wait counter, and
456// a per-register scoreboard for each wait counter.
457//
458// We also maintain the latest score for every event type that can change the
459// waitcnt in order to know if there are multiple types of events within
460// the brackets. When multiple types of event happen in the bracket,
461// wait count may get decreased out of order, therefore we need to put in
462// "s_waitcnt 0" before use.
463class WaitcntBrackets {
464public:
465 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
466 assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
467 }
468
469#ifndef NDEBUG
470 ~WaitcntBrackets() {
471 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
472 for (auto &[ID, Val] : VMem) {
473 if (Val.empty())
474 ++NumUnusedVmem;
475 }
476 for (auto &[ID, Val] : SGPRs) {
477 if (Val.empty())
478 ++NumUnusedSGPRs;
479 }
480
481 if (NumUnusedVmem || NumUnusedSGPRs) {
482 errs() << "WaitcntBracket had unused entries at destruction time: "
483 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
484 << " SGPR unused entries\n";
485 std::abort();
486 }
487 }
488#endif
489
490 bool isSmemCounter(AMDGPU::InstCounterType T) const {
491 return T == Context->SmemAccessCounter || T == AMDGPU::X_CNT;
492 }
493
494 unsigned getOutstanding(AMDGPU::InstCounterType T) const {
495 return ScoreUBs[T] - ScoreLBs[T];
496 }
497
498 bool hasPendingVMEM(VMEMID ID, AMDGPU::InstCounterType T) const {
499 return getVMemScore(ID, T) > getScoreLB(T);
500 }
501
502 /// \returns true if we have no score entries for counter \p T.
503 bool empty(AMDGPU::InstCounterType T) const { return getScoreRange(T) == 0; }
504
505private:
506 unsigned getScoreLB(AMDGPU::InstCounterType T) const {
508 return ScoreLBs[T];
509 }
510
511 unsigned getScoreUB(AMDGPU::InstCounterType T) const {
513 return ScoreUBs[T];
514 }
515
516 unsigned getScoreRange(AMDGPU::InstCounterType T) const {
517 return getScoreUB(T) - getScoreLB(T);
518 }
519
520 unsigned getSGPRScore(MCRegUnit RU, AMDGPU::InstCounterType T) const {
521 auto It = SGPRs.find(RU);
522 return It != SGPRs.end() ? It->second.get(T) : 0;
523 }
524
525 unsigned getVMemScore(VMEMID TID, AMDGPU::InstCounterType T) const {
526 auto It = VMem.find(TID);
527 return It != VMem.end() ? It->second.Scores[T] : 0;
528 }
529
530public:
531 bool merge(const WaitcntBrackets &Other);
532
533 bool counterOutOfOrder(AMDGPU::InstCounterType T) const;
534 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
535 simplifyWaitcnt(Wait, Wait);
536 }
537 void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
538 AMDGPU::Waitcnt &UpdateWait) const;
539 void simplifyWaitcnt(AMDGPU::InstCounterType T, unsigned &Count) const;
540 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T) const;
541 void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
542 AMDGPU::Waitcnt &UpdateWait) const;
543 void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
544 AMDGPU::Waitcnt &UpdateWait) const;
545
546 void determineWaitForPhysReg(AMDGPU::InstCounterType T, MCPhysReg Reg,
547 AMDGPU::Waitcnt &Wait,
548 const MachineInstr &MI) const;
549 MCPhysReg determineVGPR16Dependency(const MachineInstr &MI,
551 MCPhysReg Reg) const;
552 void determineWaitForLDSDMA(AMDGPU::InstCounterType T, VMEMID TID,
553 AMDGPU::Waitcnt &Wait) const;
554 AMDGPU::Waitcnt determineAsyncWait(unsigned N);
555 void tryClearSCCWriteEvent(MachineInstr *Inst);
556
557 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
558 void applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count);
559 void applyWaitcnt(const AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T);
560 void updateByEvent(HWEvents E, MachineInstr &MI);
561 void recordAsyncMark(MachineInstr &MI);
562
563 HWEvents getPendingEvents() const { return PendingEvents; }
564 bool hasPendingEvent() const { return PendingEvents.any(); }
565 bool hasPendingEvent(HWEvents E) const { return PendingEvents.contains(E); }
566 bool hasPendingEvent(AMDGPU::InstCounterType T) const {
567 bool HasPending = (PendingEvents & Context->getWaitEvents(T)).any();
568 assert(HasPending == !empty(T) &&
569 "Expected pending events iff scoreboard is not empty");
570 return HasPending;
571 }
572
573 bool hasMixedPendingEvents(AMDGPU::InstCounterType T) const {
574 HWEvents Events = PendingEvents & Context->getWaitEvents(T);
575 // Return true if more than one bit is set in Events.
576 return Events.size() > 1;
577 }
578
579 bool hasPendingFlat() const {
580 return ((LastFlatDsCnt > ScoreLBs[AMDGPU::DS_CNT] &&
581 LastFlatDsCnt <= ScoreUBs[AMDGPU::DS_CNT]) ||
582 (LastFlatLoadCnt > ScoreLBs[AMDGPU::LOAD_CNT] &&
583 LastFlatLoadCnt <= ScoreUBs[AMDGPU::LOAD_CNT]));
584 }
585
586 void setPendingFlat() {
587 LastFlatLoadCnt = ScoreUBs[AMDGPU::LOAD_CNT];
588 LastFlatDsCnt = ScoreUBs[AMDGPU::DS_CNT];
589 }
590
591 bool hasPendingGDS() const {
592 return LastGDS > ScoreLBs[AMDGPU::DS_CNT] &&
593 LastGDS <= ScoreUBs[AMDGPU::DS_CNT];
594 }
595
596 unsigned getPendingGDSWait() const {
597 return std::min(getScoreUB(AMDGPU::DS_CNT) - LastGDS,
598 getLimit(AMDGPU::DS_CNT) - 1);
599 }
600
601 void setPendingGDS() { LastGDS = ScoreUBs[AMDGPU::DS_CNT]; }
602
603 // Return true if there might be pending writes to the vgpr-interval by VMEM
604 // instructions where the HWEvents in VGPRContext are not contained in E.
605 bool hasDifferentVGPRPendingEvents(MCPhysReg Reg, HWEvents E) const {
606 for (MCRegUnit RU : regunits(Reg)) {
607 auto It = VMem.find(toVMEMID(RU));
608 if (It != VMem.end() && (It->second.VGPRPendingEvents & ~E).any())
609 return true;
610 }
611 return false;
612 }
613
614 void clearVGPRPendingEvents(MCPhysReg Reg) {
615 for (MCRegUnit RU : regunits(Reg)) {
616 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
617 It->second.VGPRPendingEvents = HWEvents::NONE;
618 if (It->second.empty())
619 VMem.erase(It);
620 }
621 }
622 }
623
624 void setStateOnFunctionEntryOrReturn() {
625 setScoreUB(AMDGPU::STORE_CNT,
626 getScoreUB(AMDGPU::STORE_CNT) + getLimit(AMDGPU::STORE_CNT));
627 PendingEvents |= Context->getWaitEvents(AMDGPU::STORE_CNT);
628 }
629
630 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
631 return LDSDMAStores;
632 }
633
634 bool hasPointSampleAccel(const MachineInstr &MI) const;
635 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
636 MCPhysReg RU) const;
637
638 void print(raw_ostream &) const;
639 void dump() const { print(dbgs()); }
640
641 // Free up memory by removing empty entries from the DenseMaps that track
642 // event scores.
643 void purgeEmptyTrackingData();
644
645private:
646 unsigned getLimit(AMDGPU::InstCounterType T) const {
647 return Context->getLimits().get(T);
648 }
649
650 struct MergeInfo {
651 unsigned OldLB;
652 unsigned OtherLB;
653 unsigned MyShift;
654 unsigned OtherShift;
655 };
656
657 using CounterValueArray = std::array<unsigned, AMDGPU::NUM_INST_CNTS>;
658
659 void determineWaitForScore(AMDGPU::InstCounterType T, unsigned Score,
660 AMDGPU::Waitcnt &Wait) const;
661
662 static bool mergeScore(const MergeInfo &M, unsigned &Score,
663 unsigned OtherScore);
664 bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
665 ArrayRef<CounterValueArray> OtherMarks);
666
668 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
669 if (!Context->TRI.isInAllocatableClass(Reg))
670 return {{}, {}};
671 return Context->TRI.regunits(Reg);
672 }
673
674 void setScoreLB(AMDGPU::InstCounterType T, unsigned Val) {
676 ScoreLBs[T] = Val;
677 }
678
679 void setScoreUB(AMDGPU::InstCounterType T, unsigned Val) {
681 ScoreUBs[T] = Val;
682
683 if (T != AMDGPU::EXP_CNT)
684 return;
685
686 if (getScoreRange(AMDGPU::EXP_CNT) > getLimit(AMDGPU::EXP_CNT))
687 ScoreLBs[AMDGPU::EXP_CNT] =
688 ScoreUBs[AMDGPU::EXP_CNT] - getLimit(AMDGPU::EXP_CNT);
689 }
690
691 void setRegScore(MCPhysReg Reg, AMDGPU::InstCounterType T, unsigned Val) {
692 const SIRegisterInfo &TRI = Context->TRI;
693 if (Reg == AMDGPU::SCC) {
694 SCCScore = Val;
695 } else if (TRI.isVectorRegister(Context->MRI, Reg)) {
696 for (MCRegUnit RU : regunits(Reg))
697 VMem[toVMEMID(RU)].Scores[T] = Val;
698 } else if (TRI.isSGPRReg(Context->MRI, Reg)) {
699 for (MCRegUnit RU : regunits(Reg))
700 SGPRs[RU].get(T) = Val;
701 } else {
702 llvm_unreachable("Register cannot be tracked/unknown register!");
703 }
704 }
705
706 void setVMemScore(VMEMID TID, AMDGPU::InstCounterType T, unsigned Val) {
707 VMem[TID].Scores[T] = Val;
708 }
709
710 void setScoreByOperand(const MachineOperand &Op,
711 AMDGPU::InstCounterType CntTy, unsigned Val);
712
713 const SIInsertWaitcnts *Context;
714
715 unsigned ScoreLBs[AMDGPU::NUM_INST_CNTS] = {0};
716 unsigned ScoreUBs[AMDGPU::NUM_INST_CNTS] = {0};
717 HWEvents PendingEvents;
718 // Remember the last flat memory operation.
719 unsigned LastFlatDsCnt = 0;
720 unsigned LastFlatLoadCnt = 0;
721 // Remember the last GDS operation.
722 unsigned LastGDS = 0;
723
724 // The score tracking logic is fragmented as follows:
725 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
726 // - SGPRs: SGPR RegUnits
727 // - SCC: Non-allocatable and not general purpose: not a SGPR.
728 //
729 // For the VMem case, if the key is within the range of LDS DMA IDs,
730 // then the corresponding index into the `LDSDMAStores` vector below is:
731 // Key - LDSDMA_BEGIN - 1
732 // This is because LDSDMA_BEGIN is a generic entry and does not have an
733 // associated MachineInstr.
734 //
735 // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?
736
737 struct VMEMInfo {
738 // Scores for all instruction counters. Zero-initialized.
739 CounterValueArray Scores{};
740 // For VGPRs, we need to track an additional fine-grained set of pending
741 // events.
742 HWEvents VGPRPendingEvents;
743
744 bool empty() const {
745 return all_of(Scores, equal_to(0)) && !VGPRPendingEvents;
746 }
747 };
748
749 /// Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
750 /// pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
751 class SGPRInfo {
752 /// Either DS_CNT or KM_CNT score.
753 unsigned ScoreDsKmCnt = 0;
754 unsigned ScoreXCnt = 0;
755
756 public:
757 unsigned get(AMDGPU::InstCounterType T) const {
758 assert(
759 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
760 "Invalid counter");
761 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
762 }
763 unsigned &get(AMDGPU::InstCounterType T) {
764 assert(
765 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
766 "Invalid counter");
767 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
768 }
769
770 bool empty() const { return !ScoreDsKmCnt && !ScoreXCnt; }
771 };
772
773 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
774 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
775
776 // Reg score for SCC.
777 unsigned SCCScore = 0;
778 // The unique instruction that has an SCC write pending, if there is one.
779 const MachineInstr *PendingSCCWrite = nullptr;
780
781 // Store representative LDS DMA operations. The only useful info here is
782 // alias info. One store is kept per unique AAInfo.
783 SmallVector<const MachineInstr *> LDSDMAStores;
784
785 // State of all counters at each async mark encountered so far.
787
788 // But in the rare pathological case, a nest of loops that pushes marks
789 // without waiting on any mark can cause AsyncMarks to grow very large. We cap
790 // it to a reasonable limit. We can tune this later or potentially introduce a
791 // user option to control the value.
792 static constexpr unsigned MaxAsyncMarks = 16;
793
794 // Track the upper bound score for async operations that are not part of a
795 // mark yet. Initialized to all zeros.
796 CounterValueArray AsyncScore{};
797};
798
799SIInsertWaitcnts::BlockInfo::~BlockInfo() = default;
800
801class SIInsertWaitcntsLegacy : public MachineFunctionPass {
802public:
803 static char ID;
804 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
805
806 bool runOnMachineFunction(MachineFunction &MF) override;
807
808 StringRef getPassName() const override {
809 return "SI insert wait instructions";
810 }
811
812 void getAnalysisUsage(AnalysisUsage &AU) const override {
813 AU.setPreservesCFG();
814 AU.addRequired<MachineLoopInfoWrapperPass>();
815 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
816 AU.addUsedIfAvailable<AAResultsWrapperPass>();
817 AU.addPreserved<AAResultsWrapperPass>();
819 }
820};
821
822} // end anonymous namespace
823
824void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
826 unsigned Score) {
827 setRegScore(Op.getReg().asMCReg(), CntTy, Score);
828}
829
830// Return true if the subtarget is one that enables Point Sample Acceleration
831// and the MachineInstr passed in is one to which it might be applied (the
832// hardware makes this decision based on several factors, but we can't determine
833// this at compile time, so we have to assume it might be applied if the
834// instruction supports it).
835bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
836 if (!Context->ST.hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
837 return false;
838
839 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
840 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
842 return BaseInfo->PointSampleAccel;
843}
844
845// Return true if the subtarget enables Point Sample Acceleration, the supplied
846// MachineInstr is one to which it might be applied and the supplied interval is
847// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
848// (this is the type that a point sample accelerated instruction effectively
849// becomes)
850bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
851 MCPhysReg Reg) const {
852 if (!hasPointSampleAccel(MI))
853 return false;
854
855 return hasDifferentVGPRPendingEvents(Reg, HWEvents::VMEM_READ_ACCESS);
856}
857
858void WaitcntBrackets::updateByEvent(HWEvents E, MachineInstr &Inst) {
859 assert(E.size() == 1 && "Expected singular event!");
860 AMDGPU::InstCounterType T = Context->getCounterFromEvent(E);
861 assert(T < Context->MaxCounter);
862
863 unsigned UB = getScoreUB(T);
864 unsigned Increment = 1;
866 Context->ST.hasVOP3PX2IncrementsVaVdstTwice()) {
867 // V_WMMA_SCALE instructions use VOP3PX2 encoding. Hardware treats this as
868 // two VOP3P instructions and increments VA_VDST twice.
869 Increment = 2;
870 }
871 unsigned CurrScore = UB + Increment;
872 if (CurrScore == 0)
873 report_fatal_error("InsertWaitcnt score wraparound");
874 // PendingEvents and ScoreUB need to be updated regardless of whether this
875 // event changes the score of a register or not.
876 // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
877 PendingEvents |= E;
878 setScoreUB(T, CurrScore);
879
880 const SIRegisterInfo &TRI = Context->TRI;
881 const MachineRegisterInfo &MRI = Context->MRI;
882 const SIInstrInfo &TII = Context->TII;
883
884 if (T == AMDGPU::EXP_CNT) {
885 // Put score on the source vgprs. If this is a store, just use those
886 // specific register(s).
887 if (TII.isDS(Inst) && Inst.mayLoadOrStore()) {
888 // All GDS operations must protect their address register (same as
889 // export.)
890 if (const auto *AddrOp = TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
891 setScoreByOperand(*AddrOp, AMDGPU::EXP_CNT, CurrScore);
892
893 if (Inst.mayStore()) {
894 if (const auto *Data0 =
895 TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
896 setScoreByOperand(*Data0, AMDGPU::EXP_CNT, CurrScore);
897 if (const auto *Data1 =
898 TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
899 setScoreByOperand(*Data1, AMDGPU::EXP_CNT, CurrScore);
900 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
901 Inst.getOpcode() != AMDGPU::DS_APPEND &&
902 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
903 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
904 for (const MachineOperand &Op : Inst.all_uses()) {
905 if (TRI.isVectorRegister(MRI, Op.getReg()))
906 setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
907 }
908 }
909 } else if (TII.isFLAT(Inst)) {
910 if (Inst.mayStore()) {
911 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
912 AMDGPU::EXP_CNT, CurrScore);
913 } else if (SIInstrInfo::isAtomicRet(Inst)) {
914 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
915 AMDGPU::EXP_CNT, CurrScore);
916 }
917 } else if (TII.isMIMG(Inst)) {
918 if (Inst.mayStore()) {
919 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
920 } else if (SIInstrInfo::isAtomicRet(Inst)) {
921 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
922 AMDGPU::EXP_CNT, CurrScore);
923 }
924 } else if (TII.isMTBUF(Inst)) {
925 if (Inst.mayStore())
926 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
927 } else if (TII.isMUBUF(Inst)) {
928 if (Inst.mayStore()) {
929 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
930 } else if (SIInstrInfo::isAtomicRet(Inst)) {
931 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
932 AMDGPU::EXP_CNT, CurrScore);
933 }
934 } else if (TII.isLDSDIR(Inst)) {
935 // LDSDIR instructions attach the score to the destination.
936 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
937 AMDGPU::EXP_CNT, CurrScore);
938 } else {
939 if (TII.isEXP(Inst)) {
940 // For export the destination registers are really temps that
941 // can be used as the actual source after export patching, so
942 // we need to treat them like sources and set the EXP_CNT
943 // score.
944 for (MachineOperand &DefMO : Inst.all_defs()) {
945 if (TRI.isVGPR(MRI, DefMO.getReg())) {
946 setScoreByOperand(DefMO, AMDGPU::EXP_CNT, CurrScore);
947 }
948 }
949 }
950 for (const MachineOperand &Op : Inst.all_uses()) {
951 if (TRI.isVectorRegister(MRI, Op.getReg()))
952 setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
953 }
954 }
955 } else if (T == AMDGPU::X_CNT) {
956 HWEvents OtherEvent =
957 E == HWEvents::SMEM_GROUP ? HWEvents::VMEM_GROUP : HWEvents::SMEM_GROUP;
958 if (PendingEvents.contains(OtherEvent)) {
959 // Hardware inserts an implicit xcnt between interleaved
960 // SMEM and VMEM operations. So there will never be
961 // outstanding address translations for both SMEM and
962 // VMEM at the same time.
963 setScoreLB(T, getScoreUB(T) - 1);
964 PendingEvents -= OtherEvent;
965 }
966 for (const MachineOperand &Op : Inst.all_uses())
967 setScoreByOperand(Op, T, CurrScore);
968 } else if (T == AMDGPU::VA_VDST || T == AMDGPU::VM_VSRC) {
969 // Match the score to the VGPR destination or source registers as
970 // appropriate
971 for (const MachineOperand &Op : Inst.operands()) {
972 if (!Op.isReg() || (T == AMDGPU::VA_VDST && Op.isUse()) ||
973 (T == AMDGPU::VM_VSRC && Op.isDef()))
974 continue;
975 if (TRI.isVectorRegister(Context->MRI, Op.getReg()))
976 setScoreByOperand(Op, T, CurrScore);
977 }
978 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
979 // Match the score to the destination registers.
980 //
981 // Check only explicit operands. Stores, especially spill stores, include
982 // implicit uses and defs of their super registers which would create an
983 // artificial dependency, while these are there only for register liveness
984 // accounting purposes.
985 //
986 // Special cases where implicit register defs exists, such as M0 or VCC,
987 // but none with memory instructions.
988 for (const MachineOperand &Op : Inst.defs()) {
989 if (T == AMDGPU::LOAD_CNT || T == AMDGPU::SAMPLE_CNT ||
990 T == AMDGPU::BVH_CNT) {
991 if (!TRI.isVectorRegister(MRI, Op.getReg())) // TODO: add wrapper
992 continue;
993 if (updateVMCntOnly(Inst)) {
994 // updateVMCntOnly should only leave us with VGPRs
995 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
996 // defs.
997 assert(TRI.isVectorRegister(MRI, Op.getReg()));
998 HWEvents VGPRContext =
1000 // If instruction can have Point Sample Accel applied, we have to flag
1001 // this with another potential dependency
1002 if (hasPointSampleAccel(Inst))
1003 VGPRContext |= HWEvents::VMEM_READ_ACCESS;
1004 for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
1005 VMem[toVMEMID(RU)].VGPRPendingEvents |= VGPRContext;
1006 }
1007 }
1008 setScoreByOperand(Op, T, CurrScore);
1009 }
1010 if (Inst.mayStore() &&
1011 (TII.isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) {
1012 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1013 // written can be accessed. A load from LDS to VMEM does not need a wait.
1014 //
1015 // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1016 // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1017 // store. The "Slot" is the index into LDSDMAStores + 1.
1018 unsigned Slot = 0;
1019 for (const auto *MemOp : Inst.memoperands()) {
1020 if (!MemOp->isStore() ||
1021 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1022 continue;
1023 // Comparing just AA info does not guarantee memoperands are equal
1024 // in general, but this is so for LDS DMA in practice.
1025 auto AAI = MemOp->getAAInfo();
1026 // Alias scope information gives a way to definitely identify an
1027 // original memory object and practically produced in the module LDS
1028 // lowering pass. If there is no scope available we will not be able
1029 // to disambiguate LDS aliasing as after the module lowering all LDS
1030 // is squashed into a single big object.
1031 if (!AAI || !AAI.Scope)
1032 break;
1033 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1034 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1035 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1036 Slot = I + 1;
1037 break;
1038 }
1039 }
1040 }
1041 if (Slot)
1042 break;
1043 // The slot may not be valid because it can be >= NUM_LDSDMA which
1044 // means the scoreboard cannot track it. We still want to preserve the
1045 // MI in order to check alias information, though.
1046 LDSDMAStores.push_back(&Inst);
1047 Slot = LDSDMAStores.size();
1048 break;
1049 }
1050 setVMemScore(LDSDMA_BEGIN, T, CurrScore);
1051 if (Slot && Slot < NUM_LDSDMA)
1052 setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
1053 }
1054
1055 if (Context->shouldUpdateAsyncMark(Inst, T)) {
1056 AsyncScore[T] = CurrScore;
1057 }
1058
1060 setRegScore(AMDGPU::SCC, T, CurrScore);
1061 PendingSCCWrite = &Inst;
1062 }
1063 }
1064}
1065
1066void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1067 // In the absence of loops, AsyncMarks can grow linearly with the program
1068 // until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above a
1069 // limit every time we push a new mark, but that seems like unnecessary work
1070 // in practical cases. We do separately truncate the array when processing a
1071 // loop, which should be sufficient.
1072 AsyncMarks.push_back(AsyncScore);
1073 AsyncScore = {};
1074 LLVM_DEBUG({
1075 dbgs() << "recordAsyncMark:\n" << Inst;
1076 for (const auto &Mark : AsyncMarks) {
1077 llvm::interleaveComma(Mark, dbgs());
1078 dbgs() << '\n';
1079 }
1080 });
1081}
1082
/// Dump the state of the score brackets to \p OS.
///
/// For every counter up to Context->MaxCounter this prints the counter name
/// with its score range, followed by the relative score of each tracked
/// VMEM ID (and, for SMEM counters, each SGPR register unit) whose score is
/// still above the counter's lower bound. The pending events, the current
/// async score and all recorded async marks are printed afterwards.
1083void WaitcntBrackets::print(raw_ostream &OS) const {
1084 const GCNSubtarget &ST = Context->ST;
1085
1086 for (auto T : inst_counter_types(Context->MaxCounter)) {
1087 unsigned SR = getScoreRange(T);
// Counter names depend on whether the subtarget has extended wait counts.
1088 switch (T) {
1089 case AMDGPU::LOAD_CNT:
1090 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1091 << SR << "):";
1092 break;
1093 case AMDGPU::DS_CNT:
1094 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1095 << SR << "):";
1096 break;
1097 case AMDGPU::EXP_CNT:
1098 OS << " EXP_CNT(" << SR << "):";
1099 break;
1100 case AMDGPU::STORE_CNT:
1101 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1102 << SR << "):";
1103 break;
1104 case AMDGPU::SAMPLE_CNT:
1105 OS << " SAMPLE_CNT(" << SR << "):";
1106 break;
1107 case AMDGPU::BVH_CNT:
1108 OS << " BVH_CNT(" << SR << "):";
1109 break;
1110 case AMDGPU::KM_CNT:
1111 OS << " KM_CNT(" << SR << "):";
1112 break;
1113 case AMDGPU::X_CNT:
1114 OS << " X_CNT(" << SR << "):";
1115 break;
1116 case AMDGPU::ASYNC_CNT:
1117 OS << " ASYNC_CNT(" << SR << "):";
1118 break;
1119 case AMDGPU::VA_VDST:
1120 OS << " VA_VDST(" << SR << "): ";
1121 break;
1122 case AMDGPU::VM_VSRC:
1123 OS << " VM_VSRC(" << SR << "): ";
1124 break;
1125 default:
1126 OS << " UNKNOWN(" << SR << "):";
1127 break;
1128 }
1129
// Per-register scores are only printed when the score range is non-zero.
1130 if (SR != 0) {
1131 // Print vgpr scores.
1132 unsigned LB = getScoreLB(T);
1133
// Sort the keys so that the output order is deterministic.
1134 SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1135 sort(SortedVMEMIDs);
1136
1137 for (auto ID : SortedVMEMIDs) {
1138 unsigned RegScore = VMem.at(ID).Scores[T];
1139 if (RegScore <= LB)
1140 continue;
// Scores are printed relative to the lower bound, starting at 0.
1141 unsigned RelScore = RegScore - LB - 1;
1142 if (ID < REGUNITS_END) {
1143 OS << ' ' << RelScore << ":vRU" << ID;
1144 } else {
1145 assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1146 "Unhandled/unexpected ID value!");
1147 OS << ' ' << RelScore << ":LDSDMA" << ID;
1148 }
1149 }
1150
1151 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1152 if (isSmemCounter(T)) {
1153 SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1154 sort(SortedSMEMIDs);
1155 for (auto ID : SortedSMEMIDs) {
1156 unsigned RegScore = SGPRs.at(ID).get(T);
1157 if (RegScore <= LB)
1158 continue;
1159 unsigned RelScore = RegScore - LB - 1;
1160 OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
1161 }
1162 }
1163
// NOTE(review): unlike the register scores above, SCCScore is printed as-is
// rather than relative to LB -- confirm this is intended.
1164 if (T == AMDGPU::KM_CNT && SCCScore > 0)
1165 OS << ' ' << SCCScore << ":scc";
1166 }
1167 OS << '\n';
1168 }
1169
1170 OS << "Pending Events: ";
1171 if (hasPendingEvent()) {
1172 OS << getPendingEvents();
1173 } else {
1174 OS << "none";
1175 }
1176 OS << '\n';
1177
1178 OS << "Async score: ";
1179 if (AsyncScore.empty())
1180 OS << "none";
1181 else
1182 llvm::interleaveComma(AsyncScore, OS);
1183 OS << '\n';
1184
1185 OS << "Async marks: " << AsyncMarks.size() << '\n';
1186
// One output line per recorded mark, listing the marked score per counter.
1187 for (const auto &Mark : AsyncMarks) {
1188 for (auto T : AMDGPU::inst_counter_types()) {
1189 unsigned MarkedScore = Mark[T];
// NOTE(review): this switch has no VA_VDST/VM_VSRC cases, so if the loop
// visits those counters they are labelled UNKNOWN -- confirm intended.
1190 switch (T) {
1191 case AMDGPU::LOAD_CNT:
1192 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM")
1193 << "_CNT: " << MarkedScore;
1194 break;
1195 case AMDGPU::DS_CNT:
1196 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM")
1197 << "_CNT: " << MarkedScore;
1198 break;
1199 case AMDGPU::EXP_CNT:
1200 OS << " EXP_CNT: " << MarkedScore;
1201 break;
1202 case AMDGPU::STORE_CNT:
1203 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS")
1204 << "_CNT: " << MarkedScore;
1205 break;
1206 case AMDGPU::SAMPLE_CNT:
1207 OS << " SAMPLE_CNT: " << MarkedScore;
1208 break;
1209 case AMDGPU::BVH_CNT:
1210 OS << " BVH_CNT: " << MarkedScore;
1211 break;
1212 case AMDGPU::KM_CNT:
1213 OS << " KM_CNT: " << MarkedScore;
1214 break;
1215 case AMDGPU::X_CNT:
1216 OS << " X_CNT: " << MarkedScore;
1217 break;
1218 case AMDGPU::ASYNC_CNT:
1219 OS << " ASYNC_CNT: " << MarkedScore;
1220 break;
1221 default:
1222 OS << " UNKNOWN: " << MarkedScore;
1223 break;
1224 }
1225 }
1226 OS << '\n';
1227 }
1228 OS << '\n';
1229}
1230
1231/// Simplify \p UpdateWait by removing waits that are redundant based on the
1232/// current WaitcntBrackets and any other waits specified in \p CheckWait.
1233void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
1234 AMDGPU::Waitcnt &UpdateWait) const {
1235 simplifyWaitcnt(UpdateWait, AMDGPU::LOAD_CNT);
1236 simplifyWaitcnt(UpdateWait, AMDGPU::EXP_CNT);
1237 simplifyWaitcnt(UpdateWait, AMDGPU::DS_CNT);
1238 simplifyWaitcnt(UpdateWait, AMDGPU::STORE_CNT);
1239 simplifyWaitcnt(UpdateWait, AMDGPU::SAMPLE_CNT);
1240 simplifyWaitcnt(UpdateWait, AMDGPU::BVH_CNT);
1241 simplifyWaitcnt(UpdateWait, AMDGPU::KM_CNT);
1242 simplifyXcnt(CheckWait, UpdateWait);
1243 simplifyWaitcnt(UpdateWait, AMDGPU::VA_VDST);
1244 simplifyVmVsrc(CheckWait, UpdateWait);
1245 simplifyWaitcnt(UpdateWait, AMDGPU::ASYNC_CNT);
1246}
1247
1248void WaitcntBrackets::simplifyWaitcnt(AMDGPU::InstCounterType T,
1249 unsigned &Count) const {
1250 // The number of outstanding events for this type, T, can be calculated
1251 // as (UB - LB). If the current Count is greater than or equal to the number
1252 // of outstanding events, then the wait for this counter is redundant.
1253 if (Count >= getScoreRange(T))
1254 Count = ~0u;
1255}
1256
1257void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait,
1258 AMDGPU::InstCounterType T) const {
1259 unsigned Cnt = Wait.get(T);
1260 simplifyWaitcnt(T, Cnt);
1261 Wait.set(T, Cnt);
1262}
1263
1264void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
1265 AMDGPU::Waitcnt &UpdateWait) const {
1266 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1267 // optimizations. On entry to a block with multiple predescessors, there may
1268 // be pending SMEM and VMEM events active at the same time.
1269 // In such cases, only clear one active event at a time.
1270 // TODO: Revisit xcnt optimizations for gfx1250.
1271 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1272 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1273 // zero.
1274 if (CheckWait.get(AMDGPU::KM_CNT) == 0 &&
1275 hasPendingEvent(HWEvents::SMEM_GROUP))
1276 UpdateWait.set(AMDGPU::X_CNT, ~0u);
1277 // If we have pending store we cannot optimize XCnt because we do not wait for
1278 // stores. VMEM loads retun in order, so if we only have loads XCnt is
1279 // decremented to the same number as LOADCnt.
1280 if (CheckWait.get(AMDGPU::LOAD_CNT) != ~0u &&
1281 hasPendingEvent(HWEvents::VMEM_GROUP) &&
1282 !hasPendingEvent(AMDGPU::STORE_CNT) &&
1283 CheckWait.get(AMDGPU::X_CNT) >= CheckWait.get(AMDGPU::LOAD_CNT))
1284 UpdateWait.set(AMDGPU::X_CNT, ~0u);
1285 simplifyWaitcnt(UpdateWait, AMDGPU::X_CNT);
1286}
1287
1288void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1289 AMDGPU::Waitcnt &UpdateWait) const {
1290 // Waiting for some counters implies waiting for VM_VSRC, since an
1291 // instruction that decrements a counter on completion would have
1292 // decremented VM_VSRC once its VGPR operands had been read.
1293 if (CheckWait.get(AMDGPU::VM_VSRC) >=
1294 std::min({CheckWait.get(AMDGPU::LOAD_CNT),
1295 CheckWait.get(AMDGPU::STORE_CNT),
1296 CheckWait.get(AMDGPU::SAMPLE_CNT),
1297 CheckWait.get(AMDGPU::BVH_CNT), CheckWait.get(AMDGPU::DS_CNT)}))
1298 UpdateWait.set(AMDGPU::VM_VSRC, ~0u);
1299 simplifyWaitcnt(UpdateWait, AMDGPU::VM_VSRC);
1300}
1301
1302void WaitcntBrackets::purgeEmptyTrackingData() {
1303 VMem.remove_if([](const auto &P) { return P.second.empty(); });
1304 SGPRs.remove_if([](const auto &P) { return P.second.empty(); });
1305}
1306
1307void WaitcntBrackets::determineWaitForScore(AMDGPU::InstCounterType T,
1308 unsigned ScoreToWait,
1309 AMDGPU::Waitcnt &Wait) const {
1310 const unsigned LB = getScoreLB(T);
1311 const unsigned UB = getScoreUB(T);
1312
1313 // If the score falls within the bracket, we need a waitcnt.
1314 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1315 if ((T == AMDGPU::LOAD_CNT || T == AMDGPU::DS_CNT) && hasPendingFlat() &&
1316 !Context->ST.hasFlatLgkmVMemCountInOrder()) {
1317 // If there is a pending FLAT operation, and this is a VMem or LGKM
1318 // waitcnt and the target can report early completion, then we need
1319 // to force a waitcnt 0.
1320 Wait.add(T, 0);
1321 } else if (counterOutOfOrder(T)) {
1322 // Counter can get decremented out-of-order when there
1323 // are multiple types event in the bracket. Also emit an s_wait counter
1324 // with a conservative value of 0 for the counter.
1325 Wait.add(T, 0);
1326 } else {
1327 // If a counter has been maxed out avoid overflow by waiting for
1328 // MAX(CounterType) - 1 instead.
1329 unsigned NeededWait = std::min(UB - ScoreToWait, getLimit(T) - 1);
1330 Wait.add(T, NeededWait);
1331 }
1332 }
1333}
1334
/// Compute the waits required so that at most \p N of the most recently
/// recorded async marks remain outstanding, and discard the mark that is
/// waited on together with all older ones.
///
/// \returns an empty Waitcnt when no more than \p N marks are recorded
/// (after the possible adjustment of \p N described below).
1335AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
1336 LLVM_DEBUG({
1337 dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
1338 << ":\n";
1339 for (const auto &Mark : AsyncMarks) {
1340 llvm::interleaveComma(Mark, dbgs());
1341 dbgs() << '\n';
1342 }
1343 });
1344
1345 if (AsyncMarks.size() == MaxAsyncMarks) {
1346 // Enforcing MaxAsyncMarks here is unnecessary work because the size of
1347 // AsyncMarks is linear when traversing straightline code. But we do
1348 // need to check if truncation may have occurred at a merge, and adjust N
1349 // to ensure that a wait is generated.
1350 LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
1351 N = std::min(N, (unsigned)MaxAsyncMarks - 1);
1352 }
1353
1354 AMDGPU::Waitcnt Wait;
1355 if (AsyncMarks.size() <= N) {
1356 LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
1357 return Wait;
1358 }
1359
// The mark to wait for is the (N+1)-th most recent one.
1360 size_t MarkIndex = AsyncMarks.size() - N - 1;
1361 const auto &RequiredMark = AsyncMarks[MarkIndex];
// NOTE(review): listing line 1362 is not present in this excerpt; the call
// below is presumably the body of a loop over the counter types T -- confirm
// against the full file.
1363 determineWaitForScore(T, RequiredMark[T], Wait);
1364
1365 // Immediately remove the waited mark and all older ones
1366 // This happens BEFORE the wait is actually inserted, which is fine
1367 // because we've already extracted the wait requirements
1368 LLVM_DEBUG({
1369 dbgs() << "Removing " << (MarkIndex + 1)
1370 << " async marks after determining wait\n";
1371 });
1372 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1373
1374 LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
1375 return Wait;
1376}
1377
1378// With D16Write32BitVgpr, D16 inst might be clobbered by events running on the
1379// other half 16bit.
1380//
1381// Replace VGPR16 to VGPR32 for wait check if:
1382// 1. MI is a VALU, and there is a wait event on the other half
1383// 2. MI is a LdSt, and there is a wait event on the other half from different
1384// order group
//
// Returns \p Reg unchanged when it is not a 16-bit register, when the
// subtarget does not have D16 writes to 32-bit VGPRs, or when no wait is
// needed on the other half; otherwise returns either \p Reg or the enclosing
// 32-bit register as described above.
1385MCPhysReg WaitcntBrackets::determineVGPR16Dependency(const MachineInstr &MI,
// NOTE(review): listing line 1386 is not present in this excerpt; the body
// uses a counter type `T`, so the missing line presumably declares that
// parameter -- confirm against the full file.
1387 MCPhysReg Reg) const {
1388 const TargetRegisterClass *RC = Context->TRI.getPhysRegBaseClass(Reg);
1389 unsigned Size = Context->TRI.getRegSizeInBits(*RC);
1390
1391 if (Size != 16 || !Context->ST.hasD16Writes32BitVgpr())
1392 return Reg;
1393
1394 // With D16Writes32BitVgpr, D16 Inst might clobber the whole vgpr32
1395 // check dependency on the other half
1396 Register Reg32 = Context->TRI.get32BitRegister(Reg);
1397 Register OtherHalf = Context->TRI.getSubReg(
1398 Reg32,
1399 AMDGPU::isHi16Reg(Reg, Context->TRI) ? AMDGPU::lo16 : AMDGPU::hi16);
1400
// Collect, into a scratch Waitcnt, any wait the other half would require.
1401 AMDGPU::Waitcnt Wait;
1402 for (MCRegUnit RU : regunits(OtherHalf))
1403 determineWaitForScore(T, getVMemScore(toVMEMID(RU), T), Wait);
1404
1405 // No wait on otherhalf
1406 if (!Wait.hasWait())
1407 return Reg;
1408
1409 if (Context->TII.isVALU(MI, /*AllowLDSDMA=*/true))
1410 return Reg32;
1411
1412 // If hi/lo16 mixed events
1413 HWEvents MIEvents =
1414 AMDGPU::getEventsFor(MI, Context->ST, Context->IsExpertMode);
1415 HWEvents OtherHalfEvents = Context->getWaitEvents(T);
1416 HWEvents Events = MIEvents & OtherHalfEvents;
1417 if (Events.size() > 1)
1418 return Reg32;
1419 return Reg;
1420}
1421
1422void WaitcntBrackets::determineWaitForPhysReg(AMDGPU::InstCounterType T,
1423 MCPhysReg Reg,
1424 AMDGPU::Waitcnt &Wait,
1425 const MachineInstr &MI) const {
1426 if (Reg == AMDGPU::SCC) {
1427 determineWaitForScore(T, SCCScore, Wait);
1428 } else {
1429 bool IsVGPR = Context->TRI.isVectorRegister(Context->MRI, Reg);
1430 if (IsVGPR)
1431 Reg = determineVGPR16Dependency(MI, T, Reg);
1432 for (MCRegUnit RU : regunits(Reg))
1433 determineWaitForScore(
1434 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1435 Wait);
1436 }
1437}
1438
1439void WaitcntBrackets::determineWaitForLDSDMA(AMDGPU::InstCounterType T,
1440 VMEMID TID,
1441 AMDGPU::Waitcnt &Wait) const {
1442 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1443 determineWaitForScore(T, getVMemScore(TID, T), Wait);
1444}
1445
1446void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1447 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1448 // SCC has landed
1449 if (PendingSCCWrite &&
1450 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1451 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1452 HWEvents SCC_WRITE_PendingEvent = HWEvents::SCC_WRITE;
1453 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1454 if ((PendingEvents & Context->getWaitEvents(AMDGPU::KM_CNT)) ==
1455 SCC_WRITE_PendingEvent) {
1456 setScoreLB(AMDGPU::KM_CNT, getScoreUB(AMDGPU::KM_CNT));
1457 }
1458
1459 PendingEvents -= SCC_WRITE_PendingEvent;
1460 PendingSCCWrite = nullptr;
1461 }
1462}
1463
/// Apply the waits held in \p Wait to the score brackets by delegating to the
/// per-counter applyWaitcnt overload.
1464void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
// NOTE(review): listing line 1465 is not present in this excerpt; the call
// below is presumably the body of a loop over the counter types T -- confirm
// against the full file.
1466 applyWaitcnt(Wait, T);
1467}
1468
1469void WaitcntBrackets::applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count) {
1470 const unsigned UB = getScoreUB(T);
1471 if (Count >= UB)
1472 return;
1473 if (Count != 0) {
1474 if (counterOutOfOrder(T))
1475 return;
1476 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1477 } else {
1478 setScoreLB(T, UB);
1479 PendingEvents -= Context->getWaitEvents(T);
1480 }
1481
1482 if (T == AMDGPU::KM_CNT && Count == 0 &&
1483 hasPendingEvent(HWEvents::SMEM_GROUP)) {
1484 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1485 applyWaitcnt(AMDGPU::X_CNT, 0);
1486 else
1487 PendingEvents -= HWEvents::SMEM_GROUP;
1488 }
1489 if (T == AMDGPU::LOAD_CNT && hasPendingEvent(HWEvents::VMEM_GROUP) &&
1490 !hasPendingEvent(AMDGPU::STORE_CNT)) {
1491 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1492 applyWaitcnt(AMDGPU::X_CNT, Count);
1493 else if (Count == 0)
1494 PendingEvents -= HWEvents::VMEM_GROUP;
1495 }
1496}
1497
/// Apply the count that \p Wait holds for a single counter to the score
/// brackets.
1498void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait,
// NOTE(review): listing line 1499 is not present in this excerpt; the body
// uses a counter type `T`, so the missing line presumably declares that
// parameter -- confirm against the full file.
1500 unsigned Cnt = Wait.get(T);
1501 applyWaitcnt(T, Cnt);
1502}
1503
1504// Where there are multiple types of event in the bracket of a counter,
1505// the decrement may go out of order.
1506bool WaitcntBrackets::counterOutOfOrder(AMDGPU::InstCounterType T) const {
1507 // Scalar memory read always can go out of order.
1508 if ((T == Context->SmemAccessCounter &&
1509 hasPendingEvent(HWEvents::SMEM_ACCESS)) ||
1510 (T == AMDGPU::X_CNT && hasPendingEvent(HWEvents::SMEM_GROUP)))
1511 return true;
1512
1513 if (T == AMDGPU::LOAD_CNT) {
1514
1515 // On targets without VScnt, LOAD_CNT includes all of STORE_CNT as well.
1516 // All these events use one counter and do not go out of order with respect
1517 // to each other.
1518 if (!Context->ST.hasVscnt())
1519 return false;
1520
1521 HWEvents Events = PendingEvents & Context->getWaitEvents(T);
1522
1523 // If the target does not have extended counters, VMEM_BVH/SAMPLE_READ
1524 // events are equivalent to VMEM_READ_ACCESS. We do not go out of order in
1525 // such cases.
1526 static constexpr HWEvents ExtendedImageEvents =
1527 HWEvents::VMEM_SAMPLER_READ_ACCESS | HWEvents::VMEM_BVH_READ_ACCESS;
1528 if (!Context->ST.hasExtendedWaitCounts() &&
1529 (Events & ExtendedImageEvents).any()) {
1530 Events -= ExtendedImageEvents; // TODO: Tests pass even if I only use
1531 // VMEM_SAMPLER_READ_ACCESS which isn't
1532 // normal; indicates weak testing coverage
1533 Events |= HWEvents::VMEM_READ_ACCESS;
1534 }
1535
1536 // GLOBAL_INV completes in-order with other LOAD_CNT events,
1537 // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT
1538 // events doesn't cause out-of-order completion.
1539 Events -= HWEvents::GLOBAL_INV_ACCESS;
1540
1541 // Return true only if there are still multiple event types after removing
1542 // GLOBAL_INV
1543 return Events.size() > 1;
1544 }
1545
1546 return hasMixedPendingEvents(T);
1547}
1548
// Pass registration for SIInsertWaitcntsLegacy under DEBUG_TYPE with the
// description "SI Insert Waitcnts", followed by the pass ID definitions and
// the factory function.
// NOTE(review): listing lines 1551-1552, 1554 and 1560 are not present in
// this excerpt (presumably the statements between the BEGIN/END macros, the
// remaining INITIALIZE_PASS_END arguments and the signature of the factory
// function) -- confirm against the full file.
1549INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1550 false, false)
1553INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1555
1556char SIInsertWaitcntsLegacy::ID = 0;
1557
// Exposes the legacy pass ID through a reference in the llvm namespace.
1558char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1559
// Returns a newly allocated SIInsertWaitcntsLegacy pass object.
1561 return new SIInsertWaitcntsLegacy();
1562}
1563
1564static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1565 unsigned NewEnc) {
1566 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1567 assert(OpIdx >= 0);
1568
1569 MachineOperand &MO = MI.getOperand(OpIdx);
1570
1571 if (NewEnc == MO.getImm())
1572 return false;
1573
1574 MO.setImm(NewEnc);
1575 return true;
1576}
1577
1578bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1579 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1580 if (Opcode == Waitcnt->getOpcode())
1581 return false;
1582
1583 Waitcnt->setDesc(TII.get(Opcode));
1584 return true;
1585}
1586
1587/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1588/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1589/// from \p Wait that were added by previous passes. Currently this pass
1590/// conservatively assumes that these preexisting waits are required for
1591/// correctness.
///
/// At most one S_WAITCNT and one S_WAITCNT_VSCNT are kept and updated with
/// the combined counts; the counts they cover are applied to
/// \p ScoreBrackets and reset to ~0u in \p Wait.
///
/// \returns true if any instruction was erased or modified.
1592bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1593 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1594 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1595 assert(isNormalMode(MaxCounter));
1596
1597 bool Modified = false;
// The first surviving instruction of each kind; later ones are erased.
1598 MachineInstr *WaitcntInstr = nullptr;
1599 MachineInstr *WaitcntVsCntInstr = nullptr;
1600
1601 LLVM_DEBUG({
1602 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1603 if (It.isEnd())
1604 dbgs() << "end of block\n";
1605 else
1606 dbgs() << *It;
1607 });
1608
// The early-inc range allows the current instruction to be erased while
// iterating.
1609 for (auto &II :
1610 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1611 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1612 if (isNonWaitcntMetaInst(II)) {
1613 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1614 continue;
1615 }
1616
// An opcode that differs from its non-soft form marks a soft waitcnt; only
// those may be simplified, and not when OptNone is set.
1617 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1618 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1619
1620 // Update required wait count. If this is a soft waitcnt (= it was added
1621 // by an earlier pass), it may be entirely removed.
1622 if (Opcode == AMDGPU::S_WAITCNT) {
1623 unsigned IEnc = II.getOperand(0).getImm();
1624 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1625 if (TrySimplify)
1626 ScoreBrackets.simplifyWaitcnt(OldWait);
1627 Wait = Wait.combined(OldWait);
1628
1629 // Merge consecutive waitcnt of the same type by erasing multiples.
1630 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1631 II.eraseFromParent();
1632 Modified = true;
1633 } else
1634 WaitcntInstr = &II;
1635 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1636 assert(ST.hasVMemToLDSLoad());
1637 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1638 << "Before: " << Wait << '\n';);
1639 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, LDSDMA_BEGIN,
1640 Wait);
1641 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1642
1643 // It is possible (but unlikely) that this is the only wait instruction,
1644 // in which case, we exit this loop without a WaitcntInstr to consume
1645 // `Wait`. But that works because `Wait` was passed in by reference, and
1646 // the callee eventually calls createNewWaitcnt on it. We test this
1647 // possibility in an artificial MIR test since such a situation cannot be
1648 // recreated by running the memory legalizer.
1649 II.eraseFromParent();
1650 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1651 unsigned N = II.getOperand(0).getImm();
1652 LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';);
1653 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
1654 Wait = Wait.combined(OldWait);
1655 } else {
1656 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1657 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1658
1659 unsigned OldVSCnt =
1660 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1661 if (TrySimplify)
1662 ScoreBrackets.simplifyWaitcnt(AMDGPU::STORE_CNT, OldVSCnt);
// NOTE(review): listing line 1663 is not present in this excerpt; the line
// below is the tail of a statement that presumably stores the smaller of the
// two STORE_CNT values back into Wait -- confirm against the full file.
1664 std::min(Wait.get(AMDGPU::STORE_CNT), OldVSCnt));
1665
1666 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1667 II.eraseFromParent();
1668 Modified = true;
1669 } else
1670 WaitcntVsCntInstr = &II;
1671 }
1672 }
1673
1674 if (WaitcntInstr) {
// NOTE(review): listing line 1676 (the remaining argument(s) of the call
// below) is not present in this excerpt -- confirm against the full file.
1675 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1677 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1678
// The kept S_WAITCNT covers these three counters: record them as applied and
// clear them from Wait.
1679 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::LOAD_CNT);
1680 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
1681 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
1682 Wait.set(AMDGPU::LOAD_CNT, ~0u);
1683 Wait.set(AMDGPU::EXP_CNT, ~0u);
1684 Wait.set(AMDGPU::DS_CNT, ~0u);
1685
1686 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1687 << "New Instr at block end: "
1688 << *WaitcntInstr << '\n'
1689 : dbgs() << "applied pre-existing waitcnt\n"
1690 << "Old Instr: " << *It
1691 << "New Instr: " << *WaitcntInstr << '\n');
1692 }
1693
1694 if (WaitcntVsCntInstr) {
1695 Modified |=
1696 updateOperandIfDifferent(*WaitcntVsCntInstr, AMDGPU::OpName::simm16,
1697 Wait.get(AMDGPU::STORE_CNT));
1698 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1699
// The kept S_WAITCNT_VSCNT covers STORE_CNT: record it as applied and clear
// it from Wait.
1700 ScoreBrackets.applyWaitcnt(AMDGPU::STORE_CNT, Wait.get(AMDGPU::STORE_CNT));
1701 Wait.set(AMDGPU::STORE_CNT, ~0u);
1702
1703 LLVM_DEBUG(It.isEnd()
1704 ? dbgs() << "applied pre-existing waitcnt\n"
1705 << "New Instr at block end: " << *WaitcntVsCntInstr
1706 << '\n'
1707 : dbgs() << "applied pre-existing waitcnt\n"
1708 << "Old Instr: " << *It
1709 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1710 }
1711
1712 return Modified;
1713}
1714
1715/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1716/// required counters in \p Wait
///
/// The instructions are inserted into \p Block before \p It. When
/// ExpandWaitcntProfiling is set and the counters involved are in order, an
/// expanded sequence is emitted through EmitExpandedWaitcnt instead of a
/// single instruction.
///
/// \returns true if any instruction was inserted.
1717bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1718 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1719 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
1720 assert(isNormalMode(MaxCounter));
1721
1722 bool Modified = false;
1723 const DebugLoc &DL = Block.findDebugLoc(It);
1724
1725 // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
1726 // single instruction while VScnt has its own instruction.
1727 if (Wait.hasWaitExceptStoreCnt()) {
1728 // If profiling expansion is enabled, emit an expanded sequence
1729 if (ExpandWaitcntProfiling) {
1730 // Check if any of the counters to be waited on are out-of-order.
1731 // If so, fall back to normal (non-expanded) behavior since expansion
1732 // would provide misleading profiling information.
1733 bool AnyOutOfOrder = false;
1734 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
1735 unsigned WaitCnt = Wait.get(CT);
1736 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
1737 AnyOutOfOrder = true;
1738 break;
1739 }
1740 }
1741
1742 if (AnyOutOfOrder) {
1743 // Fall back to non-expanded wait
1744 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1745 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
1746 Modified = true;
1747 } else {
1748 // All counters are in-order, safe to expand
1749 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
1750 unsigned WaitCnt = Wait.get(CT);
1751 if (WaitCnt == ~0u)
1752 continue;
1753
// The outstanding count is clamped to one below the counter's limit.
1754 unsigned Outstanding =
1755 std::min(ScoreBrackets.getOutstanding(CT), getLimit(CT) - 1);
1756 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
1757 AMDGPU::Waitcnt W;
1758 W.set(CT, Count);
1759 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT))
// NOTE(review): listing line 1760 (the operand(s) added to the instruction
// built above) is not present in this excerpt -- confirm against the full
// file.
1761 });
1762 Modified = true;
1763 }
1764 }
1765 } else {
1766 // Normal behavior: emit single combined waitcnt
1767 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1768 [[maybe_unused]] auto SWaitInst =
1769 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
1770 Modified = true;
1771
1772 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1773 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1774 dbgs() << "New Instr: " << *SWaitInst << '\n');
1775 }
1776 }
1777
1778 if (Wait.hasWaitStoreCnt()) {
1779 assert(ST.hasVscnt());
1780
1781 if (ExpandWaitcntProfiling && Wait.get(AMDGPU::STORE_CNT) != ~0u &&
1782 !ScoreBrackets.counterOutOfOrder(AMDGPU::STORE_CNT)) {
1783 // Only expand if counter is not out-of-order
1784 unsigned Outstanding =
1785 std::min(ScoreBrackets.getOutstanding(AMDGPU::STORE_CNT),
1786 getLimit(AMDGPU::STORE_CNT) - 1);
1787 EmitExpandedWaitcnt(
1788 Outstanding, Wait.get(AMDGPU::STORE_CNT), [&](unsigned Count) {
1789 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1790 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1791 .addImm(Count);
1792 });
1793 Modified = true;
1794 } else {
1795 [[maybe_unused]] auto SWaitInst =
1796 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1797 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
// NOTE(review): listing line 1798 (the end of the BuildMI chain above,
// presumably adding the STORE_CNT immediate) is not present in this excerpt
// -- confirm against the full file.
1799 Modified = true;
1800
1801 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1802 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1803 dbgs() << "New Instr: " << *SWaitInst << '\n');
1804 }
1805 }
1806
1807 return Modified;
1808}
1809
1810AMDGPU::Waitcnt
1811WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1812 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST.hasVscnt() ? 0 : ~0u);
1813}
1814
1815AMDGPU::Waitcnt
1816WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1817 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
1818 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1819 ~0u /* XCNT */, ~0u /* ASYNC_CNT */,
1820 ~0u /* TENSOR_CNT */, ExpertVal, ExpertVal);
1821}
1822
1823/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1824/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1825/// were added by previous passes. Currently this pass conservatively
1826/// assumes that these preexisting waits are required for correctness.
///
/// \p Wait is updated in place: waits read from the existing instructions are
/// merged into it, and every counter that ends up encoded in a retained
/// instruction is applied to \p ScoreBrackets and then reset to ~0u (or
/// cleared) in \p Wait.
/// \returns true if an instruction was erased, or if
/// updateOperandIfDifferent / promoteSoftWaitCnt reported a change.
1827bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1828 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1829 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1830 assert(!isNormalMode(MaxCounter));
1831
1832 bool Modified = false;
// First instruction seen of each kind; later duplicates are erased (or, for
// S_WAITCNT_DEPCTR, stripped of the fields handled here).
1833 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1834 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1835 MachineInstr *WaitcntDepctrInstr = nullptr;
1836 MachineInstr *WaitInstrs[AMDGPU::NUM_EXTENDED_INST_CNTS] = {};
1837
1838 LLVM_DEBUG({
1839 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1840 if (It.isEnd())
1841 dbgs() << "end of block\n";
1842 else
1843 dbgs() << *It;
1844 });
1845
1846 // Accumulate waits that should not be simplified.
1847 AMDGPU::Waitcnt RequiredWait;
1848
// Early-increment iteration, because the loop body may erase II.
1849 for (auto &II :
1850 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1851 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1852 if (isNonWaitcntMetaInst(II)) {
1853 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1854 continue;
1855 }
1856
1857 // Update required wait count. If this is a soft waitcnt (= it was added
1858 // by an earlier pass), it may be entirely removed.
1859
// TrySimplify is true only for soft opcodes (the non-soft opcode differs from
// the instruction's own opcode) and only when OptNone is not set.
1860 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1861 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1862
1863 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1864 // attempt to do more than that either.
1865 if (Opcode == AMDGPU::S_WAITCNT)
1866 continue;
1867
1868 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1869 unsigned OldEnc =
1870 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1871 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1872 if (TrySimplify)
1873 Wait = Wait.combined(OldWait);
1874 else
1875 RequiredWait = RequiredWait.combined(OldWait);
1876 // Keep the first wait_loadcnt, erase the rest.
1877 if (CombinedLoadDsCntInstr == nullptr) {
1878 CombinedLoadDsCntInstr = &II;
1879 } else {
1880 II.eraseFromParent();
1881 Modified = true;
1882 }
1883 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1884 unsigned OldEnc =
1885 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1886 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1887 if (TrySimplify)
1888 Wait = Wait.combined(OldWait);
1889 else
1890 RequiredWait = RequiredWait.combined(OldWait);
1891 // Keep the first wait_storecnt, erase the rest.
1892 if (CombinedStoreDsCntInstr == nullptr) {
1893 CombinedStoreDsCntInstr = &II;
1894 } else {
1895 II.eraseFromParent();
1896 Modified = true;
1897 }
1898 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
1899 unsigned OldEnc =
1900 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1901 AMDGPU::Waitcnt OldWait;
// NOTE(review): listing lines 1902-1903 are missing here (numbering jumps
// 1901 -> 1904); presumably they fill OldWait from OldEnc -- restore from the
// upstream file before compiling.
1904 if (TrySimplify)
1905 ScoreBrackets.simplifyWaitcnt(OldWait);
1906 Wait = Wait.combined(OldWait);
1907 if (WaitcntDepctrInstr == nullptr) {
1908 WaitcntDepctrInstr = &II;
1909 } else {
1910 // S_WAITCNT_DEPCTR requires special care. Don't remove a
1911 // duplicate if it is waiting on things other than VA_VDST or
1912 // VM_VSRC. If that is the case, just make sure the VA_VDST and
1913 // VM_VSRC subfields of the operand are set to the "no wait"
1914 // values.
1915
1916 unsigned Enc =
1917 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1918 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
1919 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
1920
1921 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
1922 Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
1923 Modified |= promoteSoftWaitCnt(&II);
1924 } else {
1925 II.eraseFromParent();
1926 Modified = true;
1927 }
1928 }
1929 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1930 // Architectures higher than GFX10 do not have direct loads to
1931 // LDS, so no work required here yet.
1932 II.eraseFromParent();
1933 Modified = true;
1934 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1935 // Update the Waitcnt, but don't erase the wait.asyncmark() itself. It
1936 // shows up in the assembly as a comment with the original parameter N.
1937 unsigned N = II.getOperand(0).getImm();
1938 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
1939 Wait = Wait.combined(OldWait);
1940 } else {
1941 std::optional<AMDGPU::InstCounterType> CT =
// NOTE(review): listing line 1942 is missing here (numbering jumps
// 1941 -> 1943); it must supply the initializer for CT -- restore from the
// upstream file before compiling.
1943 assert(CT.has_value());
1944 unsigned OldCnt =
1945 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1946 if (TrySimplify)
1947 Wait.add(CT.value(), OldCnt);
1948 else
1949 RequiredWait.add(CT.value(), OldCnt);
1950 // Keep the first wait of its kind, erase the rest.
1951 if (WaitInstrs[CT.value()] == nullptr) {
1952 WaitInstrs[CT.value()] = &II;
1953 } else {
1954 II.eraseFromParent();
1955 Modified = true;
1956 }
1957 }
1958 }
1959
// Simplify Wait against the brackets, then fold the non-simplifiable waits
// back in so they are kept regardless of the simplification result.
1960 ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
1961 Wait = Wait.combined(RequiredWait);
1962
1963 if (CombinedLoadDsCntInstr) {
1964 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1965 // to be waited for. Otherwise, let the instruction be deleted so
1966 // the appropriate single counter wait instruction can be inserted
1967 // instead, when new S_WAIT_*CNT instructions are inserted by
1968 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1969 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1970 // the loop below that deals with single counter instructions.
1971 //
1972 // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
1973 // instructions that have decremented LOAD_CNT or DS_CNT on completion
1974 // will have needed to wait for their register sources to be available
1975 // first.
1976 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
1977 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1978 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1979 AMDGPU::OpName::simm16, NewEnc);
1980 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1981 ScoreBrackets.applyWaitcnt(AMDGPU::LOAD_CNT, Wait.get(AMDGPU::LOAD_CNT));
1982 ScoreBrackets.applyWaitcnt(AMDGPU::DS_CNT, Wait.get(AMDGPU::DS_CNT));
1983 Wait.set(AMDGPU::LOAD_CNT, ~0u);
1984 Wait.set(AMDGPU::DS_CNT, ~0u);
1985
1986 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1987 << "New Instr at block end: "
1988 << *CombinedLoadDsCntInstr << '\n'
1989 : dbgs() << "applied pre-existing waitcnt\n"
1990 << "Old Instr: " << *It << "New Instr: "
1991 << *CombinedLoadDsCntInstr << '\n');
1992 } else {
1993 CombinedLoadDsCntInstr->eraseFromParent();
1994 Modified = true;
1995 }
1996 }
1997
1998 if (CombinedStoreDsCntInstr) {
1999 // Similarly for S_WAIT_STORECNT_DSCNT.
2000 if (Wait.get(AMDGPU::STORE_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
2001 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2002 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
2003 AMDGPU::OpName::simm16, NewEnc);
2004 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2005 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::STORE_CNT);
2006 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
2007 Wait.set(AMDGPU::STORE_CNT, ~0u);
2008 Wait.set(AMDGPU::DS_CNT, ~0u);
2009
2010 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2011 << "New Instr at block end: "
2012 << *CombinedStoreDsCntInstr << '\n'
2013 : dbgs() << "applied pre-existing waitcnt\n"
2014 << "Old Instr: " << *It << "New Instr: "
2015 << *CombinedStoreDsCntInstr << '\n');
2016 } else {
2017 CombinedStoreDsCntInstr->eraseFromParent();
2018 Modified = true;
2019 }
2020 }
2021
2022 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
2023 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
2024 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
2025 // instructions so that createNewWaitcnt() will create new combined
2026 // instructions to replace them.
2027
2028 if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
2029 // This is a vector of addresses in WaitInstrs pointing to instructions
2030 // that should be removed if they are present.
// NOTE(review): listing line 2031 is missing here (numbering jumps
// 2030 -> 2032); it must declare WaitsToErase -- restore from the upstream
// file before compiling.
2032
2033 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
2034 // both) need to be waited for, ensure that there are no existing
2035 // individual wait count instructions for these.
2036
2037 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
2038 WaitsToErase.push_back(&WaitInstrs[AMDGPU::LOAD_CNT]);
2039 WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
2040 } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
2041 WaitsToErase.push_back(&WaitInstrs[AMDGPU::STORE_CNT]);
2042 WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
2043 }
2044
2045 for (MachineInstr **WI : WaitsToErase) {
2046 if (!*WI)
2047 continue;
2048
// Null the WaitInstrs slot as well, so the per-counter loop below skips it.
2049 (*WI)->eraseFromParent();
2050 *WI = nullptr;
2051 Modified = true;
2052 }
2053 }
2054
// NOTE(review): listing line 2055 is missing here (numbering jumps
// 2054 -> 2056); it must be the loop header that declares CT -- restore from
// the upstream file before compiling.
2056 if (!WaitInstrs[CT])
2057 continue;
2058
2059 unsigned NewCnt = Wait.get(CT);
2060 if (NewCnt != ~0u) {
2061 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
2062 AMDGPU::OpName::simm16, NewCnt);
2063 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2064
2065 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2066 Wait.clear(CT);
2067
2068 LLVM_DEBUG(It.isEnd()
2069 ? dbgs() << "applied pre-existing waitcnt\n"
2070 << "New Instr at block end: " << *WaitInstrs[CT]
2071 << '\n'
2072 : dbgs() << "applied pre-existing waitcnt\n"
2073 << "Old Instr: " << *It
2074 << "New Instr: " << *WaitInstrs[CT] << '\n');
2075 } else {
2076 WaitInstrs[CT]->eraseFromParent();
2077 Modified = true;
2078 }
2079 }
2080
2081 if (WaitcntDepctrInstr) {
2082 // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
2083 // subfields with the new required values.
2084 unsigned Enc =
2085 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2086 ->getImm();
// NOTE(review): listing lines 2087-2088 are missing here (numbering jumps
// 2086 -> 2089); presumably they re-encode the VA_VDST / VM_VSRC fields of
// Enc from Wait, as the comment above describes -- restore from the upstream
// file before compiling.
2089
2090 ScoreBrackets.applyWaitcnt(AMDGPU::VA_VDST, Wait.get(AMDGPU::VA_VDST));
2091 ScoreBrackets.applyWaitcnt(AMDGPU::VM_VSRC, Wait.get(AMDGPU::VM_VSRC));
2092 Wait.set(AMDGPU::VA_VDST, ~0u);
2093 Wait.set(AMDGPU::VM_VSRC, ~0u);
2094
2095 // If that new encoded Depctr immediate would actually still wait
2096 // for anything, update the instruction's operand. Otherwise it can
2097 // just be deleted.
2098 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
2099 Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
2100 AMDGPU::OpName::simm16, Enc);
2101 LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
2102 << "New Instr at block end: "
2103 << *WaitcntDepctrInstr << '\n'
2104 : dbgs() << "applyPreexistingWaitcnt\n"
2105 << "Old Instr: " << *It << "New Instr: "
2106 << *WaitcntDepctrInstr << '\n');
2107 } else {
2108 WaitcntDepctrInstr->eraseFromParent();
2109 Modified = true;
2110 }
2111 }
2112
2113 return Modified;
2114}
2115
2116/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
///
/// Instructions are built in \p Block at position \p It. When
/// ExpandWaitcntProfiling is set, only the per-counter expansion loop runs and
/// the function returns straight after it; \p ScoreBrackets is only queried on
/// that path. Otherwise a combined LOADCNT_DSCNT or STORECNT_DSCNT wait is
/// tried first, then one instruction per remaining counter, then an
/// S_WAITCNT_DEPCTR if \p Wait has a depctr wait.
/// \returns true if any path that builds an instruction was taken.
2117bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2118 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2119 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
2120 assert(!isNormalMode(MaxCounter));
2121
2122 bool Modified = false;
2123 const DebugLoc &DL = Block.findDebugLoc(It);
2124
2125 // For GFX12+, we use separate wait instructions, which makes expansion
2126 // simpler
2127 if (ExpandWaitcntProfiling) {
// NOTE(review): listing line 2128 is missing here (numbering jumps
// 2127 -> 2129); it must be the loop header that declares CT -- restore from
// the upstream file before compiling.
2129 unsigned Count = Wait.get(CT);
2130 if (Count == ~0u)
2131 continue;
2132
2133 // Skip expansion for out-of-order counters - emit normal wait instead
2134 if (ScoreBrackets.counterOutOfOrder(CT)) {
2135 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2136 .addImm(Count);
2137 Modified = true;
2138 continue;
2139 }
2140
// The outstanding count is clamped to getLimit(CT) - 1 before expansion.
2141 unsigned Outstanding =
2142 std::min(ScoreBrackets.getOutstanding(CT), getLimit(CT) - 1);
2143 EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
2144 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2145 .addImm(Val);
2146 });
2147 Modified = true;
2148 }
2149 return Modified;
2150 }
2151
2152 // Normal behavior (no expansion)
2153 // Check for opportunities to use combined wait instructions.
2154 if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
2155 MachineInstr *SWaitInst = nullptr;
2156
// LOAD_CNT is checked first; STORE_CNT is only paired with DS_CNT when no
// LOAD_CNT wait is requested.
2157 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
2158 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2159
2160 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2161 .addImm(Enc);
2162
// Reset both counters so the per-counter loop below does not emit them again.
2163 Wait.set(AMDGPU::LOAD_CNT, ~0u);
2164 Wait.set(AMDGPU::DS_CNT, ~0u);
2165 } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
2166 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2167
2168 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_STORECNT_DSCNT))
2169 .addImm(Enc);
2170
2171 Wait.set(AMDGPU::STORE_CNT, ~0u);
2172 Wait.set(AMDGPU::DS_CNT, ~0u);
2173 }
2174
2175 if (SWaitInst) {
2176 Modified = true;
2177
2178 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2179 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2180 dbgs() << "New Instr: " << *SWaitInst << '\n');
2181 }
2182 }
2183
2184 // Generate an instruction for any remaining counter that needs
2185 // waiting for.
2186
// NOTE(review): listing line 2187 is missing here (numbering jumps
// 2186 -> 2188); it must be the loop header that declares CT -- restore from
// the upstream file before compiling.
2188 unsigned Count = Wait.get(CT);
2189 if (Count == ~0u)
2190 continue;
2191
2192 [[maybe_unused]] auto SWaitInst =
2193 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2194 .addImm(Count);
2195
2196 Modified = true;
2197
2198 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2199 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2200 dbgs() << "New Instr: " << *SWaitInst << '\n');
2201 }
2202
2203 if (Wait.hasWaitDepctr()) {
2204 assert(IsExpertMode);
2205 unsigned Enc =
// NOTE(review): listing lines 2206-2207 are missing here (numbering jumps
// 2205 -> 2208); they must supply the initializer for Enc -- restore from the
// upstream file before compiling.
2208
2209 [[maybe_unused]] auto SWaitInst =
2210 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);
2211
2212 Modified = true;
2213
2214 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
2215 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2216 dbgs() << "New Instr: " << *SWaitInst << '\n');
2217 }
2218
2219 return Modified;
2220}
2221
2222/// Generate s_waitcnt instruction to be placed before \p MI.
2223/// Instructions of a given type are returned in order,
2224/// but instructions of different types can complete out of order.
2225/// We rely on this in-order completion
2226/// and simply assign a score to the memory access instructions.
2227/// We keep track of the active "score bracket" to determine
2228/// if an access of a memory read requires an s_waitcnt
2229/// and if so what the value of each counter is.
2230/// The "score bracket" is bound by the lower bound and upper bound
2231/// scores (*_score_LB and *_score_ub respectively).
2232/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
2233/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
2234/// (GFX12+ only, where DS_CNT is a separate counter).
///
/// The required wait is accumulated in a local AMDGPU::Waitcnt and handed to
/// generateWaitcnt() together with \p OldWaitcntInstr.
/// \returns the result of generateWaitcnt().
2235bool SIInsertWaitcnts::generateWaitcntInstBefore(
2236 MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
2237 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2238 LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
2239
2240 assert(!isNonWaitcntMetaInst(MI));
2241
2242 AMDGPU::Waitcnt Wait;
2243 const unsigned Opc = MI.getOpcode();
2244
2245 switch (Opc) {
2246 case AMDGPU::BUFFER_WBINVL1:
2247 case AMDGPU::BUFFER_WBINVL1_SC:
2248 case AMDGPU::BUFFER_WBINVL1_VOL:
2249 case AMDGPU::BUFFER_GL0_INV:
2250 case AMDGPU::BUFFER_GL1_INV: {
2251 // FIXME: This should have already been handled by the memory legalizer.
2252 // Removing this currently doesn't affect any lit tests, but we need to
2253 // verify that nothing was relying on this. The number of buffer invalidates
2254 // being handled here should not be expanded.
2255 Wait.set(AMDGPU::LOAD_CNT, 0);
2256 break;
2257 }
2258 case AMDGPU::SI_RETURN_TO_EPILOG:
2259 case AMDGPU::SI_RETURN:
2260 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2261 case AMDGPU::S_SETPC_B64_return: {
2262 // All waits must be resolved at call return.
2263 // NOTE: this could be improved with knowledge of all call sites or
2264 // with knowledge of the called routines.
2265 ReturnInsts.insert(&MI);
2266 AMDGPU::Waitcnt AllZeroWait =
2267 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2268 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2269 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2270 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2271 // no need to wait for it at function boundaries.
2272 if (ST.hasExtendedWaitCounts() &&
2273 !ScoreBrackets.hasPendingEvent(HWEvents::VMEM_READ_ACCESS))
2274 AllZeroWait.set(AMDGPU::LOAD_CNT, ~0u);
2275 Wait = AllZeroWait;
2276 break;
2277 }
2278 case AMDGPU::S_ENDPGM:
2279 case AMDGPU::S_ENDPGM_SAVED: {
2280 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2281 // Technically the hardware will do this on its own if we don't, but that
2282 // might cost extra cycles compared to doing it explicitly.
2283 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2284 // have to wait for outstanding VMEM stores. In this case it can be useful
2285 // to send a message to explicitly release all VGPRs before the stores have
2286 // completed, but it is only safe to do this if there are no outstanding
2287 // scratch stores.
2288 EndPgmInsts[&MI] =
2289 !ScoreBrackets.empty(AMDGPU::STORE_CNT) &&
2290 !ScoreBrackets.hasPendingEvent(HWEvents::SCRATCH_WRITE_ACCESS);
2291 break;
2292 }
2293 case AMDGPU::S_SENDMSG:
2294 case AMDGPU::S_SENDMSGHALT: {
2295 if (ST.hasLegacyGeometry() &&
2296 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
// NOTE(review): listing line 2297 is missing here (numbering jumps
// 2296 -> 2298); it must complete this comparison and open the if-body --
// restore from the upstream file before compiling.
2298 // Resolve vm waits before gs-done.
2299 Wait.set(AMDGPU::LOAD_CNT, 0);
2300 break;
2301 }
2302 [[fallthrough]];
2303 }
2304 default: {
2305
2306 // Export & GDS instructions do not read the EXEC mask until after the
2307 // export is granted (which can occur well after the instruction is issued).
2308 // The shader program must flush all EXP operations on the export-count
2309 // before overwriting the EXEC mask.
2310 if (MI.modifiesRegister(AMDGPU::EXEC, &TRI)) {
2311 // Export and GDS are tracked individually, either may trigger a waitcnt
2312 // for EXEC.
2313 if (ScoreBrackets.hasPendingEvent(HWEvents::EXP_GPR_LOCK) ||
2314 ScoreBrackets.hasPendingEvent(HWEvents::EXP_PARAM_ACCESS) ||
2315 ScoreBrackets.hasPendingEvent(HWEvents::EXP_POS_ACCESS) ||
2316 ScoreBrackets.hasPendingEvent(HWEvents::GDS_GPR_LOCK)) {
2317 Wait.set(AMDGPU::EXP_CNT, 0);
2318 }
2319 }
2320
2321 // Wait for any pending GDS instruction to complete before any
2322 // "Always GDS" instruction.
2323 if (TII.isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
2324 Wait.add(AMDGPU::DS_CNT, ScoreBrackets.getPendingGDSWait());
2325
2326 if (MI.isCall()) {
2327 // The function is going to insert a wait on everything in its prolog.
2328 // This still needs to be careful if the call target is a load (e.g. a GOT
2329 // load). We also need to check WAW dependency with saved PC.
2330 CallInsts.insert(&MI);
// Any wait accumulated above is discarded for calls; only the callee-address
// and return-address registers are checked below.
2331 Wait = AMDGPU::Waitcnt();
2332
2333 const MachineOperand &CallAddrOp = TII.getCalleeOperand(MI);
2334 if (CallAddrOp.isReg()) {
2335 ScoreBrackets.determineWaitForPhysReg(
2336 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait, MI);
2337
2338 if (const auto *RtnAddrOp =
2339 TII.getNamedOperand(MI, AMDGPU::OpName::dst)) {
2340 ScoreBrackets.determineWaitForPhysReg(
2341 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait, MI);
2342 }
2343 }
2344 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2345 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2346 } else {
2347 // FIXME: Should not be relying on memoperands.
2348 // Look at the source operands of every instruction to see if
2349 // any of them results from a previous memory operation that affects
2350 // its current usage. If so, an s_waitcnt instruction needs to be
2351 // emitted.
2352 // If the source operand was defined by a load, add the s_waitcnt
2353 // instruction.
2354 //
2355 // Two cases are handled for destination operands:
2356 // 1) If the destination operand was defined by a load, add the s_waitcnt
2357 // instruction to guarantee the right WAW order.
2358 // 2) If a destination operand that was used by a recent export/store ins,
2359 // add s_waitcnt on exp_cnt to guarantee the WAR order.
2360
2361 for (const MachineMemOperand *Memop : MI.memoperands()) {
2362 const Value *Ptr = Memop->getValue();
2363 if (Memop->isStore()) {
2364 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2365 Wait.add(SmemAccessCounter, 0);
2366 if (PDT.dominates(MI.getParent(), It->second))
2367 SLoadAddresses.erase(It);
2368 }
2369 }
2370 unsigned AS = Memop->getAddrSpace();
// NOTE(review): listing line 2371 is missing here (numbering jumps
// 2370 -> 2372); presumably the address-space test guarding the `continue`
// below -- restore from the upstream file before compiling.
2372 continue;
2373 // No need to wait before load from VMEM to LDS.
2374 if (TII.mayWriteLDSThroughDMA(MI))
2375 continue;
2376
2377 // LOAD_CNT is only relevant to vgpr or LDS.
2378 unsigned TID = LDSDMA_BEGIN;
2379 if (Ptr && Memop->getAAInfo()) {
2380 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2381 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2382 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2383 if ((I + 1) >= NUM_LDSDMA) {
2384 // We didn't have enough slot to track this LDS DMA store, it
2385 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2386 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID,
2387 Wait);
2388 break;
2389 }
2390
2391 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT,
2392 TID + I + 1, Wait);
2393 }
2394 }
2395 } else {
2396 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID, Wait);
2397 }
2398 if (Memop->isStore()) {
2399 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::EXP_CNT, TID, Wait);
2400 }
2401 }
2402
2403 // Loop over use and def operands.
2404 for (const MachineOperand &Op : MI.operands()) {
2405 if (!Op.isReg())
2406 continue;
2407
2408 // If the instruction does not read tied source, skip the operand.
2409 if (Op.isTied() && Op.isUse() && TII.doesNotReadTiedSource(MI))
2410 continue;
2411
2412 MCPhysReg Reg = Op.getReg().asMCReg();
2413
2414 const bool IsVGPR = TRI.isVectorRegister(MRI, Op.getReg());
2415 if (IsVGPR) {
2416 // Implicit VGPR defs and uses are never a part of the memory
2417 // instructions description and usually present to account for
2418 // super-register liveness.
2419 // TODO: Most of the other instructions also have implicit uses
2420 // for the liveness accounting only.
2421 if (Op.isImplicit() && MI.mayLoadOrStore())
2422 continue;
2423
2424 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VA_VDST, Reg, Wait, MI);
2425 if (Op.isDef())
2426 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VM_VSRC, Reg, Wait,
2427 MI);
2428 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2429 // previous write and this write are the same type of VMEM
2430 // instruction, in which case they are (in some architectures)
2431 // guaranteed to write their results in order anyway.
2432 // Additionally check instructions where Point Sample Acceleration
2433 // might be applied.
2434 if (Op.isUse() || !updateVMCntOnly(MI) ||
2435 ScoreBrackets.hasDifferentVGPRPendingEvents(
// NOTE(review): listing line 2436 is missing here (numbering jumps
// 2435 -> 2437); it must supply the arguments of the call opened above --
// restore from the upstream file before compiling.
2437 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2438 !ST.hasVmemWriteVgprInOrder()) {
2439 ScoreBrackets.determineWaitForPhysReg(AMDGPU::LOAD_CNT, Reg, Wait,
2440 MI);
2441 ScoreBrackets.determineWaitForPhysReg(AMDGPU::SAMPLE_CNT, Reg, Wait,
2442 MI);
2443 ScoreBrackets.determineWaitForPhysReg(AMDGPU::BVH_CNT, Reg, Wait,
2444 MI);
2445 ScoreBrackets.clearVGPRPendingEvents(Reg);
2446 }
2447
2448 if (Op.isDef() ||
2449 ScoreBrackets.hasPendingEvent(HWEvents::EXP_LDS_ACCESS)) {
2450 ScoreBrackets.determineWaitForPhysReg(AMDGPU::EXP_CNT, Reg, Wait,
2451 MI);
2452 }
2453 ScoreBrackets.determineWaitForPhysReg(AMDGPU::DS_CNT, Reg, Wait, MI);
2454 } else if (Op.getReg() == AMDGPU::SCC) {
2455 ScoreBrackets.determineWaitForPhysReg(AMDGPU::KM_CNT, Reg, Wait, MI);
2456 } else {
2457 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait,
2458 MI);
2459 }
2460
2461 if (ST.hasWaitXcnt() && Op.isDef())
2462 ScoreBrackets.determineWaitForPhysReg(AMDGPU::X_CNT, Reg, Wait, MI);
2463 }
2464 }
2465 }
2466 }
2467
2468 // Ensure safety against exceptions from outstanding memory operations while
2469 // waiting for a barrier:
2470 //
2471 // * Some subtargets safely handle backing off the barrier in hardware
2472 // when an exception occurs.
2473 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2474 // there can be no outstanding memory operations during the wait.
2475 // * Subtargets with split barriers don't need to back off the barrier; it
2476 // is up to the trap handler to preserve the user barrier state correctly.
2477 //
2478 // In all other cases, ensure safety by ensuring that there are no outstanding
2479 // memory operations.
2480 if (Opc == AMDGPU::S_BARRIER && !ST.hasAutoWaitcntBeforeBarrier() &&
2481 !ST.hasBackOffBarrier()) {
2482 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2483 }
2484
2485 // TODO: Remove this work-around, enable the assert for Bug 457939
2486 // after fixing the scheduler. Also, the Shader Compiler code is
2487 // independent of target.
2488 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST.hasReadVCCZBug() &&
2489 ScoreBrackets.hasPendingEvent(HWEvents::SMEM_ACCESS)) {
2490 Wait.set(AMDGPU::DS_CNT, 0);
2491 }
2492
2493 // Verify that the wait is actually needed.
2494 ScoreBrackets.simplifyWaitcnt(Wait);
2495
2496 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2497 // waits on VA_VDST if the instruction it would precede is not a VALU
2498 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2499 // expert scheduling mode.
2500 if (TII.isVALU(MI, /*AllowLDSDMA=*/true) && !SIInstrInfo::isLDSDMA(MI))
2501 Wait.set(AMDGPU::VA_VDST, ~0u);
2502
2503 // Since the translation for VMEM addresses occur in-order, we can apply the
2504 // XCnt if the current instruction is of VMEM type and has a memory
2505 // dependency with another VMEM instruction in flight.
2506 if (Wait.get(AMDGPU::X_CNT) != ~0u && isVmemAccess(MI)) {
2507 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::X_CNT);
2508 Wait.set(AMDGPU::X_CNT, ~0u);
2509 }
2510
2511 // When forcing emit, we need to skip terminators because that would break the
2512 // terminators of the MBB if we emit a waitcnt between terminators.
2513 if (ForceEmitZeroFlag && !MI.isTerminator())
2514 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2515
2516 // If we force waitcnt then update Wait accordingly.
// NOTE(review): listing line 2517 is missing here (numbering jumps
// 2516 -> 2518); it must be the loop header that declares T -- restore from
// the upstream file before compiling.
2518 if (!ForceEmitWaitcnt[T])
2519 continue;
2520 Wait.set(T, 0);
2521 }
2522
2523 if (FlushFlags.FlushVmCnt) {
// NOTE(review): listing lines 2524-2525 are missing here (numbering jumps
// 2523 -> 2526); they must introduce T for the statement below -- restore from
// the upstream file before compiling.
2526 Wait.set(T, 0);
2527 }
2528
2529 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
2530 Wait.set(AMDGPU::DS_CNT, 0);
2531
// Only tightens an already-requested LOAD_CNT wait to zero; it never adds one.
2532 if (ForceEmitZeroLoadFlag && Wait.get(AMDGPU::LOAD_CNT) != ~0u)
2533 Wait.set(AMDGPU::LOAD_CNT, 0);
2534
2535 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2536 OldWaitcntInstr);
2537}
2538
/// Materialize \p Wait at position It in \p Block.
///
/// If \p OldWaitcntInstr is non-null, the wait is first merged with the
/// preexisting wait instructions via WCG->applyPreexistingWaitcnt(). An
/// EXP_CNT wait may then be folded into the waitexp operand of the instruction
/// at It. Whatever remains is emitted through WCG->createNewWaitcnt() and then
/// applied to \p ScoreBrackets.
/// \returns true if any of those steps reported or made a change.
2539bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
// NOTE(review): listing line 2540 is missing here (numbering jumps
// 2539 -> 2541); it must declare the `It` parameter used below -- restore
// from the upstream file before compiling.
2541 MachineBasicBlock &Block,
2542 WaitcntBrackets &ScoreBrackets,
2543 MachineInstr *OldWaitcntInstr) {
2544 bool Modified = false;
2545
2546 if (OldWaitcntInstr)
2547 // Try to merge the required wait with preexisting waitcnt instructions.
2548 // Also erase redundant waitcnt.
2549 Modified =
2550 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2551
2552 // ExpCnt can be merged into VINTERP.
2553 if (Wait.get(AMDGPU::EXP_CNT) != ~0u && It != Block.instr_end() &&
// NOTE(review): listing line 2554 is missing here (numbering jumps
// 2553 -> 2555); it must complete this condition and open the if-body --
// restore from the upstream file before compiling.
2555 MachineOperand *WaitExp = TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
// The operand is only ever lowered here, never raised.
2556 if (Wait.get(AMDGPU::EXP_CNT) < WaitExp->getImm()) {
2557 WaitExp->setImm(Wait.get(AMDGPU::EXP_CNT));
2558 Modified = true;
2559 }
2560 // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
2561 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
2562 Wait.set(AMDGPU::EXP_CNT, ~0u);
2563
2564 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2565 << "Update Instr: " << *It);
2566 }
2567
2568 if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
2569 Modified = true;
2570
2571 // Any counts that could have been applied to any existing waitcnt
2572 // instructions will have been done so, now deal with any remaining.
2573 ScoreBrackets.applyWaitcnt(Wait);
2574
2575 return Modified;
2576}
2577
2578bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2579 return (TII.isFLAT(MI) && TII.mayAccessVMEMThroughFlat(MI)) ||
2580 (TII.isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2581}
2582
2583// Return true if the next instruction is S_ENDPGM, following fallthrough
2584// blocks if necessary.
2585bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2586 MachineBasicBlock *Block) const {
2587 auto BlockEnd = Block->getParent()->end();
2588 auto BlockIter = Block->getIterator();
2589
2590 while (true) {
2591 if (It.isEnd()) {
2592 if (++BlockIter != BlockEnd) {
2593 It = BlockIter->instr_begin();
2594 continue;
2595 }
2596
2597 return false;
2598 }
2599
2600 if (!It->isMetaInstruction())
2601 break;
2602
2603 It++;
2604 }
2605
2606 assert(!It.isEnd());
2607
2608 return It->getOpcode() == AMDGPU::S_ENDPGM;
2609}
2610
2611// Add a wait after an instruction if architecture requirements mandate one.
//
// Two cases are handled: an all-zero wait after loads/stores when
// ST.isPreciseMemoryEnabled(), and a DS_CNT(0) wait after an "always GDS"
// opcode. In the GDS case, if a wait was generated and the next instruction
// is S_ENDPGM, an S_NOP 0 is also inserted at the same position.
// Returns the result of generateWaitcnt().
2612bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2613 MachineBasicBlock &Block,
2614 WaitcntBrackets &ScoreBrackets) {
2615 AMDGPU::Waitcnt Wait;
2616 bool NeedsEndPGMCheck = false;
2617
2618 if (ST.isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2619 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
// NOTE(review): listing line 2620 is missing here (numbering jumps
// 2619 -> 2621); it must complete the argument expression and the call --
// restore from the upstream file before compiling.
2621
2622 if (TII.isAlwaysGDS(Inst.getOpcode())) {
2623 Wait.set(AMDGPU::DS_CNT, 0);
2624 NeedsEndPGMCheck = true;
2625 }
2626
// Drop any part of the wait that the brackets show is not actually pending.
2627 ScoreBrackets.simplifyWaitcnt(Wait);
2628
// The wait is placed before the instruction that follows Inst.
2629 auto SuccessorIt = std::next(Inst.getIterator());
2630 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2631 /*OldWaitcntInstr=*/nullptr);
2632
2633 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2634 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII.get(AMDGPU::S_NOP))
2635 .addImm(0);
2636 }
2637
2638 return Result;
2639}
2640
2641void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2642 WaitcntBrackets *ScoreBrackets) {
2643
2644 HWEvents InstEvents = AMDGPU::getEventsFor(Inst, ST, IsExpertMode);
2645 for (HWEvents E : InstEvents)
2646 ScoreBrackets->updateByEvent(E, Inst);
2647
2648 if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
2649 if (TII.isAlwaysGDS(Inst.getOpcode()) ||
2650 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2651 ScoreBrackets->setPendingGDS();
2652 }
2653 } else if (TII.isFLAT(Inst)) {
2654 if (Inst.mayLoadOrStore() && TII.mayAccessVMEMThroughFlat(Inst) &&
2655 TII.mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst)) {
2656 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
2657 // pointers. They do have two operands that each access global and LDS,
2658 // thus making it appear at this point that they are using a flat pointer.
2659 // Filter them out, and for the rest, generate a dependency on flat
2660 // pointers so that both VM and LGKM counters are flushed.
2661 ScoreBrackets->setPendingFlat();
2662 }
2663 } else if (Inst.isCall()) {
2664 // Act as a wait on everything, but AsyncCnt and TensorCnt are never
2665 // included in such blanket waits.
2666 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2667 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2668 } else if (TII.isVINTERP(Inst)) {
2669 int64_t Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2670 ScoreBrackets->applyWaitcnt(AMDGPU::EXP_CNT, Imm);
2671 }
2672
2673 // Set XCNT to zero in the bracket for instructions that implicitly drain
2674 // XCNT.
2675 if (ST.hasWaitXcnt() && SIInstrInfo::isXcntDrain(Inst))
2676 ScoreBrackets->applyWaitcnt(AMDGPU::X_CNT, 0);
2677}
2678
2679bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2680 unsigned OtherScore) {
2681 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2682 unsigned OtherShifted =
2683 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2684 Score = std::max(MyShifted, OtherShifted);
2685 return OtherShifted > MyShifted;
2686}
2687
2688bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
2689 ArrayRef<CounterValueArray> OtherMarks) {
2690 bool StrictDom = false;
2691
2692 LLVM_DEBUG(dbgs() << "Merging async marks ...");
2693 // Early exit: nothing to merge when both sides are empty.
2694 if (AsyncMarks.empty() && OtherMarks.empty()) {
2695 LLVM_DEBUG(dbgs() << " nothing to merge\n");
2696 return false;
2697 }
2698 LLVM_DEBUG(dbgs() << '\n');
2699
2700 // Determine maximum length needed after merging
2701 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size());
2702 MaxSize = std::min(MaxSize, MaxAsyncMarks);
2703
2704 // Keep only the most recent marks within our limit.
2705 if (AsyncMarks.size() > MaxSize)
2706 AsyncMarks.erase(AsyncMarks.begin(),
2707 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
2708
2709 // Pad with zero-filled marks if our list is shorter. Zero represents "no
2710 // pending async operations at this checkpoint" and acts as the identity
2711 // element for max() during merging. We pad at the beginning since the marks
2712 // need to be aligned in most-recent order.
2713 constexpr CounterValueArray ZeroMark{};
2714 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
2715
2716 LLVM_DEBUG({
2717 dbgs() << "Before merge:\n";
2718 for (const auto &Mark : AsyncMarks) {
2719 llvm::interleaveComma(Mark, dbgs());
2720 dbgs() << '\n';
2721 }
2722 dbgs() << "Other marks:\n";
2723 for (const auto &Mark : OtherMarks) {
2724 llvm::interleaveComma(Mark, dbgs());
2725 dbgs() << '\n';
2726 }
2727 });
2728
2729 // Merge element-wise using the existing mergeScore function and the
2730 // appropriate MergeInfo for each counter type. Iterate only while we have
2731 // elements in both vectors.
2732 unsigned OtherSize = OtherMarks.size();
2733 unsigned OurSize = AsyncMarks.size();
2734 unsigned MergeCount = std::min(OtherSize, OurSize);
2735 // OtherMarks is empty -> OtherSize == 0 -> MergeCount == 0.
2736 // Our existing marks are the conservative result; return early to avoid
2737 // passing MergeCount == 0 to seq_inclusive which asserts Begin <= End.
2738 if (MergeCount == 0)
2739 return StrictDom;
2740 for (auto Idx : seq_inclusive<unsigned>(1, MergeCount)) {
2741 for (auto T : inst_counter_types(Context->MaxCounter)) {
2742 StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
2743 OtherMarks[OtherSize - Idx][T]);
2744 }
2745 }
2746
2747 LLVM_DEBUG({
2748 dbgs() << "After merge:\n";
2749 for (const auto &Mark : AsyncMarks) {
2750 llvm::interleaveComma(Mark, dbgs());
2751 dbgs() << '\n';
2752 }
2753 });
2754
2755 return StrictDom;
2756}
2757
2758/// Merge the pending events and associated score brackets of \p Other into
2759/// this brackets status.
2760///
2761/// Returns whether the merge resulted in a change that requires tighter waits
2762/// (i.e. the merged brackets strictly dominate the original brackets).
2763bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2764 bool StrictDom = false;
2765
2766 // Check if "other" has keys we don't have, and create default entries for
2767 // those. If they remain empty after merging, we will clean it up after.
2768 for (auto K : Other.VMem.keys())
2769 VMem.try_emplace(K);
2770 for (auto K : Other.SGPRs.keys())
2771 SGPRs.try_emplace(K);
2772
2773 // Array to store MergeInfo for each counter type
 // It is filled in the loop below and reused afterwards for the async marks
 // and async scores.
2774 MergeInfo MergeInfos[AMDGPU::NUM_INST_CNTS];
2775
2776 for (auto T : inst_counter_types(Context->MaxCounter)) {
2777 // Merge event flags for this counter
 // Any event pending in Other but not here makes the merge strictly
 // dominating.
2778 const HWEvents &EventsForT = Context->getWaitEvents(T);
2779 const HWEvents OldEvents = PendingEvents & EventsForT;
2780 const HWEvents OtherEvents = Other.PendingEvents & EventsForT;
2781 if (!OldEvents.contains(OtherEvents))
2782 StrictDom = true;
2783 PendingEvents |= OtherEvents;
2784
2785 // Merge scores for this counter
 // Our lower bound is kept; the new upper bound leaves room for the larger
 // of the two pending ranges. Unsigned wrap-around is reported as fatal.
2786 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2787 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2788 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2789 if (NewUB < ScoreLBs[T])
2790 report_fatal_error("waitcnt score overflow");
2791
 // Each side's shift is the distance from its old upper bound to NewUB, so
 // that after shifting both upper bounds coincide at NewUB.
2792 MergeInfo &M = MergeInfos[T];
2793 M.OldLB = ScoreLBs[T];
2794 M.OtherLB = Other.ScoreLBs[T];
2795 M.MyShift = NewUB - ScoreUBs[T];
2796 M.OtherShift = NewUB - Other.ScoreUBs[T];
2797
2798 ScoreUBs[T] = NewUB;
2799
 // Counter-specific scalar scores that live outside the register maps.
2800 if (T == AMDGPU::LOAD_CNT)
2801 StrictDom |= mergeScore(M, LastFlatLoadCnt, Other.LastFlatLoadCnt);
2802
2803 if (T == AMDGPU::DS_CNT) {
2804 StrictDom |= mergeScore(M, LastFlatDsCnt, Other.LastFlatDsCnt);
2805 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
2806 }
2807
2808 if (T == AMDGPU::KM_CNT) {
2809 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
 // Pending SCC write: adopt Other's instruction when we had no SCC write
 // pending; when both sides have one and they differ, clear it to null.
2810 if (Other.hasPendingEvent(HWEvents::SCC_WRITE)) {
2811 if (!(OldEvents & HWEvents::SCC_WRITE)) {
2812 PendingSCCWrite = Other.PendingSCCWrite;
2813 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
2814 PendingSCCWrite = nullptr;
2815 }
2816 }
2817 }
2818
 // Per-register VMem scores for this counter.
2819 for (auto &[RegID, Info] : VMem)
2820 StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
2821
 // SGPR scores are merged only for SMEM counters; an SGPR missing from Other
 // contributes a score of 0.
2822 if (isSmemCounter(T)) {
2823 for (auto &[RegID, Info] : SGPRs) {
2824 auto It = Other.SGPRs.find(RegID);
2825 unsigned OtherScore = (It != Other.SGPRs.end()) ? It->second.get(T) : 0;
2826 StrictDom |= mergeScore(M, Info.get(T), OtherScore);
2827 }
2828 }
2829 }
2830
 // Union the per-register pending event sets; any newly added event makes the
 // merge strictly dominating.
2831 for (auto &[TID, Info] : VMem) {
2832 if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
2833 HWEvents NewVGPRContext =
2834 Info.VGPRPendingEvents | It->second.VGPRPendingEvents;
2835 StrictDom |= NewVGPRContext != Info.VGPRPendingEvents;
2836 Info.VGPRPendingEvents = NewVGPRContext;
2837 }
2838 }
2839
 // Async marks and async scores reuse the MergeInfo computed above.
2840 StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks);
2841 for (auto T : inst_counter_types(Context->MaxCounter))
2842 StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]);
2843
 // Clean-up step announced at the top of this function for the default
 // entries created there.
2844 purgeEmptyTrackingData();
2845 return StrictDom;
2846}
2847
2848static bool isWaitInstr(MachineInstr &Inst) {
2849 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2850 return Opcode == AMDGPU::S_WAITCNT ||
2851 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2852 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2853 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2854 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2855 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2856 Opcode == AMDGPU::WAIT_ASYNCMARK ||
2857 AMDGPU::counterTypeForInstr(Opcode).has_value();
2858}
2859
// Insert an S_SETREG_IMM32_B32 into \p MBB at iterator I with an empty debug
// location. The first immediate is 2 when \p ExpertMode is true and 0
// otherwise; the second immediate is the encoded hardware register.
// NOTE(review): the declaration of I and the arguments of
// HwregEncoding::encode() are not visible in this listing (source lines are
// missing) - confirm which hardware register field is written.
2860void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
2862 bool ExpertMode) const {
2863 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
2865 BuildMI(MBB, I, DebugLoc(), TII.get(AMDGPU::S_SETREG_IMM32_B32))
2866 .addImm(ExpertMode ? 2 : 0)
2867 .addImm(EncodedReg);
2868}
2869
2870namespace {
2871// TODO: Remove this work-around after fixing the scheduler.
2872// There are two reasons why vccz might be incorrect; see ST.hasReadVCCZBug()
2873// and ST.partialVCCWritesUpdateVCCZ().
2874// i. VCCZBug: There is a hardware bug on CI/SI where SMRD instruction may
2875// corrupt vccz bit, so when we detect that an instruction may read from
2876// a corrupt vccz bit, we need to:
2877// 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2878// operations to complete.
2879// 2. Recompute the correct value of vccz by writing the current value
2880// of vcc back to vcc.
2881// ii. Partial writes to vcc don't update vccz, so we need to recompute the
2882// correct value of vccz by reading vcc and writing it back to vcc.
2883// No waitcnt is needed in this case.
//
// The class keeps a single flag, MustRecomputeVCCZ, that is updated for every
// instruction passed to tryRecomputeVCCZ() and consulted when a vccz-reading
// branch is seen.
2884class VCCZWorkaround {
 // Queried for pending SMEM_ACCESS events; never modified by this class.
2885 const WaitcntBrackets &ScoreBrackets;
2886 const GCNSubtarget &ST;
2887 const SIInstrInfo &TII;
2888 const SIRegisterInfo &TRI;
 // Cached ST.hasReadVCCZBug().
2889 bool VCCZCorruptionBug = false;
 // Cached !ST.partialVCCWritesUpdateVCCZ().
2890 bool VCCZNotUpdatedByPartialWrites = false;
2891 /// vccz could be incorrect at a basic block boundary if a predecessor wrote
2892 /// to vcc and then issued an smem load, so initialize to true.
2893 bool MustRecomputeVCCZ = true;
2894
2895public:
2896 VCCZWorkaround(const WaitcntBrackets &ScoreBrackets, const GCNSubtarget &ST,
2897 const SIInstrInfo &TII, const SIRegisterInfo &TRI)
2898 : ScoreBrackets(ScoreBrackets), ST(ST), TII(TII), TRI(TRI) {
2899 VCCZCorruptionBug = ST.hasReadVCCZBug();
2900 VCCZNotUpdatedByPartialWrites = !ST.partialVCCWritesUpdateVCCZ();
2901 }
2902 /// If \p MI reads vccz and we must recompute it based on MustRecomputeVCCZ,
2903 /// then emit a vccz recompute instruction before \p MI. This needs to be
2904 /// called on every instruction in the basic block because it also tracks the
2905 /// state and updates MustRecomputeVCCZ accordingly. Returns true if it
2906 /// modified the IR.
2907 bool tryRecomputeVCCZ(MachineInstr &MI) {
2908 // No need to run this if neither bug is present.
2909 if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
2910 return false;
2911
2912 // If MI is an SMEM and it can corrupt vccz on this target, then we need
2913 // both to emit a waitcnt and to recompute vccz.
2914 // But we don't actually emit a waitcnt here. This is done in
2915 // generateWaitcntInstBefore() because it tracks all the necessary waitcnt
2916 // state, and can either skip emitting a waitcnt if there is already one in
2917 // the IR, or emit an "optimized" combined waitcnt.
2918 // If this is an smem read, it could complete and clobber vccz at any time.
2919 MustRecomputeVCCZ |= VCCZCorruptionBug && TII.isSMRD(MI);
2920
2921 // If the target partial vcc writes don't update vccz, and MI is such an
2922 // instruction then we must recompute vccz.
2923 // Note: We are using PartiallyWritesToVCCOpt optional to avoid calling
2924 // `definesRegister()` more than needed, because it's not very cheap.
2925 std::optional<bool> PartiallyWritesToVCCOpt;
 // True if MI defines VCC_LO or VCC_HI.
2926 auto PartiallyWritesToVCC = [](MachineInstr &MI) {
2927 return MI.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2928 MI.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr);
2929 };
2930 if (VCCZNotUpdatedByPartialWrites) {
2931 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
2932 // If this is a partial VCC write but won't update vccz, then we must
2933 // recompute vccz.
2934 MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
2935 }
2936
2937 // If MI is a vcc write with no pending smem, or there is a pending smem
2938 // but the target does not suffer from the vccz corruption bug, then we
2939 // don't need to recompute vccz as this write will recompute it anyway.
2940 if (!ScoreBrackets.hasPendingEvent(HWEvents::SMEM_ACCESS) ||
2941 !VCCZCorruptionBug) {
2942 // Compute PartiallyWritesToVCCOpt if we haven't done so already.
2943 if (!PartiallyWritesToVCCOpt)
2944 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
2945 bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
2946 MI.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr);
2947 // If we write to the full vcc or we write partially and the target
2948 // updates vccz on partial writes, then vccz will be updated correctly.
2949 bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
2950 *PartiallyWritesToVCCOpt);
2951 if (UpdatesVCCZ)
2952 MustRecomputeVCCZ = false;
2953 }
2954
2955 // If MI is a branch that reads VCCZ and a recompute is pending, emit the
2956 // vccz restore instruction before it (no waitcnt is emitted here, see above).
2957 if (SIInstrInfo::isCBranchVCCZRead(MI) && MustRecomputeVCCZ) {
2958 // Recompute the vccz bit. Any time a value is written to vcc, the vccz
2959 // bit is updated, so we can restore the bit by reading the value of vcc
2960 // and then writing it back to the register.
 // The move width follows the wave size: S_MOV_B32 on wave32, S_MOV_B64
 // otherwise.
2961 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2962 TII.get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2963 TRI.getVCC())
2964 .addReg(TRI.getVCC());
2965 MustRecomputeVCCZ = false;
2966 return true;
2967 }
2968 return false;
2969 }
2970};
2971
2972} // namespace
2973
2974// Generate s_waitcnt instructions where needed.
// Walks every instruction of \p Block in order, generating waits before each
// non-wait instruction and updating \p ScoreBrackets afterwards, then handles
// any waits left over at the end of the block. Returns true if any step
// reported a modification.
2975bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2976 MachineBasicBlock &Block,
2977 WaitcntBrackets &ScoreBrackets) {
2978 bool Modified = false;
2979
2980 LLVM_DEBUG({
2981 dbgs() << "*** Begin Block: ";
2982 Block.printName(dbgs());
2983 ScoreBrackets.dump();
2984 });
 // A fresh workaround object per block: its state starts out as "must
 // recompute vccz".
2985 VCCZWorkaround VCCZW(ScoreBrackets, ST, TII, TRI);
2986
2987 // Walk over the instructions.
 // First wait instruction of the current run of pre-existing waits; it is
 // handed to generateWaitcntInstBefore() for the next non-wait instruction
 // and then reset.
2988 MachineInstr *OldWaitcntInstr = nullptr;
2989
2990 // NOTE: We may append instrs after Inst while iterating.
2991 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2992 E = Block.instr_end();
2993 Iter != E; ++Iter) {
2994 MachineInstr &Inst = *Iter;
2995 if (isNonWaitcntMetaInst(Inst))
2996 continue;
2997 // Track pre-existing waitcnts that were added in earlier iterations or by
2998 // the memory legalizer.
 // In expert mode S_WAITCNT_DEPCTR is tracked the same way.
2999 if (isWaitInstr(Inst) ||
3000 (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
3001 if (!OldWaitcntInstr)
3002 OldWaitcntInstr = &Inst;
3003 continue;
3004 }
3005
 // Preheader flush flags are only looked up when Inst is the block's first
 // terminator; otherwise the default-constructed flags are used.
3006 PreheaderFlushFlags FlushFlags;
3007 if (Block.getFirstTerminator() == Inst)
3008 FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3009
3010 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
3011 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3012 FlushFlags);
3013 OldWaitcntInstr = nullptr;
3014
3015 if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
3016 // Asyncmarks record the current wait state and so should not allow
3017 // waitcnts that occur after them to be merged into waitcnts that occur
3018 // before.
3019 ScoreBrackets.recordAsyncMark(Inst);
3020 continue;
3021 }
3022
 // Remember the (pointer value, block) pair of every non-invariant SMRD
 // memory operand in SLoadAddresses.
3023 if (TII.isSMRD(Inst)) {
3024 for (const MachineMemOperand *Memop : Inst.memoperands()) {
3025 // No need to handle invariant loads when avoiding WAR conflicts, as
3026 // there cannot be a vector store to the same memory location.
3027 if (!Memop->isInvariant()) {
3028 const Value *Ptr = Memop->getValue();
3029 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
3030 }
3031 }
3032 }
3033
3034 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3035
3036 // Note: insertForcedWaitAfter() may add instrs after Iter that need to be
3037 // visited by the loop.
3038 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
3039
3040 LLVM_DEBUG({
3041 Inst.print(dbgs());
3042 ScoreBrackets.dump();
3043 });
3044
3045 // If the target suffers from the vccz bugs, this may emit the necessary
3046 // vccz recompute instruction before \p Inst if needed.
3047 Modified |= VCCZW.tryRecomputeVCCZ(Inst);
3048 }
3049
3050 // Flush counters at the end of the block if needed (for preheaders with no
3051 // terminator).
 // Only counters that actually have pending events are set to zero.
3052 AMDGPU::Waitcnt Wait;
3053 if (Block.getFirstTerminator() == Block.end()) {
3054 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3055 if (FlushFlags.FlushVmCnt) {
3056 if (ScoreBrackets.hasPendingEvent(AMDGPU::LOAD_CNT))
3057 Wait.set(AMDGPU::LOAD_CNT, 0);
3058 if (ScoreBrackets.hasPendingEvent(AMDGPU::SAMPLE_CNT))
3059 Wait.set(AMDGPU::SAMPLE_CNT, 0);
3060 if (ScoreBrackets.hasPendingEvent(AMDGPU::BVH_CNT))
3061 Wait.set(AMDGPU::BVH_CNT, 0);
3062 }
3063 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
3064 Wait.set(AMDGPU::DS_CNT, 0);
3065 }
3066
3067 // Combine or remove any redundant waitcnts at the end of the block.
 // OldWaitcntInstr is non-null here only if the block ended in a run of
 // pre-existing wait instructions.
3068 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
3069 OldWaitcntInstr);
3070
3071 LLVM_DEBUG({
3072 dbgs() << "*** End Block: ";
3073 Block.printName(dbgs());
3074 ScoreBrackets.dump();
3075 });
3076
3077 return Modified;
3078}
3079
3080bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
3081 if (Block.size() <= 1)
3082 return false;
3083 // The Memory Legalizer conservatively inserts a soft xcnt before each
3084 // atomic RMW operation. However, for sequences of back-to-back atomic
3085 // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
3086 // the redundant soft xcnts.
3087 bool Modified = false;
3088 // Remember the last atomic with a soft xcnt right before it.
3089 MachineInstr *LastAtomicWithSoftXcnt = nullptr;
3090
3091 for (MachineInstr &MI : drop_begin(Block)) {
3092 // Ignore last atomic if non-LDS VMEM and SMEM.
3093 bool IsLDS =
3094 TII.isDS(MI) || (TII.isFLAT(MI) && TII.mayAccessLDSThroughFlat(MI));
3095 if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
3096 LastAtomicWithSoftXcnt = nullptr;
3097
3098 bool IsAtomicRMW =
3099 SIInstrFlags::isMaybeAtomic(MI) && MI.mayLoad() && MI.mayStore();
3100 MachineInstr &PrevMI = *MI.getPrevNode();
3101 // This is an atomic with a soft xcnt.
3102 if (PrevMI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3103 // If we have already found an atomic with a soft xcnt, remove this soft
3104 // xcnt as it's redundant.
3105 if (LastAtomicWithSoftXcnt) {
3106 PrevMI.eraseFromParent();
3107 Modified = true;
3108 }
3109 LastAtomicWithSoftXcnt = &MI;
3110 }
3111 }
3112 return Modified;
3113}
3114
3115// Return flags indicating which counters should be flushed in the preheader.
3116PreheaderFlushFlags
3117SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
3118 const WaitcntBrackets &ScoreBrackets) {
3119 auto [Iterator, IsInserted] =
3120 PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags());
3121 if (!IsInserted)
3122 return Iterator->second;
3123
3124 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3125 if (!Succ)
3126 return PreheaderFlushFlags();
3127
3128 MachineLoop *Loop = MLI.getLoopFor(Succ);
3129 if (!Loop)
3130 return PreheaderFlushFlags();
3131
3132 if (Loop->getLoopPreheader() == &MBB) {
3133 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3134 return Iterator->second;
3135 }
3136
3137 return PreheaderFlushFlags();
3138}
3139
// Classifies \p MI using TII.mayAccessVMEMThroughFlat() and
// SIInstrInfo::isVMEM().
// NOTE(review): the line guarding the first return is missing from this
// listing; presumably it is a check that MI is a FLAT instruction, so that
// FLAT instructions answer via mayAccessVMEMThroughFlat() and everything else
// via isVMEM() - confirm against the real file.
3140bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
3142 return TII.mayAccessVMEMThroughFlat(MI);
3143 return SIInstrInfo::isVMEM(MI);
3144}
3145
3146bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
3147 return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
3148}
3149
3150// Check if instruction is a store to LDS that is counted via DSCNT
3151// (where that counter exists).
3152bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3153 return MI.mayStore() && SIInstrInfo::isDS(MI);
3154}
3155
3156// Return flags indicating which counters should be flushed in the preheader of
3157// the given loop. We currently decide to flush in the following situations:
3158// For VMEM (FlushVmCnt):
3159// 1. The loop contains vmem store(s), no vmem load and at least one use of a
3160// vgpr containing a value that is loaded outside of the loop. (Only on
3161// targets with no vscnt counter).
3162// 2. The loop contains vmem load(s), but the loaded values are not used in the
3163// loop, and at least one use of a vgpr containing a value that is loaded
3164// outside of the loop.
3165// For DS (FlushDsCnt, GFX12+ only):
3166// 3. The loop contains no DS reads, and at least one use of a vgpr containing
3167// a value that is DS read outside of the loop.
3168// 4. The loop contains DS read(s), loaded values are not used in the same
3169// iteration but in the next iteration (prefetch pattern), and at least one
3170// use of a vgpr containing a value that is DS read outside of the loop.
3171// Flushing in preheader reduces wait overhead if the wait requirement in
3172// iteration 1 would otherwise be more strict (but unfortunately preheader
3173// flush decision is taken before knowing that).
3174// 5. (Single-block loops only) The loop has DS prefetch reads with flush point
3175// tracking. Some DS reads may be used in the same iteration (creating
3176// "flush points"), but others remain unflushed at the backedge. When a DS
3177// read is consumed in the same iteration, it and all prior reads are
3178// "flushed" (FIFO order). No DS writes are allowed in the loop.
3179// TODO: Find a way to extend to multi-block loops.
3180PreheaderFlushFlags
3181SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
3182 const WaitcntBrackets &Brackets) {
3183 PreheaderFlushFlags Flags;
3184 bool HasVMemLoad = false;
3185 bool HasVMemStore = false;
3186 bool UsesVgprVMEMLoadedOutside = false;
3187 bool UsesVgprDSReadOutside = false;
 // Set once situations 1. and 2. above can no longer apply.
3188 bool VMemInvalidated = false;
3189 // DS optimization only applies to GFX12+ where DS_CNT is separate.
3190 // Tracking status for "no DS read in loop" or "pure DS prefetch
3191 // (use only in next iteration)".
3192 bool TrackSimpleDSOpt = ST.hasExtendedWaitCounts();
 // Register units seen as vector-register uses, as VMEM load defs and as DS
 // read defs inside the loop so far.
3193 DenseSet<MCRegUnit> VgprUse;
3194 DenseSet<MCRegUnit> VgprDefVMEM;
3195 DenseSet<MCRegUnit> VgprDefDS;
3196
3197 // Track DS reads for prefetch pattern with flush points (single-block only).
3198 // Keeps track of the last DS read (position counted from the top of the loop)
3199 // to each VGPR. Read is considered consumed (and thus needs flushing) if
3200 // the dest register has a use or is overwritten (by any later operations).
3201 DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
3202 unsigned DSReadPosition = 0;
3203 bool IsSingleBlock = ML->getNumBlocks() == 1;
3204 bool TrackDSFlushPoint = ST.hasExtendedWaitCounts() && IsSingleBlock;
3205 unsigned LastDSFlushPosition = 0;
3206
 // Single pass over every instruction of every block of the loop.
3207 for (MachineBasicBlock *MBB : ML->blocks()) {
3208 for (MachineInstr &MI : *MBB) {
3209 if (isVMEMOrFlatVMEM(MI)) {
3210 HasVMemLoad |= MI.mayLoad();
3211 HasVMemStore |= MI.mayStore();
3212 }
3213 // TODO: Can we relax DSStore check? There may be cases where
3214 // these DS stores are drained prior to the end of MBB (or loop).
3215 if (mayStoreIncrementingDSCNT(MI)) {
3216 // Early exit if none of the optimizations are feasible.
3217 // Otherwise, set tracking status appropriately and continue.
3218 if (VMemInvalidated)
3219 return Flags;
3220 TrackSimpleDSOpt = false;
3221 TrackDSFlushPoint = false;
3222 }
 // DS read positions are 1-based: the first DS read in the loop gets
 // position 1, so 0 means "no flush point yet" for LastDSFlushPosition.
3223 bool IsDSRead = isDSRead(MI);
3224 if (IsDSRead)
3225 ++DSReadPosition;
3226
3227 // Helper: if RU has a pending DS read, update LastDSFlushPosition
3228 auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3229 if (!TrackDSFlushPoint)
3230 return;
3231 if (auto It = LastDSReadPositionMap.find(RU);
3232 It != LastDSReadPositionMap.end()) {
3233 // RU defined by DSRead is used or overwritten. Need to complete
3234 // the read, if not already implied by a later DSRead (to any RU)
3235 // needing to complete in FIFO order.
3236 LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
3237 }
3238 };
3239
 // Uses: only non-debug vector-register operands are considered.
3240 for (const MachineOperand &Op : MI.all_uses()) {
3241 if (Op.isDebug() || !TRI.isVectorRegister(MRI, Op.getReg()))
3242 continue;
3243 // Vgpr use
3244 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3245 // If we find a register that is loaded inside the loop, 1. and 2.
3246 // are invalidated.
3247 if (VgprDefVMEM.contains(RU))
3248 VMemInvalidated = true;
3249
3250 // Check for DS reads used inside the loop
3251 if (VgprDefDS.contains(RU))
3252 TrackSimpleDSOpt = false;
3253
3254 // Early exit if all optimizations are invalidated
3255 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3256 return Flags;
3257
3258 // Check for flush points (DS read used in same iteration)
3259 updateDSReadFlushTracking(RU);
3260
3261 VgprUse.insert(RU);
3262 // Check if this register has a pending VMEM load from outside the
3263 // loop (value loaded outside and used inside).
3264 VMEMID ID = toVMEMID(RU);
3265 if (Brackets.hasPendingVMEM(ID, AMDGPU::LOAD_CNT) ||
3266 Brackets.hasPendingVMEM(ID, AMDGPU::SAMPLE_CNT) ||
3267 Brackets.hasPendingVMEM(ID, AMDGPU::BVH_CNT))
3268 UsesVgprVMEMLoadedOutside = true;
3269 // Check if loaded outside the loop via DS (not VMEM/FLAT).
3270 // Only consider it a DS read if there's no pending VMEM load for
3271 // this register, since FLAT can set both counters.
3272 else if (Brackets.hasPendingVMEM(ID, AMDGPU::DS_CNT))
3273 UsesVgprDSReadOutside = true;
3274 }
3275 }
3276
3277 // VMem load vgpr def
3278 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
3279 for (const MachineOperand &Op : MI.all_defs()) {
3280 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3281 // If we find a register that is loaded inside the loop, 1. and 2.
3282 // are invalidated.
3283 if (VgprUse.contains(RU))
3284 VMemInvalidated = true;
3285 VgprDefVMEM.insert(RU);
3286 }
3287 }
3288 // Early exit if all optimizations are invalidated
3289 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3290 return Flags;
3291 }
3292
3293 // DS read vgpr def
3294 // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RU).
3295 // If USE comes before DEF, it's the prefetch pattern (use value from
3296 // previous iteration, read for next iteration). We should still flush
3297 // in preheader so iteration 1 doesn't need to wait inside the loop.
3298 // Only invalidate when DEF comes before USE (same-iteration consumption,
3299 // checked above when processing uses).
 // While flush points are tracked, the defs of every instruction are
 // visited (not only DS reads) so that overwrites are noticed.
3300 if (IsDSRead || TrackDSFlushPoint) {
3301 for (const MachineOperand &Op : MI.all_defs()) {
3302 if (!TRI.isVectorRegister(MRI, Op.getReg()))
3303 continue;
3304 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3305 // Check for overwrite of pending DS read (flush point) by any
3306 // instruction
3307 updateDSReadFlushTracking(RU);
3308 if (IsDSRead) {
3309 VgprDefDS.insert(RU);
3310 if (TrackDSFlushPoint)
3311 LastDSReadPositionMap[RU] = DSReadPosition;
3312 }
3313 }
3314 }
3315 }
3316 }
3317 }
3318
3319 // VMEM flush decision
 // First alternative is situation 1. (stores only, no vscnt counter); the
 // second is situation 2. and additionally requires
 // ST.hasVmemWriteVgprInOrder().
3320 if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3321 ((!ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3322 (HasVMemLoad && ST.hasVmemWriteVgprInOrder())))
3323 Flags.FlushVmCnt = true;
3324
3325 // DS flush decision:
3326 // Simple DS Opt: flush if loop uses DS read values from outside
3327 // and either has no DS reads in the loop, or DS reads whose results
3328 // are not used in the loop.
3329 bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3330 // Prefetch with flush points: some DS reads used in same iteration,
3331 // but unflushed reads remain at backedge
 // True when at least one DS read was issued after the last flush point.
3332 bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3333 bool DSFlushPointPrefetch =
3334 TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3335
3336 if (SimpleDSOpt || DSFlushPointPrefetch)
3337 Flags.FlushDsCnt = true;
3338
3339 return Flags;
3340}
3341
3342bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3343 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3344 auto &PDT =
3345 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3346 AliasAnalysis *AA = nullptr;
3347 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3348 AA = &AAR->getAAResults();
3349
3350 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3351}
3352
// New pass manager entry point: fetches the MachineLoopAnalysis and
// MachinePostDominatorTreeAnalysis results for MF, looks up a cached AAManager
// result for the IR function, and runs SIInsertWaitcnts. If run() reports no
// change, all analyses are preserved.
// NOTE(review): the function signature, the start of the AA lookup and the
// start of the final return statement are missing from this listing - confirm
// the exact set of preserved analyses against the real file.
3353PreservedAnalyses
3356 auto &MLI = MFAM.getResult<MachineLoopAnalysis>(MF);
3357 auto &PDT = MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
3359 .getManager()
3360 .getCachedResult<AAManager>(MF.getFunction());
3361
3362 if (!SIInsertWaitcnts(MLI, PDT, AA, MF).run())
3363 return PreservedAnalyses::all();
3364
3367 .preserve<AAManager>();
3368}
3369
3370bool SIInsertWaitcnts::run() {
3372
3374
3375 // Initialize hardware limits first, as they're needed by the generators.
3376 Limits = AMDGPU::HardwareLimits(IV);
3377
3378 if (ST.hasExtendedWaitCounts()) {
3379 IsExpertMode = ST.hasExpertSchedulingMode() &&
3380 (ExpertSchedulingModeFlag.getNumOccurrences()
3382 : MF.getFunction()
3383 .getFnAttribute("amdgpu-expert-scheduling-mode")
3384 .getValueAsBool());
3385 MaxCounter = IsExpertMode ? AMDGPU::NUM_EXPERT_INST_CNTS
3387 // Initialize WCG per MF. It contains state that depends on MF attributes.
3388 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,
3389 IsExpertMode);
3390 } else {
3391 MaxCounter = AMDGPU::NUM_NORMAL_INST_CNTS;
3392 // Initialize WCG per MF. It contains state that depends on MF attributes.
3393 WCG = std::make_unique<WaitcntGeneratorPreGFX12>(
3394 MF, AMDGPU::NUM_NORMAL_INST_CNTS, Limits);
3395 }
3396
3397 SmemAccessCounter = getCounterFromEvent(HWEvents::SMEM_ACCESS);
3398
3399 bool Modified = false;
3400
3401 MachineBasicBlock &EntryBB = MF.front();
3402
3403 if (!MFI->isEntryFunction() &&
3404 !MF.getFunction().hasFnAttribute(Attribute::Naked)) {
3405 // Wait for any outstanding memory operations that the input registers may
3406 // depend on. We can't track them and it's better to do the wait after the
3407 // costly call sequence.
3408
3409 // TODO: Could insert earlier and schedule more liberally with operations
3410 // that only use caller preserved registers.
3412 while (I != EntryBB.end() && I->isMetaInstruction())
3413 ++I;
3414
3415 if (ST.hasExtendedWaitCounts()) {
3416 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
3417 .addImm(0);
3419 if (CT == AMDGPU::LOAD_CNT || CT == AMDGPU::DS_CNT ||
3420 CT == AMDGPU::STORE_CNT || CT == AMDGPU::X_CNT ||
3422 continue;
3423
3424 if (!ST.hasImageInsts() &&
3425 (CT == AMDGPU::EXP_CNT || CT == AMDGPU::SAMPLE_CNT ||
3426 CT == AMDGPU::BVH_CNT))
3427 continue;
3428
3429 BuildMI(EntryBB, I, DebugLoc(),
3430 TII.get(instrsForExtendedCounterTypes[CT]))
3431 .addImm(0);
3432 }
3433 if (IsExpertMode) {
3434 unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, ST);
3436 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3437 .addImm(Enc);
3438 }
3439 } else {
3440 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT)).addImm(0);
3441 }
3442
3443 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
3444 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3445 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3446
3447 Modified = true;
3448 }
3449
3450 // Keep iterating over the blocks in reverse post order, inserting and
3451 // updating s_waitcnt where needed, until a fix point is reached.
3452 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3453 BlockInfos.try_emplace(MBB);
3454
3455 std::unique_ptr<WaitcntBrackets> Brackets;
3456 bool Repeat;
3457 do {
3458 Repeat = false;
3459
3460 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
3461 ++BII) {
3462 MachineBasicBlock *MBB = BII->first;
3463 BlockInfo &BI = BII->second;
3464 if (!BI.Dirty)
3465 continue;
3466
3467 if (BI.Incoming) {
3468 if (!Brackets)
3469 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3470 else
3471 *Brackets = *BI.Incoming;
3472 } else {
3473 if (!Brackets) {
3474 Brackets = std::make_unique<WaitcntBrackets>(this);
3475 } else {
3476 // Reinitialize in-place. N.B. do not do this by assigning from a
3477 // temporary because the WaitcntBrackets class is large and it could
3478 // cause this function to use an unreasonable amount of stack space.
3479 Brackets->~WaitcntBrackets();
3480 new (Brackets.get()) WaitcntBrackets(this);
3481 }
3482 }
3483
3484 if (ST.hasWaitXcnt())
3485 Modified |= removeRedundantSoftXcnts(*MBB);
3486 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
3487 BI.Dirty = false;
3488
3489 if (Brackets->hasPendingEvent()) {
3490 BlockInfo *MoveBracketsToSucc = nullptr;
3491 for (MachineBasicBlock *Succ : MBB->successors()) {
3492 auto *SuccBII = BlockInfos.find(Succ);
3493 BlockInfo &SuccBI = SuccBII->second;
3494 if (!SuccBI.Incoming) {
3495 SuccBI.Dirty = true;
3496 if (SuccBII <= BII) {
3497 LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");
3498 Repeat = true;
3499 }
3500 if (!MoveBracketsToSucc) {
3501 MoveBracketsToSucc = &SuccBI;
3502 } else {
3503 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3504 }
3505 } else {
3506 LLVM_DEBUG({
3507 dbgs() << "Try to merge ";
3508 MBB->printName(dbgs());
3509 dbgs() << " into ";
3510 Succ->printName(dbgs());
3511 dbgs() << '\n';
3512 });
3513 if (SuccBI.Incoming->merge(*Brackets)) {
3514 SuccBI.Dirty = true;
3515 if (SuccBII <= BII) {
3516 LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");
3517 Repeat = true;
3518 }
3519 }
3520 }
3521 }
3522 if (MoveBracketsToSucc)
3523 MoveBracketsToSucc->Incoming = std::move(Brackets);
3524 }
3525 }
3526 } while (Repeat);
3527
3528 if (ST.hasScalarStores()) {
3529 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3530 bool HaveScalarStores = false;
3531
3532 for (MachineBasicBlock &MBB : MF) {
3533 for (MachineInstr &MI : MBB) {
3534 if (!HaveScalarStores && TII.isScalarStore(MI))
3535 HaveScalarStores = true;
3536
3537 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
3538 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3539 EndPgmBlocks.push_back(&MBB);
3540 }
3541 }
3542
3543 if (HaveScalarStores) {
3544 // If scalar writes are used, the cache must be flushed or else the next
3545 // wave to reuse the same scratch memory can be clobbered.
3546 //
3547 // Insert s_dcache_wb at wave termination points if there were any scalar
3548 // stores, and only if the cache hasn't already been flushed. This could
3549 // be improved by looking across blocks for flushes in postdominating
3550 // blocks from the stores but an explicitly requested flush is probably
3551 // very rare.
3552 for (MachineBasicBlock *MBB : EndPgmBlocks) {
3553 bool SeenDCacheWB = false;
3554
3555 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
3556 I != E; ++I) {
3557 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
3558 SeenDCacheWB = true;
3559 else if (TII.isScalarStore(*I))
3560 SeenDCacheWB = false;
3561
3562 // FIXME: It would be better to insert this before a waitcnt if any.
3563 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
3564 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3565 !SeenDCacheWB) {
3566 Modified = true;
3567 BuildMI(*MBB, I, I->getDebugLoc(), TII.get(AMDGPU::S_DCACHE_WB));
3568 }
3569 }
3570 }
3571 }
3572 }
3573
3574 if (IsExpertMode) {
3575 // Enable expert scheduling on function entry. To satisfy ABI requirements
3576 // and to allow calls between functions with different expert scheduling
3577 // settings, disable it around calls and before returns.
3578
3580 while (I != EntryBB.end() && I->isMetaInstruction())
3581 ++I;
3582 setSchedulingMode(EntryBB, I, true);
3583
3584 for (MachineInstr *MI : CallInsts) {
3585 MachineBasicBlock &MBB = *MI->getParent();
3586 setSchedulingMode(MBB, MI, false);
3587 setSchedulingMode(MBB, std::next(MI->getIterator()), true);
3588 }
3589
3590 for (MachineInstr *MI : ReturnInsts)
3591 setSchedulingMode(*MI->getParent(), MI, false);
3592
3593 Modified = true;
3594 }
3595
3596 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
3597 // This is done in different ways depending on how the VGPRs were allocated
3598 // (i.e. whether we're in dynamic VGPR mode or not).
3599 // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
3600 // waveslot limited kernel runs slower with the deallocation.
3601 if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
3602 for (auto [MI, _] : EndPgmInsts) {
3603 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3604 TII.get(AMDGPU::S_ALLOC_VGPR))
3605 .addImm(0);
3606 Modified = true;
3607 }
3608 } else if (!WCG->isOptNone() &&
3609 ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
3610 (MF.getFrameInfo().hasCalls() ||
3611 ST.getOccupancyWithNumVGPRs(
3612 TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),
3613 /*IsDynamicVGPR=*/false) <
3615 for (auto [MI, Flag] : EndPgmInsts) {
3616 if (Flag) {
3617 if (ST.requiresNopBeforeDeallocVGPRs()) {
3618 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3619 TII.get(AMDGPU::S_NOP))
3620 .addImm(0);
3621 }
3622 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3623 TII.get(AMDGPU::S_SENDMSG))
3625 Modified = true;
3626 }
3627 }
3628 }
3629
3630 return Modified;
3631}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
#define _
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
AMDGPU::HWEvents HWEvents
Provides some synthesis utilities to produce sequences of values.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
Bit mask of hardware events.
constexpr unsigned size() const
constexpr bool contains(HWEvents Other) const
constexpr bool any() const
unsigned get(InstCounterType T) const
void set(InstCounterType T, unsigned Val)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:275
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301
bool erase(const KeyT &Val)
Definition DenseMap.h:379
iterator end()
Definition DenseMap.h:143
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:758
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:723
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as:
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
mop_range operands()
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
iterator begin()
Definition MapVector.h:67
iterator find(const KeyT &Key)
Definition MapVector.h:156
iterator end()
Definition MapVector.h:69
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
Definition Pass.cpp:140
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isXcntDrain(const MachineInstr &MI)
True if MI implicitly drains XCNT.
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool usesTENSOR_CNT(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool usesASYNC_CNT(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void push_back(const T &Elt)
Target - Wrapper for Target specific information.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:185
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned getMaxWavesPerEU(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getHasMatrixScale(unsigned Opc)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded)
HWEvents getSimplifiedVMEMEventsFor(const MachineInstr &Inst, const SIInstrInfo &TII)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
std::optional< AMDGPU::InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
HWEvents getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST, bool IsExpertMode)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
constexpr bool isMaybeAtomic(const T &...O)
Definition SIDefines.h:315
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
bool empty() const
Definition BasicBlock.h:101
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
Definition Sequence.h:325
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Wait
Definition Threading.h:60
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2313
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
Definition InstrProf.h:143
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ Async
"Asynchronous" unwind tables (instr precise)
Definition CodeGen.h:157
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Count
Definition InstrProf.h:145
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
#define N
static constexpr ValueType Default
static constexpr uint64_t encode(Fields... Values)
Represents the hardware counter limits for different wait count types.
Instruction set architecture version.