LLVM 23.0.0git
SIInsertWaitcnts.cpp
Go to the documentation of this file.
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "AMDGPUHWEvents.h"
28#include "AMDGPUWaitcntUtils.h"
29#include "GCNSubtarget.h"
33#include "llvm/ADT/MapVector.h"
35#include "llvm/ADT/Sequence.h"
41#include "llvm/IR/Dominators.h"
44
45using namespace llvm;
46
48
49#define DEBUG_TYPE "si-insert-waitcnts"
50
// Hidden command-line flag "amdgpu-waitcnt-forcezero", off by default. Per its
// description it forces every waitcnt instruction to be emitted as
// s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0).
51static cl::opt<bool>
52 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
53 cl::desc("Force all waitcnt instrs to be emitted as "
54 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
55 cl::init(false), cl::Hidden);
56
// Hidden command-line flag "amdgpu-waitcnt-load-forcezero", off by default.
// Per its description it forces all waitcnt load counters to wait until 0.
// NOTE(review): the line that declares this option's variable (original
// line 57) is not visible here.
58 "amdgpu-waitcnt-load-forcezero",
59 cl::desc("Force all waitcnt load counters to wait until 0"),
60 cl::init(false), cl::Hidden);
61
// Hidden command-line flag "amdgpu-expert-scheduling-mode", off by default.
// Per its description it enables expert scheduling mode 2 for all functions
// (GFX12+ only).
// NOTE(review): the line that declares this option's variable (original
// line 62) is not visible here.
63 "amdgpu-expert-scheduling-mode",
64 cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
65 cl::init(false), cl::Hidden);
66
67namespace {
68
/// Invoke \p EmitWaitcnt for a descending run of counts that ends at
/// \p Target.
///
/// The callback is called once for every value from (Outstanding - 1) down to
/// (Target + 1), in that order, and then exactly once more for \p Target.
/// \p Target is therefore always emitted, including when \p Outstanding is
/// zero or does not exceed \p Target (in which case it is the only value
/// emitted).
///
/// \param Outstanding  Starting point of the run; the first value emitted is
///                     Outstanding - 1 when that is greater than \p Target.
/// \param Target       Final (smallest) value passed to the callback.
/// \param EmitWaitcnt  Callable invoked as EmitWaitcnt(unsigned).
template <typename EmitWaitcntFn>
static void EmitExpandedWaitcnt(unsigned Outstanding, unsigned Target,
                                EmitWaitcntFn &&EmitWaitcnt) {
  // Only walk downwards when there is at least one value above Target. This
  // also covers Outstanding == 0 without depending on unsigned wraparound of
  // (Outstanding - 1).
  if (Outstanding > Target) {
    for (unsigned Count = Outstanding - 1; Count > Target; --Count)
      EmitWaitcnt(Count);
  }
  EmitWaitcnt(Target);
}
77
/// Integer IDs used to track vector memory locations we may have to wait on.
/// Encoded as u16 chunks:
///
///   [0,            REGUNITS_END ): MCRegUnit
///   [LDSDMA_BEGIN, LDSDMA_END   ): LDS DMA IDs
///
/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
/// It gives (1 << 16) entries per category (TRACKINGID_RANGE_LEN), which is
/// more than enough for all register units. MCPhysReg is u16 so we don't even
/// support >u16 physical register numbers at this time, let alone >u16
/// register units. In any case, an assertion in "WaitcntBrackets" ensures
/// REGUNITS_END is enough for all register units.
using VMEMID = uint32_t;

enum : VMEMID {
  // Number of IDs reserved for each category.
  TRACKINGID_RANGE_LEN = (1 << 16),

  // Important: MCRegUnits must always be tracked starting from 0, as we
  // need to be able to convert between a MCRegUnit and a VMEMID freely.
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,

  // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
  // entry, which is updated for all LDS DMA operations encountered.
  // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_BEGIN = REGUNITS_END,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};
107
108/// Convert a MCRegUnit to a VMEMID.
109static constexpr VMEMID toVMEMID(MCRegUnit RU) {
110 return static_cast<unsigned>(RU);
111}
112
113} // namespace
114
115namespace {
116
117// Maps values of InstCounterType to the instruction that waits on that
118// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
119// returns true, and does not cover VA_VDST or VM_VSRC.
120static const unsigned
121 instrsForExtendedCounterTypes[AMDGPU::NUM_EXTENDED_INST_CNTS] = {
122 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT,
123 AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT,
124 AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
125 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT,
126 AMDGPU::S_WAIT_ASYNCCNT, AMDGPU::S_WAIT_TENSORCNT};
127
128// ASYNCMARK and WAIT_ASYNCMARK are meta instructions that emit no hardware
129// code but still need to be processed by this pass for async vmcnt tracking.
130static bool isNonWaitcntMetaInst(const MachineInstr &MI) {
131 switch (MI.getOpcode()) {
132 case AMDGPU::ASYNCMARK:
133 case AMDGPU::WAIT_ASYNCMARK:
134 return false;
135 default:
136 return MI.isMetaInstruction();
137 }
138}
139
// Predicate over \p Inst. The first alternative accepts instructions that are
// VMEM but not FLAT.
// NOTE(review): the right-hand operand of the `||` (original line 142) is
// missing from this listing, so the full condition cannot be read here.
140static bool updateVMCntOnly(const MachineInstr &Inst) {
141 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
143}
144
145#ifndef NDEBUG
146static bool isNormalMode(AMDGPU::InstCounterType MaxCounter) {
147 return MaxCounter == AMDGPU::NUM_NORMAL_INST_CNTS;
148}
149#endif // NDEBUG
150
151class WaitcntBrackets;
152
153// This abstracts the logic for generating and updating S_WAIT* instructions
154// away from the analysis that determines where they are needed. This was
155// done because the set of counters and instructions for waiting on them
156// underwent a major shift with gfx12, sufficiently so that having this
157// abstraction allows the main analysis logic to be simpler than it would
158// otherwise have had to become.
// Abstract interface for creating and editing wait-count instructions.
// Concrete generators implement applyPreexistingWaitcnt(), createNewWaitcnt(),
// getWaitEvents() and getAllZeroWaitcnt().
159class WaitcntGenerator {
160protected:
 // Subtarget and instruction info, both derived from the MachineFunction
 // passed to the constructor.
161 const GCNSubtarget &ST;
162 const SIInstrInfo &TII;
 // ISA version looked up from the subtarget's CPU name.
163 AMDGPU::IsaVersion IV;
164 AMDGPU::InstCounterType MaxCounter;
 // True when the function is optnone or the target's opt level is None.
165 bool OptNone;
 // True when the function has the "amdgpu-expand-waitcnt-profiling"
 // attribute.
166 bool ExpandWaitcntProfiling = false;
 // Stored by reference: the HardwareLimits object given to the constructor
 // must outlive this generator.
167 const AMDGPU::HardwareLimits &Limits;
168
169public:
170 WaitcntGenerator() = delete;
171 WaitcntGenerator(const WaitcntGenerator &) = delete;
 // Captures everything the generator needs from \p MF; \p Limits is bound by
 // reference, not copied.
172 WaitcntGenerator(const MachineFunction &MF,
173 AMDGPU::InstCounterType MaxCounter,
174 const AMDGPU::HardwareLimits &Limits)
175 : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
176 IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter),
177 OptNone(MF.getFunction().hasOptNone() ||
178 MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
179 ExpandWaitcntProfiling(
180 MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
181 Limits(Limits) {}
182
183 // Return true if the current function should be compiled with no
184 // optimization.
185 bool isOptNone() const { return OptNone; }
186
 // Returns the hardware limit recorded for counter \p E.
187 unsigned getLimit(AMDGPU::InstCounterType E) const { return Limits.get(E); }
188
189 // Edits an existing sequence of wait count instructions according
190 // to an incoming Waitcnt value, which is itself updated to reflect
191 // any new wait count instructions which may need to be generated by
192 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
193 // were made.
194 //
195 // This editing will usually be merely updated operands, but it may also
196 // delete instructions if the incoming Wait value indicates they are not
197 // needed. It may also remove existing instructions for which a wait
198 // is needed if it can be determined that it is better to generate new
199 // instructions later, as can happen on gfx12.
 // NOTE(review): the last line of this declaration (original line 203) is
 // missing from this listing.
200 virtual bool
201 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
202 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
204
205 // Transform a soft waitcnt into a normal one.
206 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
207
208 // Generates new wait count instructions according to the value of
209 // Wait, returning true if any new instructions were created.
210 // ScoreBrackets is used for profiling expansion.
 // NOTE(review): one parameter line of this declaration (original line 212)
 // is missing from this listing.
211 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
213 AMDGPU::Waitcnt Wait,
214 const WaitcntBrackets &ScoreBrackets) = 0;
215
216 // Returns the set of HWEvents that corresponds to counter \p T.
217 virtual HWEvents getWaitEvents(AMDGPU::InstCounterType T) const = 0;
218
219 /// \returns the counter that corresponds to event \p E.
 /// \p E must contain exactly one event (asserted). Counters are scanned in
 /// the order produced by AMDGPU::inst_counter_types() and the first one
 /// whose wait-event set intersects \p E is returned; reaching the end of
 /// the scan is treated as unreachable.
220 AMDGPU::InstCounterType getCounterFromEvent(HWEvents E) const {
221 assert(E.size() == 1 && "Cannot handle a mask of events!");
222 for (auto T : AMDGPU::inst_counter_types()) {
223 if (getWaitEvents(T) & E)
224 return T;
225 }
226 llvm_unreachable("event type has no associated counter");
227 }
228
229 // Returns a new waitcnt with all counters except VScnt set to 0. If
230 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
231 // AsyncCnt and TensorCnt always default to ~0u (don't wait for it). They
232 // are only updated when a call to @llvm.amdgcn.wait.asyncmark() is
233 // processed.
234 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
235
236 virtual ~WaitcntGenerator() = default;
237};
238
// WaitcntGenerator implementation whose per-counter event table is
// WaitEventMaskForInstPreGFX12.
239class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
 // Per-counter event sets; getWaitEvents() indexes this table with an
 // InstCounterType value.
 // NOTE(review): the tail of this initializer (original lines 250-257) is
 // missing from this listing.
240 static constexpr const HWEvents
241 WaitEventMaskForInstPreGFX12[AMDGPU::NUM_INST_CNTS] = {
242 HWEvents::VMEM_READ_ACCESS | HWEvents::VMEM_SAMPLER_READ_ACCESS |
243 HWEvents::VMEM_BVH_READ_ACCESS,
244 HWEvents::SMEM_ACCESS | HWEvents::LDS_ACCESS | HWEvents::GDS_ACCESS |
245 HWEvents::SQ_MESSAGE,
246 HWEvents::EXP_GPR_LOCK | HWEvents::GDS_GPR_LOCK |
247 HWEvents::VMW_GPR_LOCK | HWEvents::EXP_PARAM_ACCESS |
248 HWEvents::EXP_POS_ACCESS | HWEvents::EXP_LDS_ACCESS,
249 HWEvents::VMEM_WRITE_ACCESS | HWEvents::SCRATCH_WRITE_ACCESS,
258
259public:
 // Reuse the base class constructors unchanged.
260 using WaitcntGenerator::WaitcntGenerator;
261 bool
262 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
263 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
264 MachineBasicBlock::instr_iterator It) const override;
265
 // NOTE(review): one parameter line of this declaration (original line 267)
 // is missing from this listing.
266 bool createNewWaitcnt(MachineBasicBlock &Block,
268 AMDGPU::Waitcnt Wait,
269 const WaitcntBrackets &ScoreBrackets) override;
270
 // Returns the table entry for \p T. When the subtarget has no vscnt, the
 // STORE_CNT events are additionally folded into the LOAD_CNT result.
271 HWEvents getWaitEvents(AMDGPU::InstCounterType T) const override {
272 HWEvents EVs = WaitEventMaskForInstPreGFX12[T];
273 if (T == AMDGPU::LOAD_CNT && !ST.hasVscnt())
274 EVs |= WaitEventMaskForInstPreGFX12[AMDGPU::STORE_CNT];
275 return EVs;
276 }
277
278 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
279};
280
// WaitcntGenerator implementation whose per-counter event table is
// WaitEventMaskForInstGFX12Plus. It additionally records whether expert mode
// was requested at construction.
281class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
282protected:
 // Copied from the constructor argument of the same name.
283 bool IsExpertMode;
 // Per-counter event sets; getWaitEvents() indexes this table with an
 // InstCounterType value, so entry order must follow the enumerator order.
284 static constexpr const HWEvents
285 WaitEventMaskForInstGFX12Plus[AMDGPU::NUM_INST_CNTS] = {
286 HWEvents::VMEM_READ_ACCESS | HWEvents::GLOBAL_INV_ACCESS,
287 HWEvents::LDS_ACCESS | HWEvents::GDS_ACCESS,
288 HWEvents::EXP_GPR_LOCK | HWEvents::GDS_GPR_LOCK |
289 HWEvents::VMW_GPR_LOCK | HWEvents::EXP_PARAM_ACCESS |
290 HWEvents::EXP_POS_ACCESS | HWEvents::EXP_LDS_ACCESS,
291
292 HWEvents::VMEM_WRITE_ACCESS | HWEvents::SCRATCH_WRITE_ACCESS,
293 HWEvents::VMEM_SAMPLER_READ_ACCESS,
294 HWEvents::VMEM_BVH_READ_ACCESS,
295
296 HWEvents::SMEM_ACCESS | HWEvents::SQ_MESSAGE | HWEvents::SCC_WRITE,
297 HWEvents::VMEM_GROUP | HWEvents::SMEM_GROUP,
298 HWEvents::ASYNC_ACCESS,
299 HWEvents::TENSOR_ACCESS,
300 HWEvents::VGPR_CSMACC_WRITE | HWEvents::VGPR_DPMACC_WRITE |
301 HWEvents::VGPR_TRANS_WRITE | HWEvents::VGPR_XDL_WRITE,
302 HWEvents::VGPR_LDS_READ | HWEvents::VGPR_FLAT_READ |
303 HWEvents::VGPR_VMEM_READ};
304
305public:
306 WaitcntGeneratorGFX12Plus() = delete;
 // Forwards \p MF, \p MaxCounter and \p Limits to the base class and stores
 // \p IsExpertMode.
307 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
308 AMDGPU::InstCounterType MaxCounter,
309 const AMDGPU::HardwareLimits &Limits,
310 bool IsExpertMode)
311 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
312
313 bool
314 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
315 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
316 MachineBasicBlock::instr_iterator It) const override;
317
 // NOTE(review): one parameter line of this declaration (original line 319)
 // is missing from this listing.
318 bool createNewWaitcnt(MachineBasicBlock &Block,
320 AMDGPU::Waitcnt Wait,
321 const WaitcntBrackets &ScoreBrackets) override;
322
 // Returns the table entry for \p T without further adjustment.
323 HWEvents getWaitEvents(AMDGPU::InstCounterType T) const override {
324 return WaitEventMaskForInstGFX12Plus[T];
325 }
326
327 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
328};
329
// Flags indicating which counters should be flushed in a loop preheader.
// Both flags start out false, so a default-constructed value requests no
// flush.
struct PreheaderFlushFlags {
  bool FlushVmCnt{false};
  bool FlushDsCnt{false};
};
335
// Pass state for inserting wait-count instructions into one MachineFunction.
// Holds the analyses and function given to the constructor, per-block
// bookkeeping, and the WaitcntGenerator (WCG) used to answer event/counter
// queries.
336class SIInsertWaitcnts {
337 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
338 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
339 MachineLoopInfo &MLI;
340 MachinePostDominatorTree &PDT;
 // May be null; the constructor accepts a plain pointer.
341 AliasAnalysis *AA = nullptr;
342 MachineFunction &MF;
343
 // Per-basic-block state. Move-only, because it owns its WaitcntBrackets
 // through a unique_ptr. The destructor is only declared here; it is defined
 // out of line (WaitcntBrackets is merely forward-declared at this point).
344 struct BlockInfo {
345 std::unique_ptr<WaitcntBrackets> Incoming;
 // Starts out true. NOTE(review): presumably marks a block that still has to
 // be (re)processed — confirm against run().
346 bool Dirty = true;
347 BlockInfo() = default;
348 BlockInfo(BlockInfo &&) = default;
349 BlockInfo &operator=(BlockInfo &&) = default;
350 ~BlockInfo();
351 };
352
353 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
354
 // One flag per counter, all false initially.
355 bool ForceEmitWaitcnt[AMDGPU::NUM_INST_CNTS] = {};
356
 // Generator that getWaitEvents()/getCounterFromEvent() below forward to.
357 std::unique_ptr<WaitcntGenerator> WCG;
358
359 // Remember call and return instructions in the function.
360 DenseSet<MachineInstr *> CallInsts;
361 DenseSet<MachineInstr *> ReturnInsts;
362
363 // Remember all S_ENDPGM instructions. The boolean flag is true if there might
364 // be outstanding stores but definitely no outstanding scratch stores, to help
365 // with insertion of DEALLOC_VGPRS messages.
366 DenseMap<MachineInstr *, bool> EndPgmInsts;
367
368 AMDGPU::HardwareLimits Limits;
369
370public:
 // Declaration order matters for the constructor below: ST is initialized
 // from MF, TII from ST, and TRI from TII.
371 const GCNSubtarget &ST;
372 const SIInstrInfo &TII;
373 const SIRegisterInfo &TRI;
374 const MachineRegisterInfo &MRI;
375 AMDGPU::InstCounterType SmemAccessCounter;
376 AMDGPU::InstCounterType MaxCounter;
377 bool IsExpertMode = false;
378
 // Binds the analyses and derives the subtarget, instruction info, register
 // info and MachineRegisterInfo from \p MF. SmemAccessCounter and MaxCounter
 // are not initialized here.
379 SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
380 AliasAnalysis *AA, MachineFunction &MF)
381 : MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
382 TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
383 MRI(MF.getRegInfo()) {}
384
385 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
386
387 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
388 const WaitcntBrackets &Brackets);
389 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
390 const WaitcntBrackets &ScoreBrackets);
391 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
392 bool isDSRead(const MachineInstr &MI) const;
393 bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
394 bool run();
395
 // Falls back to the instruction's IsAsync named operand: the result is true
 // when that operand exists and its immediate is non-zero.
 // NOTE(review): the two conditions guarding the early `return false` and
 // `return true` (original lines 397 and 399) are missing from this listing.
396 bool isAsync(const MachineInstr &MI) const {
398 return false;
400 return true;
401 const MachineOperand *Async =
402 TII.getNamedOperand(MI, AMDGPU::OpName::IsAsync);
403 return Async && (Async->getImm());
404 }
405
 // True for instructions that may write LDS through DMA and are not async.
406 bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
407 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
408 }
409
 // True for instructions that may write LDS through DMA and are async.
410 bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
411 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
412 }
413
 // Compares a counter T against TENSOR_CNT, ASYNC_CNT or LOAD_CNT depending
 // on the instruction; instructions that are not async LDS DMA writes yield
 // false once the first check has been passed.
 // NOTE(review): the rest of the signature and the conditions selecting each
 // branch (original lines 415, 416 and 420) are missing from this listing.
414 bool shouldUpdateAsyncMark(const MachineInstr &MI,
417 return T == AMDGPU::TENSOR_CNT;
418 if (!isAsyncLdsDmaWrite(MI))
419 return false;
421 return T == AMDGPU::ASYNC_CNT;
422 return T == AMDGPU::LOAD_CNT;
423 }
424
425 bool isVmemAccess(const MachineInstr &MI) const;
426 bool generateWaitcntInstBefore(MachineInstr &MI,
427 WaitcntBrackets &ScoreBrackets,
428 MachineInstr *OldWaitcntInstr,
429 PreheaderFlushFlags FlushFlags);
 // NOTE(review): one parameter line of this declaration (original line 431)
 // is missing from this listing.
430 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
432 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
433 MachineInstr *OldWaitcntInstr);
434 void updateEventWaitcntAfter(MachineInstr &Inst,
435 WaitcntBrackets *ScoreBrackets);
436 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
437 MachineBasicBlock *Block) const;
438 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
439 WaitcntBrackets &ScoreBrackets);
440 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
441 WaitcntBrackets &ScoreBrackets);
442 /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
443 /// Legalizer. Returns true if block was modified.
444 bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
445 void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
446 bool ExpertMode) const;
 // The two helpers below forward to WCG, which must therefore be set before
 // they are called.
447 HWEvents getWaitEvents(AMDGPU::InstCounterType T) const {
448 return WCG->getWaitEvents(T);
449 }
450 AMDGPU::InstCounterType getCounterFromEvent(HWEvents E) const {
451 return WCG->getCounterFromEvent(E);
452 }
453};
454
455// This object maintains the current score brackets of each wait counter, and
456// a per-register scoreboard for each wait counter.
457//
458// We also maintain the latest score for every event type that can change the
459// waitcnt in order to know if there are multiple types of events within
460// the brackets. When multiple types of event happen in the bracket,
461// wait count may get decreased out of order, therefore we need to put in
462// "s_waitcnt 0" before use.
463class WaitcntBrackets {
464public:
 // Stores the owning pass as context. In asserts builds, checks that the
 // number of register units fits in the register-unit range of the VMEMID
 // encoding.
465 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
466 assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
467 }
468
469#ifndef NDEBUG
 // Debug-only sanity check: counts entries in VMem and SGPRs whose value
 // reports empty(), and aborts with a diagnostic if any are found.
470 ~WaitcntBrackets() {
471 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
472 for (auto &[ID, Val] : VMem) {
473 if (Val.empty())
474 ++NumUnusedVmem;
475 }
476 for (auto &[ID, Val] : SGPRs) {
477 if (Val.empty())
478 ++NumUnusedSGPRs;
479 }
480
481 if (NumUnusedVmem || NumUnusedSGPRs) {
482 errs() << "WaitcntBracket had unused entries at destruction time: "
483 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
484 << " SGPR unused entries\n";
485 std::abort();
486 }
487 }
488#endif
489
 // True when \p T is the context's SMEM access counter or X_CNT.
490 bool isSmemCounter(AMDGPU::InstCounterType T) const {
491 return T == Context->SmemAccessCounter || T == AMDGPU::X_CNT;
492 }
493
 // Width of the score bracket for \p T: upper bound minus lower bound.
494 unsigned getOutstanding(AMDGPU::InstCounterType T) const {
495 return ScoreUBs[T] - ScoreLBs[T];
496 }
497
 // True when the score recorded for \p ID under \p T is above the current
 // lower bound for \p T.
498 bool hasPendingVMEM(VMEMID ID, AMDGPU::InstCounterType T) const {
499 return getVMemScore(ID, T) > getScoreLB(T);
500 }
501
502 /// \returns true if we have no score entries for counter \p T.
503 bool empty(AMDGPU::InstCounterType T) const { return getScoreRange(T) == 0; }
504
505private:
 // NOTE(review): the first body line of this accessor (original line 507) is
 // missing from this listing.
506 unsigned getScoreLB(AMDGPU::InstCounterType T) const {
508 return ScoreLBs[T];
509 }
510
 // NOTE(review): the first body line of this accessor (original line 512) is
 // missing from this listing.
511 unsigned getScoreUB(AMDGPU::InstCounterType T) const {
513 return ScoreUBs[T];
514 }
515
516 unsigned getScoreRange(AMDGPU::InstCounterType T) const {
517 return getScoreUB(T) - getScoreLB(T);
518 }
519
 // Score lookups return 0 when the map has no entry for the key.
520 unsigned getSGPRScore(MCRegUnit RU, AMDGPU::InstCounterType T) const {
521 auto It = SGPRs.find(RU);
522 return It != SGPRs.end() ? It->second.get(T) : 0;
523 }
524
525 unsigned getVMemScore(VMEMID TID, AMDGPU::InstCounterType T) const {
526 auto It = VMem.find(TID);
527 return It != VMem.end() ? It->second.Scores[T] : 0;
528 }
529
530public:
531 bool merge(const WaitcntBrackets &Other);
532
533 bool counterOutOfOrder(AMDGPU::InstCounterType T) const;
 // Convenience overload: checks and updates the same Waitcnt object.
534 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
535 simplifyWaitcnt(Wait, Wait);
536 }
537 void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
538 AMDGPU::Waitcnt &UpdateWait) const;
539 void simplifyWaitcnt(AMDGPU::InstCounterType T, unsigned &Count) const;
540 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T) const;
541 void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
542 AMDGPU::Waitcnt &UpdateWait) const;
543 void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
544 AMDGPU::Waitcnt &UpdateWait) const;
545
546 void determineWaitForPhysReg(AMDGPU::InstCounterType T, MCPhysReg Reg,
547 AMDGPU::Waitcnt &Wait,
548 const MachineInstr &MI) const;
 // NOTE(review): one parameter line of this declaration (original line 550)
 // is missing from this listing.
549 MCPhysReg determineVGPR16Dependency(const MachineInstr &MI,
551 MCPhysReg Reg) const;
552 void determineWaitForLDSDMA(AMDGPU::InstCounterType T, VMEMID TID,
553 AMDGPU::Waitcnt &Wait) const;
554 AMDGPU::Waitcnt determineAsyncWait(unsigned N);
555 void tryClearSCCWriteEvent(MachineInstr *Inst);
556
557 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
558 void applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count);
559 void applyWaitcnt(const AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T);
560 void updateByEvent(HWEvents E, MachineInstr &MI);
561 void recordAsyncMark(MachineInstr &MI);
562
563 HWEvents getPendingEvents() const { return PendingEvents; }
564 bool hasPendingEvent() const { return PendingEvents.any(); }
565 bool hasPendingEvent(HWEvents E) const { return PendingEvents.contains(E); }
 // True when any event belonging to counter \p T is pending. Asserts that
 // this agrees with the score bracket for \p T being non-empty.
566 bool hasPendingEvent(AMDGPU::InstCounterType T) const {
567 bool HasPending = (PendingEvents & Context->getWaitEvents(T)).any();
568 assert(HasPending == !empty(T) &&
569 "Expected pending events iff scoreboard is not empty");
570 return HasPending;
571 }
572
573 bool hasMixedPendingEvents(AMDGPU::InstCounterType T) const {
574 HWEvents Events = PendingEvents & Context->getWaitEvents(T);
575 // Return true if more than one bit is set in Events.
576 return Events.size() > 1;
577 }
578
 // True when the score recorded by setPendingFlat() still lies inside the
 // (lower bound, upper bound] bracket of DS_CNT or of LOAD_CNT.
579 bool hasPendingFlat() const {
580 return ((LastFlatDsCnt > ScoreLBs[AMDGPU::DS_CNT] &&
581 LastFlatDsCnt <= ScoreUBs[AMDGPU::DS_CNT]) ||
582 (LastFlatLoadCnt > ScoreLBs[AMDGPU::LOAD_CNT] &&
583 LastFlatLoadCnt <= ScoreUBs[AMDGPU::LOAD_CNT]));
584 }
585
 // Records the current upper bounds of LOAD_CNT and DS_CNT as the score of
 // the last flat operation.
586 void setPendingFlat() {
587 LastFlatLoadCnt = ScoreUBs[AMDGPU::LOAD_CNT];
588 LastFlatDsCnt = ScoreUBs[AMDGPU::DS_CNT];
589 }
590
 // True when the score recorded by setPendingGDS() still lies inside the
 // (lower bound, upper bound] bracket of DS_CNT.
591 bool hasPendingGDS() const {
592 return LastGDS > ScoreLBs[AMDGPU::DS_CNT] &&
593 LastGDS <= ScoreUBs[AMDGPU::DS_CNT];
594 }
595
 // Distance from the last GDS score to the DS_CNT upper bound, capped at
 // (DS_CNT limit - 1).
596 unsigned getPendingGDSWait() const {
597 return std::min(getScoreUB(AMDGPU::DS_CNT) - LastGDS,
598 getLimit(AMDGPU::DS_CNT) - 1);
599 }
600
601 void setPendingGDS() { LastGDS = ScoreUBs[AMDGPU::DS_CNT]; }
602
603 // Return true if there might be pending writes to the vgpr-interval by VMEM
604 // instructions where the HWEvents in VGPRContext are not contained in E.
605 bool hasDifferentVGPRPendingEvents(MCPhysReg Reg, HWEvents E) const {
606 for (MCRegUnit RU : regunits(Reg)) {
607 auto It = VMem.find(toVMEMID(RU));
608 if (It != VMem.end() && (It->second.VGPRPendingEvents & ~E).any())
609 return true;
610 }
611 return false;
612 }
613
 // Resets the pending VGPR events of every register unit of \p Reg and drops
 // map entries that become completely empty as a result.
614 void clearVGPRPendingEvents(MCPhysReg Reg) {
615 for (MCRegUnit RU : regunits(Reg)) {
616 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
617 It->second.VGPRPendingEvents = HWEvents::NONE;
618 if (It->second.empty())
619 VMem.erase(It);
620 }
621 }
622 }
623
 // Raises the STORE_CNT upper bound by the STORE_CNT limit and marks all
 // STORE_CNT events as pending.
624 void setStateOnFunctionEntryOrReturn() {
625 setScoreUB(AMDGPU::STORE_CNT,
626 getScoreUB(AMDGPU::STORE_CNT) + getLimit(AMDGPU::STORE_CNT));
627 PendingEvents |= Context->getWaitEvents(AMDGPU::STORE_CNT);
628 }
629
630 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
631 return LDSDMAStores;
632 }
633
634 bool hasPointSampleAccel(const MachineInstr &MI) const;
635 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
636 MCPhysReg RU) const;
637
638 void print(raw_ostream &) const;
639 void dump() const { print(dbgs()); }
640
641 // Free up memory by removing empty entries from the DenseMap that track event
642 // scores.
643 void purgeEmptyTrackingData();
644
645private:
 // Hardware limit for \p T, taken from the owning pass.
646 unsigned getLimit(AMDGPU::InstCounterType T) const {
647 return Context->getLimits().get(T);
648 }
649
650 struct MergeInfo {
651 unsigned OldLB;
652 unsigned OtherLB;
653 unsigned MyShift;
654 unsigned OtherShift;
655 };
656
 // One unsigned value per instruction counter.
657 using CounterValueArray = std::array<unsigned, AMDGPU::NUM_INST_CNTS>;
658
659 void determineWaitForScore(AMDGPU::InstCounterType T, unsigned Score,
660 AMDGPU::Waitcnt &Wait) const;
661
662 static bool mergeScore(const MergeInfo &M, unsigned &Score,
663 unsigned OtherScore);
664 bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
665 ArrayRef<CounterValueArray> OtherMarks);
666
 // Helper used as regunits(Reg) above: yields an empty range for registers
 // that are not in an allocatable class, otherwise the register units of
 // Reg. Must not be called with SCC (asserted).
 // NOTE(review): the signature line (original line 667) is missing from this
 // listing.
668 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
669 if (!Context->TRI.isInAllocatableClass(Reg))
670 return {{}, {}};
671 return Context->TRI.regunits(Reg);
672 }
673
 // NOTE(review): the first body line of this setter (original line 675) is
 // missing from this listing.
674 void setScoreLB(AMDGPU::InstCounterType T, unsigned Val) {
676 ScoreLBs[T] = Val;
677 }
678
 // Sets the upper bound for \p T. For EXP_CNT only, the lower bound is then
 // pulled up so that the bracket never spans more than the EXP_CNT limit.
 // NOTE(review): the first body line of this setter (original line 680) is
 // missing from this listing.
679 void setScoreUB(AMDGPU::InstCounterType T, unsigned Val) {
681 ScoreUBs[T] = Val;
682
683 if (T != AMDGPU::EXP_CNT)
684 return;
685
686 if (getScoreRange(AMDGPU::EXP_CNT) > getLimit(AMDGPU::EXP_CNT))
687 ScoreLBs[AMDGPU::EXP_CNT] =
688 ScoreUBs[AMDGPU::EXP_CNT] - getLimit(AMDGPU::EXP_CNT);
689 }
690
 // Records \p Val as the score of \p Reg for counter \p T. SCC goes to
 // SCCScore, vector registers to the VMem map (one entry per register unit),
 // SGPRs to the SGPRs map; any other register is treated as unreachable.
691 void setRegScore(MCPhysReg Reg, AMDGPU::InstCounterType T, unsigned Val) {
692 const SIRegisterInfo &TRI = Context->TRI;
693 if (Reg == AMDGPU::SCC) {
694 SCCScore = Val;
695 } else if (TRI.isVectorRegister(Context->MRI, Reg)) {
696 for (MCRegUnit RU : regunits(Reg))
697 VMem[toVMEMID(RU)].Scores[T] = Val;
698 } else if (TRI.isSGPRReg(Context->MRI, Reg)) {
699 for (MCRegUnit RU : regunits(Reg))
700 SGPRs[RU].get(T) = Val;
701 } else {
702 llvm_unreachable("Register cannot be tracked/unknown register!");
703 }
704 }
705
 // Creates the VMem entry for \p TID if needed and stores \p Val for \p T.
706 void setVMemScore(VMEMID TID, AMDGPU::InstCounterType T, unsigned Val) {
707 VMem[TID].Scores[T] = Val;
708 }
709
710 void setScoreByOperand(const MachineOperand &Op,
711 AMDGPU::InstCounterType CntTy, unsigned Val);
712
 // Owning pass; not owned by this object.
713 const SIInsertWaitcnts *Context;
714
 // Lower and upper score bounds per counter, all starting at zero.
715 unsigned ScoreLBs[AMDGPU::NUM_INST_CNTS] = {0};
716 unsigned ScoreUBs[AMDGPU::NUM_INST_CNTS] = {0};
717 HWEvents PendingEvents;
718 // Remember the last flat memory operation.
719 unsigned LastFlatDsCnt = 0;
720 unsigned LastFlatLoadCnt = 0;
721 // Remember the last GDS operation.
722 unsigned LastGDS = 0;
723
724 // The score tracking logic is fragmented as follows:
725 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
726 // - SGPRs: SGPR RegUnits
727 // - SCC: Non-allocatable and not general purpose: not a SGPR.
728 //
729 // For the VMem case, if the key is within the range of LDS DMA IDs,
730 // then the corresponding index into the `LDSDMAStores` vector below is:
731 // Key - LDSDMA_BEGIN - 1
732 // This is because LDSDMA_BEGIN is a generic entry and does not have an
733 // associated MachineInstr.
734 //
735 // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?
736
737 struct VMEMInfo {
738 // Scores for all instruction counters. Zero-initialized.
739 CounterValueArray Scores{};
740 // For VGPRs, we need to track an additional fine-grained set of pending
741 // events.
742 HWEvents VGPRPendingEvents;
743
 // An entry is empty when every score is zero and no VGPR event is pending.
744 bool empty() const {
745 return all_of(Scores, equal_to(0)) && !VGPRPendingEvents;
746 }
747 };
748
749 /// Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
750 /// pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
751 class SGPRInfo {
752 /// Either DS_CNT or KM_CNT score.
753 unsigned ScoreDsKmCnt = 0;
754 unsigned ScoreXCnt = 0;
755
756 public:
 // Both accessors accept only DS_CNT, KM_CNT or X_CNT (asserted). X_CNT
 // selects ScoreXCnt; the other two share ScoreDsKmCnt.
757 unsigned get(AMDGPU::InstCounterType T) const {
758 assert(
759 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
760 "Invalid counter");
761 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
762 }
763 unsigned &get(AMDGPU::InstCounterType T) {
764 assert(
765 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
766 "Invalid counter");
767 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
768 }
769
770 bool empty() const { return !ScoreDsKmCnt && !ScoreXCnt; }
771 };
772
773 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
774 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
775
776 // Reg score for SCC.
777 unsigned SCCScore = 0;
778 // The unique instruction that has an SCC write pending, if there is one.
779 const MachineInstr *PendingSCCWrite = nullptr;
780
781 // Store representative LDS DMA operations. The only useful info here is
782 // alias info. One store is kept per unique AAInfo.
783 SmallVector<const MachineInstr *> LDSDMAStores;
784
785 // State of all counters at each async mark encountered so far.
 // NOTE(review): the declaration this comment describes (original line 786)
 // is missing from this listing.
787
788 // But in the rare pathological case, a nest of loops that pushes marks
789 // without waiting on any mark can cause AsyncMarks to grow very large. We cap
790 // it to a reasonable limit. We can tune this later or potentially introduce a
791 // user option to control the value.
792 static constexpr unsigned MaxAsyncMarks = 16;
793
794 // Track the upper bound score for async operations that are not part of a
795 // mark yet. Initialized to all zeros.
796 CounterValueArray AsyncScore{};
797};
798
799SIInsertWaitcnts::BlockInfo::~BlockInfo() = default;
800
// MachineFunctionPass wrapper for this pass.
801class SIInsertWaitcntsLegacy : public MachineFunctionPass {
802public:
803 static char ID;
804 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
805
806 bool runOnMachineFunction(MachineFunction &MF) override;
807
808 StringRef getPassName() const override {
809 return "SI insert wait instructions";
810 }
811
 // Declares that the pass preserves the CFG, requires MachineLoopInfo and the
 // machine post-dominator tree, uses alias analysis only if it is already
 // available, and preserves it.
 // NOTE(review): the last statement of this function (original line 818) is
 // missing from this listing.
812 void getAnalysisUsage(AnalysisUsage &AU) const override {
813 AU.setPreservesCFG();
814 AU.addRequired<MachineLoopInfoWrapperPass>();
815 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
816 AU.addUsedIfAvailable<AAResultsWrapperPass>();
817 AU.addPreserved<AAResultsWrapperPass>();
819 }
820};
821
822} // end anonymous namespace
823
// Records \p Score for counter CntTy on the physical register held by
// register operand \p Op, by forwarding to setRegScore().
// NOTE(review): the parameter line declaring CntTy (original line 825) is
// missing from this listing.
824void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
826 unsigned Score) {
827 setRegScore(Op.getReg().asMCReg(), CntTy, Score);
828}
829
830// Return true if the subtarget is one that enables Point Sample Acceleration
831// and the MachineInstr passed in is one to which it might be applied (the
832// hardware makes this decision based on several factors, but we can't determine
833// this at compile time, so we have to assume it might be applied if the
834// instruction supports it).
835bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
 // Only MIMG instructions on subtargets with point sample acceleration can
 // qualify.
836 if (!Context->ST.hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
837 return false;
838
 // The answer comes from the PointSampleAccel flag of the instruction's MIMG
 // base opcode info.
 // NOTE(review): the initializer of BaseInfo (original line 841) is missing
 // from this listing.
839 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
840 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
842 return BaseInfo->PointSampleAccel;
843}
844
845// Return true if the subtarget enables Point Sample Acceleration, the supplied
846// MachineInstr is one to which it might be applied and the supplied interval is
847// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
848// (this is the type that a point sample accelerated instruction effectively
849// becomes)
850bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
851 MCPhysReg Reg) const {
852 if (!hasPointSampleAccel(MI))
853 return false;
854
855 return hasDifferentVGPRPendingEvents(Reg, HWEvents::VMEM_READ_ACCESS);
856}
857
858void WaitcntBrackets::updateByEvent(HWEvents E, MachineInstr &Inst) {
859 assert(E.size() == 1 && "Expected singular event!");
860 AMDGPU::InstCounterType T = Context->getCounterFromEvent(E);
861 assert(T < Context->MaxCounter);
862
863 unsigned UB = getScoreUB(T);
864 unsigned Increment = 1;
866 Context->ST.hasVOP3PX2IncrementsVaVdstTwice()) {
867 // V_WMMA_SCALE instructions use VOP3PX2 encoding. Hardware treats this as
868 // two VOP3P instructions and increments VA_VDST twice.
869 Increment = 2;
870 }
871 unsigned CurrScore = UB + Increment;
872 if (CurrScore == 0)
873 report_fatal_error("InsertWaitcnt score wraparound");
874 // PendingEvents and ScoreUB need to be updated regardless of whether this
875 // event changes the score of a register or not.
876 // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
877 PendingEvents |= E;
878 setScoreUB(T, CurrScore);
879
880 const SIRegisterInfo &TRI = Context->TRI;
881 const MachineRegisterInfo &MRI = Context->MRI;
882 const SIInstrInfo &TII = Context->TII;
883
884 if (T == AMDGPU::EXP_CNT) {
885 // Put score on the source vgprs. If this is a store, just use those
886 // specific register(s).
887 if (TII.isDS(Inst) && Inst.mayLoadOrStore()) {
888 // All GDS operations must protect their address register (same as
889 // export.)
890 if (const auto *AddrOp = TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
891 setScoreByOperand(*AddrOp, AMDGPU::EXP_CNT, CurrScore);
892
893 if (Inst.mayStore()) {
894 if (const auto *Data0 =
895 TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
896 setScoreByOperand(*Data0, AMDGPU::EXP_CNT, CurrScore);
897 if (const auto *Data1 =
898 TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
899 setScoreByOperand(*Data1, AMDGPU::EXP_CNT, CurrScore);
900 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
901 Inst.getOpcode() != AMDGPU::DS_APPEND &&
902 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
903 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
904 for (const MachineOperand &Op : Inst.all_uses()) {
905 if (TRI.isVectorRegister(MRI, Op.getReg()))
906 setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
907 }
908 }
909 } else if (TII.isFLAT(Inst)) {
910 if (Inst.mayStore()) {
911 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
912 AMDGPU::EXP_CNT, CurrScore);
913 } else if (SIInstrInfo::isAtomicRet(Inst)) {
914 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
915 AMDGPU::EXP_CNT, CurrScore);
916 }
917 } else if (TII.isMIMG(Inst)) {
918 if (Inst.mayStore()) {
919 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
920 } else if (SIInstrInfo::isAtomicRet(Inst)) {
921 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
922 AMDGPU::EXP_CNT, CurrScore);
923 }
924 } else if (TII.isMTBUF(Inst)) {
925 if (Inst.mayStore())
926 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
927 } else if (TII.isMUBUF(Inst)) {
928 if (Inst.mayStore()) {
929 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
930 } else if (SIInstrInfo::isAtomicRet(Inst)) {
931 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
932 AMDGPU::EXP_CNT, CurrScore);
933 }
934 } else if (TII.isLDSDIR(Inst)) {
935 // LDSDIR instructions attach the score to the destination.
936 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
937 AMDGPU::EXP_CNT, CurrScore);
938 } else {
939 if (TII.isEXP(Inst)) {
940 // For export the destination registers are really temps that
941 // can be used as the actual source after export patching, so
942 // we need to treat them like sources and set the EXP_CNT
943 // score.
944 for (MachineOperand &DefMO : Inst.all_defs()) {
945 if (TRI.isVGPR(MRI, DefMO.getReg())) {
946 setScoreByOperand(DefMO, AMDGPU::EXP_CNT, CurrScore);
947 }
948 }
949 }
950 for (const MachineOperand &Op : Inst.all_uses()) {
951 if (TRI.isVectorRegister(MRI, Op.getReg()))
952 setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
953 }
954 }
955 } else if (T == AMDGPU::X_CNT) {
956 HWEvents OtherEvent =
957 E == HWEvents::SMEM_GROUP ? HWEvents::VMEM_GROUP : HWEvents::SMEM_GROUP;
958 if (PendingEvents.contains(OtherEvent)) {
959 // Hardware inserts an implicit xcnt between interleaved
960 // SMEM and VMEM operations. So there will never be
961 // outstanding address translations for both SMEM and
962 // VMEM at the same time.
963 setScoreLB(T, getScoreUB(T) - 1);
964 PendingEvents -= OtherEvent;
965 }
966 for (const MachineOperand &Op : Inst.all_uses())
967 setScoreByOperand(Op, T, CurrScore);
968 } else if (T == AMDGPU::VA_VDST || T == AMDGPU::VM_VSRC) {
969 // Match the score to the VGPR destination or source registers as
970 // appropriate
971 for (const MachineOperand &Op : Inst.operands()) {
972 if (!Op.isReg() || (T == AMDGPU::VA_VDST && Op.isUse()) ||
973 (T == AMDGPU::VM_VSRC && Op.isDef()))
974 continue;
975 if (TRI.isVectorRegister(Context->MRI, Op.getReg()))
976 setScoreByOperand(Op, T, CurrScore);
977 }
978 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
979 // Match the score to the destination registers.
980 //
981 // Check only explicit operands. Stores, especially spill stores, include
982 // implicit uses and defs of their super registers which would create an
983 // artificial dependency, while these are there only for register liveness
984 // accounting purposes.
985 //
986 // Special cases where implicit register defs exists, such as M0 or VCC,
987 // but none with memory instructions.
988 for (const MachineOperand &Op : Inst.defs()) {
989 if (T == AMDGPU::LOAD_CNT || T == AMDGPU::SAMPLE_CNT ||
990 T == AMDGPU::BVH_CNT) {
991 if (!TRI.isVectorRegister(MRI, Op.getReg())) // TODO: add wrapper
992 continue;
993 if (updateVMCntOnly(Inst)) {
994 // updateVMCntOnly should only leave us with VGPRs
995 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
996 // defs.
997 assert(TRI.isVectorRegister(MRI, Op.getReg()));
998 HWEvents VGPRContext =
1000 // If instruction can have Point Sample Accel applied, we have to flag
1001 // this with another potential dependency
1002 if (hasPointSampleAccel(Inst))
1003 VGPRContext |= HWEvents::VMEM_READ_ACCESS;
1004 for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
1005 VMem[toVMEMID(RU)].VGPRPendingEvents |= VGPRContext;
1006 }
1007 }
1008 setScoreByOperand(Op, T, CurrScore);
1009 }
1010 if (Inst.mayStore() &&
1011 (TII.isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) {
1012 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1013 // written can be accessed. A load from LDS to VMEM does not need a wait.
1014 //
1015 // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1016 // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1017 // store. The "Slot" is the index into LDSDMAStores + 1.
1018 unsigned Slot = 0;
1019 for (const auto *MemOp : Inst.memoperands()) {
1020 if (!MemOp->isStore() ||
1021 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1022 continue;
1023 // Comparing just AA info does not guarantee memoperands are equal
1024 // in general, but this is so for LDS DMA in practice.
1025 auto AAI = MemOp->getAAInfo();
1026 // Alias scope information gives a way to definitely identify an
1027 // original memory object and practically produced in the module LDS
1028 // lowering pass. If there is no scope available we will not be able
1029 // to disambiguate LDS aliasing as after the module lowering all LDS
1030 // is squashed into a single big object.
1031 if (!AAI || !AAI.Scope)
1032 break;
1033 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1034 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1035 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1036 Slot = I + 1;
1037 break;
1038 }
1039 }
1040 }
1041 if (Slot)
1042 break;
1043 // The slot may not be valid because it can be >= NUM_LDSDMA which
1044 // means the scoreboard cannot track it. We still want to preserve the
1045 // MI in order to check alias information, though.
1046 LDSDMAStores.push_back(&Inst);
1047 Slot = LDSDMAStores.size();
1048 break;
1049 }
1050 setVMemScore(LDSDMA_BEGIN, T, CurrScore);
1051 if (Slot && Slot < NUM_LDSDMA)
1052 setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
1053 }
1054
1055 if (Context->shouldUpdateAsyncMark(Inst, T)) {
1056 AsyncScore[T] = CurrScore;
1057 }
1058
1060 setRegScore(AMDGPU::SCC, T, CurrScore);
1061 PendingSCCWrite = &Inst;
1062 }
1063 }
1064}
1065
1066void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1067 // In the absence of loops, AsyncMarks can grow linearly with the program
1068 // until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above a
1069 // limit every time we push a new mark, but that seems like unnecessary work
1070 // in practical cases. We do separately truncate the array when processing a
1071 // loop, which should be sufficient.
1072 AsyncMarks.push_back(AsyncScore);
1073 AsyncScore = {};
1074 LLVM_DEBUG({
1075 dbgs() << "recordAsyncMark:\n" << Inst;
1076 for (const auto &Mark : AsyncMarks) {
1077 llvm::interleaveComma(Mark, dbgs());
1078 dbgs() << '\n';
1079 }
1080 });
1081}
1082
// Dump the scoreboard state to OS: for every counter up to
// Context->MaxCounter its score range followed by the relative scores of the
// tracked VMEM IDs (plus SGPR units and SCC where applicable), then the
// pending events, the current async score and the recorded async marks.
1083void WaitcntBrackets::print(raw_ostream &OS) const {
1084 const GCNSubtarget &ST = Context->ST;
1085
1086 for (auto T : inst_counter_types(Context->MaxCounter)) {
1087 unsigned SR = getScoreRange(T);
// Counter label; the name depends on whether the subtarget has extended
// wait counts.
1088 switch (T) {
1089 case AMDGPU::LOAD_CNT:
1090 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1091 << SR << "):";
1092 break;
1093 case AMDGPU::DS_CNT:
1094 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1095 << SR << "):";
1096 break;
1097 case AMDGPU::EXP_CNT:
1098 OS << " EXP_CNT(" << SR << "):";
1099 break;
1100 case AMDGPU::STORE_CNT:
1101 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1102 << SR << "):";
1103 break;
1104 case AMDGPU::SAMPLE_CNT:
1105 OS << " SAMPLE_CNT(" << SR << "):";
1106 break;
1107 case AMDGPU::BVH_CNT:
1108 OS << " BVH_CNT(" << SR << "):";
1109 break;
1110 case AMDGPU::KM_CNT:
1111 OS << " KM_CNT(" << SR << "):";
1112 break;
1113 case AMDGPU::X_CNT:
1114 OS << " X_CNT(" << SR << "):";
1115 break;
1116 case AMDGPU::ASYNC_CNT:
1117 OS << " ASYNC_CNT(" << SR << "):";
1118 break;
1119 case AMDGPU::VA_VDST:
1120 OS << " VA_VDST(" << SR << "): ";
1121 break;
1122 case AMDGPU::VM_VSRC:
1123 OS << " VM_VSRC(" << SR << "): ";
1124 break;
1125 default:
1126 OS << " UNKNOWN(" << SR << "):";
1127 break;
1128 }
1129
1130 if (SR != 0) {
1131 // Print vgpr scores.
1132 unsigned LB = getScoreLB(T);
1133
// Sort the keys so the output order does not depend on map iteration order.
1134 SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1135 sort(SortedVMEMIDs);
1136
1137 for (auto ID : SortedVMEMIDs) {
1138 unsigned RegScore = VMem.at(ID).Scores[T];
// Scores at or below the lower bound are no longer outstanding.
1139 if (RegScore <= LB)
1140 continue;
1141 unsigned RelScore = RegScore - LB - 1;
1142 if (ID < REGUNITS_END) {
1143 OS << ' ' << RelScore << ":vRU" << ID;
1144 } else {
1145 assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1146 "Unhandled/unexpected ID value!");
1147 OS << ' ' << RelScore << ":LDSDMA" << ID;
1148 }
1149 }
1150
1151 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1152 if (isSmemCounter(T)) {
1153 SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1154 sort(SortedSMEMIDs);
1155 for (auto ID : SortedSMEMIDs) {
1156 unsigned RegScore = SGPRs.at(ID).get(T);
1157 if (RegScore <= LB)
1158 continue;
1159 unsigned RelScore = RegScore - LB - 1;
1160 OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
1161 }
1162 }
1163
// Unlike the register scores above, SCCScore is printed as stored, not
// relative to LB.
1164 if (T == AMDGPU::KM_CNT && SCCScore > 0)
1165 OS << ' ' << SCCScore << ":scc";
1166 }
1167 OS << '\n';
1168 }
1169
1170 OS << "Pending Events: ";
1171 if (hasPendingEvent()) {
1172 OS << getPendingEvents();
1173 } else {
1174 OS << "none";
1175 }
1176 OS << '\n';
1177
1178 OS << "Async score: ";
1179 if (AsyncScore.empty())
1180 OS << "none";
1181 else
1182 llvm::interleaveComma(AsyncScore, OS);
1183 OS << '\n';
1184
1185 OS << "Async marks: " << AsyncMarks.size() << '\n';
1186
// One output line per recorded mark, listing the marked score of every
// counter. This switch has no VA_VDST / VM_VSRC cases, so those counters
// would be labelled UNKNOWN here.
1187 for (const auto &Mark : AsyncMarks) {
1188 for (auto T : AMDGPU::inst_counter_types()) {
1189 unsigned MarkedScore = Mark[T];
1190 switch (T) {
1191 case AMDGPU::LOAD_CNT:
1192 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM")
1193 << "_CNT: " << MarkedScore;
1194 break;
1195 case AMDGPU::DS_CNT:
1196 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM")
1197 << "_CNT: " << MarkedScore;
1198 break;
1199 case AMDGPU::EXP_CNT:
1200 OS << " EXP_CNT: " << MarkedScore;
1201 break;
1202 case AMDGPU::STORE_CNT:
1203 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS")
1204 << "_CNT: " << MarkedScore;
1205 break;
1206 case AMDGPU::SAMPLE_CNT:
1207 OS << " SAMPLE_CNT: " << MarkedScore;
1208 break;
1209 case AMDGPU::BVH_CNT:
1210 OS << " BVH_CNT: " << MarkedScore;
1211 break;
1212 case AMDGPU::KM_CNT:
1213 OS << " KM_CNT: " << MarkedScore;
1214 break;
1215 case AMDGPU::X_CNT:
1216 OS << " X_CNT: " << MarkedScore;
1217 break;
1218 case AMDGPU::ASYNC_CNT:
1219 OS << " ASYNC_CNT: " << MarkedScore;
1220 break;
1221 default:
1222 OS << " UNKNOWN: " << MarkedScore;
1223 break;
1224 }
1225 }
1226 OS << '\n';
1227 }
1228 OS << '\n';
1229}
1230
1231/// Simplify \p UpdateWait by removing waits that are redundant based on the
1232/// current WaitcntBrackets and any other waits specified in \p CheckWait.
1233void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
1234 AMDGPU::Waitcnt &UpdateWait) const {
1235 simplifyWaitcnt(UpdateWait, AMDGPU::LOAD_CNT);
1236 simplifyWaitcnt(UpdateWait, AMDGPU::EXP_CNT);
1237 simplifyWaitcnt(UpdateWait, AMDGPU::DS_CNT);
1238 simplifyWaitcnt(UpdateWait, AMDGPU::STORE_CNT);
1239 simplifyWaitcnt(UpdateWait, AMDGPU::SAMPLE_CNT);
1240 simplifyWaitcnt(UpdateWait, AMDGPU::BVH_CNT);
1241 simplifyWaitcnt(UpdateWait, AMDGPU::KM_CNT);
1242 simplifyXcnt(CheckWait, UpdateWait);
1243 simplifyWaitcnt(UpdateWait, AMDGPU::VA_VDST);
1244 simplifyVmVsrc(CheckWait, UpdateWait);
1245 simplifyWaitcnt(UpdateWait, AMDGPU::ASYNC_CNT);
1246}
1247
1248void WaitcntBrackets::simplifyWaitcnt(AMDGPU::InstCounterType T,
1249 unsigned &Count) const {
1250 // The number of outstanding events for this type, T, can be calculated
1251 // as (UB - LB). If the current Count is greater than or equal to the number
1252 // of outstanding events, then the wait for this counter is redundant.
1253 if (Count >= getScoreRange(T))
1254 Count = ~0u;
1255}
1256
1257void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait,
1258 AMDGPU::InstCounterType T) const {
1259 unsigned Cnt = Wait.get(T);
1260 simplifyWaitcnt(T, Cnt);
1261 Wait.set(T, Cnt);
1262}
1263
1264void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
1265 AMDGPU::Waitcnt &UpdateWait) const {
1266 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1267 // optimizations. On entry to a block with multiple predescessors, there may
1268 // be pending SMEM and VMEM events active at the same time.
1269 // In such cases, only clear one active event at a time.
1270 // TODO: Revisit xcnt optimizations for gfx1250.
1271 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1272 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1273 // zero.
1274 if (CheckWait.get(AMDGPU::KM_CNT) == 0 &&
1275 hasPendingEvent(HWEvents::SMEM_GROUP))
1276 UpdateWait.set(AMDGPU::X_CNT, ~0u);
1277 // If we have pending store we cannot optimize XCnt because we do not wait for
1278 // stores. VMEM loads retun in order, so if we only have loads XCnt is
1279 // decremented to the same number as LOADCnt.
1280 if (CheckWait.get(AMDGPU::LOAD_CNT) != ~0u &&
1281 hasPendingEvent(HWEvents::VMEM_GROUP) &&
1282 !hasPendingEvent(AMDGPU::STORE_CNT) &&
1283 CheckWait.get(AMDGPU::X_CNT) >= CheckWait.get(AMDGPU::LOAD_CNT))
1284 UpdateWait.set(AMDGPU::X_CNT, ~0u);
1285 simplifyWaitcnt(UpdateWait, AMDGPU::X_CNT);
1286}
1287
1288void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1289 AMDGPU::Waitcnt &UpdateWait) const {
1290 // Waiting for some counters implies waiting for VM_VSRC, since an
1291 // instruction that decrements a counter on completion would have
1292 // decremented VM_VSRC once its VGPR operands had been read.
1293 if (CheckWait.get(AMDGPU::VM_VSRC) >=
1294 std::min({CheckWait.get(AMDGPU::LOAD_CNT),
1295 CheckWait.get(AMDGPU::STORE_CNT),
1296 CheckWait.get(AMDGPU::SAMPLE_CNT),
1297 CheckWait.get(AMDGPU::BVH_CNT), CheckWait.get(AMDGPU::DS_CNT)}))
1298 UpdateWait.set(AMDGPU::VM_VSRC, ~0u);
1299 simplifyWaitcnt(UpdateWait, AMDGPU::VM_VSRC);
1300}
1301
1302void WaitcntBrackets::purgeEmptyTrackingData() {
1303 VMem.remove_if([](const auto &P) { return P.second.empty(); });
1304 SGPRs.remove_if([](const auto &P) { return P.second.empty(); });
1305}
1306
1307void WaitcntBrackets::determineWaitForScore(AMDGPU::InstCounterType T,
1308 unsigned ScoreToWait,
1309 AMDGPU::Waitcnt &Wait) const {
1310 const unsigned LB = getScoreLB(T);
1311 const unsigned UB = getScoreUB(T);
1312
1313 // If the score falls within the bracket, we need a waitcnt.
1314 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1315 if ((T == AMDGPU::LOAD_CNT || T == AMDGPU::DS_CNT) && hasPendingFlat() &&
1316 !Context->ST.hasFlatLgkmVMemCountInOrder()) {
1317 // If there is a pending FLAT operation, and this is a VMem or LGKM
1318 // waitcnt and the target can report early completion, then we need
1319 // to force a waitcnt 0.
1320 Wait.add(T, 0);
1321 } else if (counterOutOfOrder(T)) {
1322 // Counter can get decremented out-of-order when there
1323 // are multiple types event in the bracket. Also emit an s_wait counter
1324 // with a conservative value of 0 for the counter.
1325 Wait.add(T, 0);
1326 } else {
1327 // If a counter has been maxed out avoid overflow by waiting for
1328 // MAX(CounterType) - 1 instead.
1329 unsigned NeededWait = std::min(UB - ScoreToWait, getLimit(T) - 1);
1330 Wait.add(T, NeededWait);
1331 }
1332 }
1333}
1334
// Compute the wait needed so that at most N of the most recently recorded
// async marks remain outstanding, and erase the mark that is waited on
// together with all older ones. Returns a default-constructed Waitcnt when
// no more than N marks are recorded.
//
// NOTE(review): listing line 1362 is absent from this extract; it presumably
// holds the loop header that introduces the counter T used on the line that
// follows -- confirm against the full source.
1335AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
1336 LLVM_DEBUG({
1337 dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
1338 << ":\n";
1339 for (const auto &Mark : AsyncMarks) {
1340 llvm::interleaveComma(Mark, dbgs());
1341 dbgs() << '\n';
1342 }
1343 });
1344
1345 if (AsyncMarks.size() == MaxAsyncMarks) {
1346 // Enforcing MaxAsyncMarks here is unnecessary work because the size of
1347 // AsyncMarks is linear when traversing straightline code. But we do
1348 // need to check if truncation may have occurred at a merge, and adjust N
1349 // to ensure that a wait is generated.
1350 LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
1351 N = std::min(N, (unsigned)MaxAsyncMarks - 1);
1352 }
1353
1354 AMDGPU::Waitcnt Wait;
1355 if (AsyncMarks.size() <= N) {
1356 LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
1357 return Wait;
1358 }
1359
// Index of the newest mark that must have completed: N marks newer than it
// are allowed to stay in flight.
1360 size_t MarkIndex = AsyncMarks.size() - N - 1;
1361 const auto &RequiredMark = AsyncMarks[MarkIndex];
1363 determineWaitForScore(T, RequiredMark[T], Wait);
1364
1365 // Immediately remove the waited mark and all older ones
1366 // This happens BEFORE the wait is actually inserted, which is fine
1367 // because we've already extracted the wait requirements
1368 LLVM_DEBUG({
1369 dbgs() << "Removing " << (MarkIndex + 1)
1370 << " async marks after determining wait\n";
1371 });
// RequiredMark refers into AsyncMarks and must not be used after this erase.
1372 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1373
1374 LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
1375 return Wait;
1376}
1377
1378// With D16Writes32BitVgpr, a D16 instruction might be clobbered by events
1379// running on the other 16-bit half of the same 32-bit VGPR.
1380//
1381// Replace VGPR16 with VGPR32 for the wait check if:
1382// 1. MI is a VALU, and there is a wait event on the other half
1383// 2. MI is a LdSt, and there is a wait event on the other half from different
1384// order group
//
// Returns Reg unchanged when it is not a 16-bit register, when the subtarget
// lacks hasD16Writes32BitVgpr(), or when no wait is required on the other
// half; otherwise returns either Reg or its containing 32-bit register.
//
// NOTE(review): listing line 1386 is absent from this extract. The body uses
// a counter T and the caller passes (MI, T, Reg), so the missing line is
// presumably the declaration of that parameter -- confirm against the full
// source.
1385MCPhysReg WaitcntBrackets::determineVGPR16Dependency(const MachineInstr &MI,
1387 MCPhysReg Reg) const {
1388 const TargetRegisterClass *RC = Context->TRI.getPhysRegBaseClass(Reg);
1389 unsigned Size = Context->TRI.getRegSizeInBits(*RC);
1390
1391 if (Size != 16 || !Context->ST.hasD16Writes32BitVgpr())
1392 return Reg;
1393
1394 // With D16Writes32BitVgpr, D16 Inst might clobber the whole vgpr32
1395 // check dependency on the other half
1396 Register Reg32 = Context->TRI.get32BitRegister(Reg);
1397 Register OtherHalf = Context->TRI.getSubReg(
1398 Reg32,
1399 AMDGPU::isHi16Reg(Reg, Context->TRI) ? AMDGPU::lo16 : AMDGPU::hi16);
1400
// Wait is a scratch value: it is only used to find out whether any score on
// the other half would require a wait.
1401 AMDGPU::Waitcnt Wait;
1402 for (MCRegUnit RU : regunits(OtherHalf))
1403 determineWaitForScore(T, getVMemScore(toVMEMID(RU), T), Wait);
1404
1405 // No wait on otherhalf
1406 if (!Wait.hasWait())
1407 return Reg;
1408
1409 if (Context->TII.isVALU(MI, /*AllowLDSDMA=*/true))
1410 return Reg32;
1411
1412 // If hi/lo16 mixed events
1413 HWEvents MIEvents =
1414 AMDGPU::getEventsFor(MI, Context->ST, Context->IsExpertMode);
1415 HWEvents OtherHalfEvents = Context->getWaitEvents(T);
1416 HWEvents Events = MIEvents & OtherHalfEvents;
1417 if (Events.size() > 1)
1418 return Reg32;
1419 return Reg;
1420}
1421
1422void WaitcntBrackets::determineWaitForPhysReg(AMDGPU::InstCounterType T,
1423 MCPhysReg Reg,
1424 AMDGPU::Waitcnt &Wait,
1425 const MachineInstr &MI) const {
1426 if (Reg == AMDGPU::SCC) {
1427 determineWaitForScore(T, SCCScore, Wait);
1428 } else {
1429 bool IsVGPR = Context->TRI.isVectorRegister(Context->MRI, Reg);
1430 if (IsVGPR)
1431 Reg = determineVGPR16Dependency(MI, T, Reg);
1432 for (MCRegUnit RU : regunits(Reg))
1433 determineWaitForScore(
1434 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1435 Wait);
1436 }
1437}
1438
1439void WaitcntBrackets::determineWaitForLDSDMA(AMDGPU::InstCounterType T,
1440 VMEMID TID,
1441 AMDGPU::Waitcnt &Wait) const {
1442 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1443 determineWaitForScore(T, getVMemScore(TID, T), Wait);
1444}
1445
1446void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1447 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1448 // SCC has landed
1449 if (PendingSCCWrite &&
1450 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1451 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1452 HWEvents SCC_WRITE_PendingEvent = HWEvents::SCC_WRITE;
1453 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1454 if ((PendingEvents & Context->getWaitEvents(AMDGPU::KM_CNT)) ==
1455 SCC_WRITE_PendingEvent) {
1456 setScoreLB(AMDGPU::KM_CNT, getScoreUB(AMDGPU::KM_CNT));
1457 }
1458
1459 PendingEvents -= SCC_WRITE_PendingEvent;
1460 PendingSCCWrite = nullptr;
1461 }
1462}
1463
// Apply the counts held in Wait to the scoreboard via the per-counter
// overload applyWaitcnt(Wait, T).
//
// NOTE(review): listing line 1465 is absent from this extract; it presumably
// holds the loop header that introduces the counter T -- confirm against the
// full source.
1464void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1466 applyWaitcnt(Wait, T);
1467}
1468
1469void WaitcntBrackets::applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count) {
1470 const unsigned UB = getScoreUB(T);
1471 if (Count >= UB)
1472 return;
1473 if (Count != 0) {
1474 if (counterOutOfOrder(T))
1475 return;
1476 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1477 } else {
1478 setScoreLB(T, UB);
1479 PendingEvents -= Context->getWaitEvents(T);
1480 }
1481
1482 if (T == AMDGPU::KM_CNT && Count == 0 &&
1483 hasPendingEvent(HWEvents::SMEM_GROUP)) {
1484 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1485 applyWaitcnt(AMDGPU::X_CNT, 0);
1486 else
1487 PendingEvents -= HWEvents::SMEM_GROUP;
1488 }
1489 if (T == AMDGPU::LOAD_CNT && hasPendingEvent(HWEvents::VMEM_GROUP) &&
1490 !hasPendingEvent(AMDGPU::STORE_CNT)) {
1491 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1492 applyWaitcnt(AMDGPU::X_CNT, Count);
1493 else if (Count == 0)
1494 PendingEvents -= HWEvents::VMEM_GROUP;
1495 }
1496}
1497
// Apply to the scoreboard the count that Wait holds for counter T.
//
// NOTE(review): listing line 1499 is absent from this extract; it presumably
// declares the counter parameter T and closes the parameter list -- confirm
// against the full source.
1498void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait,
1500 unsigned Cnt = Wait.get(T);
1501 applyWaitcnt(T, Cnt);
1502}
1503
1504// Where there are multiple types of event in the bracket of a counter,
1505// the decrement may go out of order.
1506bool WaitcntBrackets::counterOutOfOrder(AMDGPU::InstCounterType T) const {
1507 // Scalar memory read always can go out of order.
1508 if ((T == Context->SmemAccessCounter &&
1509 hasPendingEvent(HWEvents::SMEM_ACCESS)) ||
1510 (T == AMDGPU::X_CNT && hasPendingEvent(HWEvents::SMEM_GROUP)))
1511 return true;
1512
1513 if (T == AMDGPU::LOAD_CNT) {
1514
1515 // On targets without VScnt, LOAD_CNT includes all of STORE_CNT as well.
1516 // All these events use one counter and do not go out of order with respect
1517 // to each other.
1518 if (!Context->ST.hasVscnt())
1519 return false;
1520
1521 HWEvents Events = PendingEvents & Context->getWaitEvents(T);
1522
1523 // If the target does not have extended counters, VMEM_BVH/SAMPLE_READ
1524 // events are equivalent to VMEM_READ_ACCESS. We do not go out of order in
1525 // such cases.
1526 static constexpr HWEvents ExtendedImageEvents =
1527 HWEvents::VMEM_SAMPLER_READ_ACCESS | HWEvents::VMEM_BVH_READ_ACCESS;
1528 if (!Context->ST.hasExtendedWaitCounts() &&
1529 (Events & ExtendedImageEvents).any()) {
1530 Events -= ExtendedImageEvents;
1531 Events |= HWEvents::VMEM_READ_ACCESS;
1532 }
1533
1534 // GLOBAL_INV completes in-order with other LOAD_CNT events,
1535 // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT
1536 // events doesn't cause out-of-order completion.
1537 Events -= HWEvents::GLOBAL_INV_ACCESS;
1538
1539 // Return true only if there are still multiple event types after removing
1540 // GLOBAL_INV
1541 return Events.size() > 1;
1542 }
1543
1544 return hasMixedPendingEvents(T);
1545}
1546
// Legacy pass manager registration for SIInsertWaitcntsLegacy, followed by
// the pass ID definitions and the factory that creates the pass.
//
// NOTE(review): listing lines 1549, 1550, 1552 and 1558 are absent from this
// extract, so the entries between INITIALIZE_PASS_BEGIN and
// INITIALIZE_PASS_END, the trailing arguments of INITIALIZE_PASS_END and the
// signature of the factory function below are not visible here.
1547INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1548 false, false)
1551INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1553
1554char SIInsertWaitcntsLegacy::ID = 0;
1555
// Exported reference to the pass ID defined above.
1556char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1557
1559 return new SIInsertWaitcntsLegacy();
1560}
1561
1562static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1563 unsigned NewEnc) {
1564 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1565 assert(OpIdx >= 0);
1566
1567 MachineOperand &MO = MI.getOperand(OpIdx);
1568
1569 if (NewEnc == MO.getImm())
1570 return false;
1571
1572 MO.setImm(NewEnc);
1573 return true;
1574}
1575
1576bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1577 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1578 if (Opcode == Waitcnt->getOpcode())
1579 return false;
1580
1581 Waitcnt->setDesc(TII.get(Opcode));
1582 return true;
1583}
1584
1585/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1586/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1587/// from \p Wait that were added by previous passes. Currently this pass
1588/// conservatively assumes that these preexisting waits are required for
1589/// correctness.
///
/// At most one S_WAITCNT and one S_WAITCNT_VSCNT are kept; the counters they
/// cover are applied to \p ScoreBrackets and reset to ~0u in \p Wait, so only
/// waits that found no instruction to carry them remain for the caller.
/// Returns true if any instruction was erased or modified.
///
/// NOTE(review): listing lines 1661 and 1674 are absent from this extract, so
/// the start of the STORE_CNT update and the new encoding passed to
/// updateOperandIfDifferent for the kept S_WAITCNT are not visible here.
1590bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1591 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1592 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1593 assert(isNormalMode(MaxCounter));
1594
1595 bool Modified = false;
1596 MachineInstr *WaitcntInstr = nullptr;
1597 MachineInstr *WaitcntVsCntInstr = nullptr;
1598
1599 LLVM_DEBUG({
1600 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1601 if (It.isEnd())
1602 dbgs() << "end of block\n";
1603 else
1604 dbgs() << *It;
1605 });
1606
// make_early_inc_range allows erasing II while iterating.
1607 for (auto &II :
1608 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1609 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1610 if (isNonWaitcntMetaInst(II)) {
1611 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1612 continue;
1613 }
1614
// A differing non-soft opcode means II is a soft waitcnt, which may be
// simplified unless OptNone is set.
1615 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1616 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1617
1618 // Update required wait count. If this is a soft waitcnt (= it was added
1619 // by an earlier pass), it may be entirely removed.
1620 if (Opcode == AMDGPU::S_WAITCNT) {
1621 unsigned IEnc = II.getOperand(0).getImm();
1622 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1623 if (TrySimplify)
1624 ScoreBrackets.simplifyWaitcnt(OldWait);
1625 Wait = Wait.combined(OldWait);
1626
1627 // Merge consecutive waitcnt of the same type by erasing multiples.
1628 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1629 II.eraseFromParent();
1630 Modified = true;
1631 } else
1632 WaitcntInstr = &II;
1633 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1634 assert(ST.hasVMemToLDSLoad());
1635 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1636 << "Before: " << Wait << '\n';);
1637 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, LDSDMA_BEGIN,
1638 Wait);
1639 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1640
1641 // It is possible (but unlikely) that this is the only wait instruction,
1642 // in which case, we exit this loop without a WaitcntInstr to consume
1643 // `Wait`. But that works because `Wait` was passed in by reference, and
1644 // the callee eventually calls createNewWaitcnt on it. We test this
1645 // possibility in an artificial MIR test since such a situation cannot be
1646 // recreated by running the memory legalizer.
// NOTE(review): this erase does not set Modified -- confirm that this is
// intended.
1647 II.eraseFromParent();
1648 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1649 unsigned N = II.getOperand(0).getImm();
1650 LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';);
1651 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
1652 Wait = Wait.combined(OldWait);
1653 } else {
1654 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1655 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1656
1657 unsigned OldVSCnt =
1658 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1659 if (TrySimplify)
1660 ScoreBrackets.simplifyWaitcnt(AMDGPU::STORE_CNT, OldVSCnt);
1662 std::min(Wait.get(AMDGPU::STORE_CNT), OldVSCnt));
1663
1664 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1665 II.eraseFromParent();
1666 Modified = true;
1667 } else
1668 WaitcntVsCntInstr = &II;
1669 }
1670 }
1671
// Rewrite the surviving S_WAITCNT, then mark the counters it covers as
// handled so the caller does not emit them again.
1672 if (WaitcntInstr) {
1673 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1675 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1676
1677 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::LOAD_CNT);
1678 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
1679 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
1680 Wait.set(AMDGPU::LOAD_CNT, ~0u);
1681 Wait.set(AMDGPU::EXP_CNT, ~0u);
1682 Wait.set(AMDGPU::DS_CNT, ~0u);
1683
1684 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1685 << "New Instr at block end: "
1686 << *WaitcntInstr << '\n'
1687 : dbgs() << "applied pre-existing waitcnt\n"
1688 << "Old Instr: " << *It
1689 << "New Instr: " << *WaitcntInstr << '\n');
1690 }
1691
// Same treatment for the surviving S_WAITCNT_VSCNT and STORE_CNT.
1692 if (WaitcntVsCntInstr) {
1693 Modified |=
1694 updateOperandIfDifferent(*WaitcntVsCntInstr, AMDGPU::OpName::simm16,
1695 Wait.get(AMDGPU::STORE_CNT));
1696 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1697
1698 ScoreBrackets.applyWaitcnt(AMDGPU::STORE_CNT, Wait.get(AMDGPU::STORE_CNT));
1699 Wait.set(AMDGPU::STORE_CNT, ~0u);
1700
1701 LLVM_DEBUG(It.isEnd()
1702 ? dbgs() << "applied pre-existing waitcnt\n"
1703 << "New Instr at block end: " << *WaitcntVsCntInstr
1704 << '\n'
1705 : dbgs() << "applied pre-existing waitcnt\n"
1706 << "Old Instr: " << *It
1707 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1708 }
1709
1710 return Modified;
1711}
1712
1713/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1714/// required counters in \p Wait
///
/// The instructions are inserted into \p Block before \p It. When
/// ExpandWaitcntProfiling is set and none of the waited counters is out of
/// order according to \p ScoreBrackets, each wait is emitted through
/// EmitExpandedWaitcnt instead of as a single instruction. Returns true if
/// any instruction was emitted.
///
/// NOTE(review): listing lines 1758 and 1796 are absent from this extract, so
/// the operand added to the expanded S_WAITCNT and the immediate added to the
/// non-expanded S_WAITCNT_VSCNT are not visible here.
1715bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1716 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1717 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
1718 assert(isNormalMode(MaxCounter));
1719
1720 bool Modified = false;
1721 const DebugLoc &DL = Block.findDebugLoc(It);
1722
1723 // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
1724 // single instruction while VScnt has its own instruction.
1725 if (Wait.hasWaitExceptStoreCnt()) {
1726 // If profiling expansion is enabled, emit an expanded sequence
1727 if (ExpandWaitcntProfiling) {
1728 // Check if any of the counters to be waited on are out-of-order.
1729 // If so, fall back to normal (non-expanded) behavior since expansion
1730 // would provide misleading profiling information.
1731 bool AnyOutOfOrder = false;
1732 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
1733 unsigned WaitCnt = Wait.get(CT);
1734 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
1735 AnyOutOfOrder = true;
1736 break;
1737 }
1738 }
1739
1740 if (AnyOutOfOrder) {
1741 // Fall back to non-expanded wait
1742 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1743 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
1744 Modified = true;
1745 } else {
1746 // All counters are in-order, safe to expand
1747 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
1748 unsigned WaitCnt = Wait.get(CT);
1749 if (WaitCnt == ~0u)
1750 continue;
1751
// The starting point of the expansion is capped at getLimit(CT) - 1.
1752 unsigned Outstanding =
1753 std::min(ScoreBrackets.getOutstanding(CT), getLimit(CT) - 1);
1754 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
1755 AMDGPU::Waitcnt W;
1756 W.set(CT, Count);
1757 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT))
1759 });
1760 Modified = true;
1761 }
1762 }
1763 } else {
1764 // Normal behavior: emit single combined waitcnt
1765 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1766 [[maybe_unused]] auto SWaitInst =
1767 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
1768 Modified = true;
1769
1770 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1771 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1772 dbgs() << "New Instr: " << *SWaitInst << '\n');
1773 }
1774 }
1775
1776 if (Wait.hasWaitStoreCnt()) {
1777 assert(ST.hasVscnt());
1778
1779 if (ExpandWaitcntProfiling && Wait.get(AMDGPU::STORE_CNT) != ~0u &&
1780 !ScoreBrackets.counterOutOfOrder(AMDGPU::STORE_CNT)) {
1781 // Only expand if counter is not out-of-order
1782 unsigned Outstanding =
1783 std::min(ScoreBrackets.getOutstanding(AMDGPU::STORE_CNT),
1784 getLimit(AMDGPU::STORE_CNT) - 1);
1785 EmitExpandedWaitcnt(
1786 Outstanding, Wait.get(AMDGPU::STORE_CNT), [&](unsigned Count) {
1787 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1788 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1789 .addImm(Count);
1790 });
1791 Modified = true;
1792 } else {
1793 [[maybe_unused]] auto SWaitInst =
1794 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1795 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1797 Modified = true;
1798
1799 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1800 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1801 dbgs() << "New Instr: " << *SWaitInst << '\n');
1802 }
1803 }
1804
1805 return Modified;
1806}
1807
1808AMDGPU::Waitcnt
1809WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1810 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST.hasVscnt() ? 0 : ~0u);
1811}
1812
1813AMDGPU::Waitcnt
1814WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1815 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
1816 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1817 ~0u /* XCNT */, ~0u /* ASYNC_CNT */,
1818 ~0u /* TENSOR_CNT */, ExpertVal, ExpertVal);
1819}
1820
1821/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1822/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1823/// were added by previous passes. Currently this pass conservatively
1824/// assumes that these preexisting waits are required for correctness.
///
/// The body works in two phases:
///  1. Walk the instructions in [OldWaitcntInstr, It). Per kind of wait
///     instruction the first occurrence is remembered and later duplicates
///     are erased, while the counts they carry are folded into \p Wait (soft
///     waits) or into a local RequiredWait (non-soft waits).
///  2. Rewrite the operand of each remembered instruction from the merged
///     wait, or erase the instruction if nothing is left for it to wait on.
///     Counters handled by an existing instruction are reset to ~0u in
///     \p Wait; whatever remains is left for the caller to emit.
///
/// \param ScoreBrackets    Updated via applyWaitcnt() for every counter that
///                         an existing instruction ends up waiting on.
/// \param OldWaitcntInstr  First instruction of the range to scan.
/// \param Wait             In/out: required waits on entry, remaining waits
///                         on return.
/// \param It               End of the scanned range (may be the block end).
/// \returns true if any instruction was erased or changed.
///
/// NOTE(review): this text comes from a rendered source listing. The embedded
/// listing numbers skip 1900-1901, 1940, 2029, 2053 and 2085-2086, so the
/// statements on those lines are absent below. Each gap is marked; restore
/// them from the upstream file before compiling.
1825bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1826 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1827 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1828 assert(!isNormalMode(MaxCounter));
1829
1830 bool Modified = false;
 // First instruction seen of each kind; null until one is found.
1831 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1832 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1833 MachineInstr *WaitcntDepctrInstr = nullptr;
1834 MachineInstr *WaitInstrs[AMDGPU::NUM_EXTENDED_INST_CNTS] = {};
1835
1836 LLVM_DEBUG({
1837 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1838 if (It.isEnd())
1839 dbgs() << "end of block\n";
1840 else
1841 dbgs() << *It;
1842 });
1843
1844 // Accumulate waits that should not be simplified.
1845 AMDGPU::Waitcnt RequiredWait;
1846
 // make_early_inc_range advances the iterator before the body runs, so the
 // current instruction can be erased inside the loop.
1847 for (auto &II :
1848 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1849 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1850 if (isNonWaitcntMetaInst(II)) {
1851 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1852 continue;
1853 }
1854
1855 // Update required wait count. If this is a soft waitcnt (= it was added
1856 // by an earlier pass), it may be entirely removed.
1857
 // Opcode differs from II.getOpcode() exactly when II is a soft wait;
 // simplification is additionally disabled when OptNone is set.
1858 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1859 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1860
1861 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1862 // attempt to do more than that either.
1863 if (Opcode == AMDGPU::S_WAITCNT)
1864 continue;
1865
1866 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1867 unsigned OldEnc =
1868 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1869 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
 // Soft waits go into Wait (which is simplified after the loop);
 // non-soft waits go into RequiredWait.
1870 if (TrySimplify)
1871 Wait = Wait.combined(OldWait);
1872 else
1873 RequiredWait = RequiredWait.combined(OldWait);
1874 // Keep the first wait_loadcnt, erase the rest.
1875 if (CombinedLoadDsCntInstr == nullptr) {
1876 CombinedLoadDsCntInstr = &II;
1877 } else {
1878 II.eraseFromParent();
1879 Modified = true;
1880 }
1881 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1882 unsigned OldEnc =
1883 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1884 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1885 if (TrySimplify)
1886 Wait = Wait.combined(OldWait);
1887 else
1888 RequiredWait = RequiredWait.combined(OldWait);
1889 // Keep the first wait_storecnt, erase the rest.
1890 if (CombinedStoreDsCntInstr == nullptr) {
1891 CombinedStoreDsCntInstr = &II;
1892 } else {
1893 II.eraseFromParent();
1894 Modified = true;
1895 }
1896 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
1897 unsigned OldEnc =
1898 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1899 AMDGPU::Waitcnt OldWait;
 // NOTE(review): listing lines 1900-1901 are missing here. As shown,
 // OldWait is never populated from OldEnc; presumably the lost lines
 // decode the VA_VDST / VM_VSRC fields into it - confirm upstream.
 // Unlike the branches above, the result is always merged into Wait and
 // never into RequiredWait.
1902 if (TrySimplify)
1903 ScoreBrackets.simplifyWaitcnt(OldWait);
1904 Wait = Wait.combined(OldWait);
1905 if (WaitcntDepctrInstr == nullptr) {
1906 WaitcntDepctrInstr = &II;
1907 } else {
1908 // S_WAITCNT_DEPCTR requires special care. Don't remove a
1909 // duplicate if it is waiting on things other than VA_VDST or
1910 // VM_VSRC. If that is the case, just make sure the VA_VDST and
1911 // VM_VSRC subfields of the operand are set to the "no wait"
1912 // values.
1913
1914 unsigned Enc =
1915 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1916 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
1917 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
1918
 // An encoding equal to the default one is treated as waiting on
 // nothing, so the duplicate is erased in that case.
1919 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
1920 Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
1921 Modified |= promoteSoftWaitCnt(&II);
1922 } else {
1923 II.eraseFromParent();
1924 Modified = true;
1925 }
1926 }
1927 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1928 // Architectures higher than GFX10 do not have direct loads to
1929 // LDS, so no work required here yet.
1930 II.eraseFromParent();
1931 Modified = true;
1932 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1933 // Update the Waitcnt, but don't erase the wait.asyncmark() itself. It
1934 // shows up in the assembly as a comment with the original parameter N.
1935 unsigned N = II.getOperand(0).getImm();
1936 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
1937 Wait = Wait.combined(OldWait);
1938 } else {
 // Remaining case: a wait instruction for a single counter.
1939 std::optional<AMDGPU::InstCounterType> CT =
 // NOTE(review): listing line 1940 (the initializer of CT) is missing;
 // presumably it maps Opcode to its counter type - confirm upstream.
1941 assert(CT.has_value());
1942 unsigned OldCnt =
1943 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1944 if (TrySimplify)
1945 Wait.add(CT.value(), OldCnt);
1946 else
1947 RequiredWait.add(CT.value(), OldCnt);
1948 // Keep the first wait of its kind, erase the rest.
1949 if (WaitInstrs[CT.value()] == nullptr) {
1950 WaitInstrs[CT.value()] = &II;
1951 } else {
1952 II.eraseFromParent();
1953 Modified = true;
1954 }
1955 }
1956 }
1957
 // Simplify Wait, then fold the non-simplifiable RequiredWait back in.
 // NOTE(review): the exact contract of the two-argument simplifyWaitcnt()
 // overload is not visible in this chunk - confirm at its definition.
1958 ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
1959 Wait = Wait.combined(RequiredWait);
1960
1961 if (CombinedLoadDsCntInstr) {
1962 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1963 // to be waited for. Otherwise, let the instruction be deleted so
1964 // the appropriate single counter wait instruction can be inserted
1965 // instead, when new S_WAIT_*CNT instructions are inserted by
1966 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1967 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1968 // the loop below that deals with single counter instructions.
1969 //
1970 // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
1971 // instructions that have decremented LOAD_CNT or DS_CNT on completion
1972 // will have needed to wait for their register sources to be available
1973 // first.
1974 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
1975 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1976 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1977 AMDGPU::OpName::simm16, NewEnc);
1978 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1979 ScoreBrackets.applyWaitcnt(AMDGPU::LOAD_CNT, Wait.get(AMDGPU::LOAD_CNT));
1980 ScoreBrackets.applyWaitcnt(AMDGPU::DS_CNT, Wait.get(AMDGPU::DS_CNT));
 // Both counters are now covered by the existing instruction.
1981 Wait.set(AMDGPU::LOAD_CNT, ~0u);
1982 Wait.set(AMDGPU::DS_CNT, ~0u);
1983
1984 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1985 << "New Instr at block end: "
1986 << *CombinedLoadDsCntInstr << '\n'
1987 : dbgs() << "applied pre-existing waitcnt\n"
1988 << "Old Instr: " << *It << "New Instr: "
1989 << *CombinedLoadDsCntInstr << '\n');
1990 } else {
1991 CombinedLoadDsCntInstr->eraseFromParent();
1992 Modified = true;
1993 }
1994 }
1995
1996 if (CombinedStoreDsCntInstr) {
1997 // Similarly for S_WAIT_STORECNT_DSCNT.
 // DS_CNT has already been reset to ~0u if the S_WAIT_LOADCNT_DSCNT above
 // was kept, in which case this instruction is erased.
1998 if (Wait.get(AMDGPU::STORE_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
1999 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2000 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
2001 AMDGPU::OpName::simm16, NewEnc);
2002 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
 // NOTE(review): this uses the applyWaitcnt(Wait, CT) overload while the
 // LOADCNT_DSCNT path above uses applyWaitcnt(CT, Count); presumably
 // equivalent - confirm.
2003 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::STORE_CNT);
2004 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
2005 Wait.set(AMDGPU::STORE_CNT, ~0u);
2006 Wait.set(AMDGPU::DS_CNT, ~0u);
2007
2008 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2009 << "New Instr at block end: "
2010 << *CombinedStoreDsCntInstr << '\n'
2011 : dbgs() << "applied pre-existing waitcnt\n"
2012 << "Old Instr: " << *It << "New Instr: "
2013 << *CombinedStoreDsCntInstr << '\n');
2014 } else {
2015 CombinedStoreDsCntInstr->eraseFromParent();
2016 Modified = true;
2017 }
2018 }
2019
2020 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
2021 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
2022 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
2023 // instructions so that createNewWaitcnt() will create new combined
2024 // instructions to replace them.
2025
2026 if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
2027 // This is a vector of addresses in WaitInstrs pointing to instructions
2028 // that should be removed if they are present.
 // NOTE(review): listing line 2029 (the declaration of WaitsToErase) is
 // missing; from its use below it holds MachineInstr ** elements.
2030
2031 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
2032 // both) need to be waited for, ensure that there are no existing
2033 // individual wait count instructions for these.
2034
2035 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
2036 WaitsToErase.push_back(&WaitInstrs[AMDGPU::LOAD_CNT]);
2037 WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
2038 } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
2039 WaitsToErase.push_back(&WaitInstrs[AMDGPU::STORE_CNT]);
2040 WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
2041 }
2042
2043 for (MachineInstr **WI : WaitsToErase) {
2044 if (!*WI)
2045 continue;
2046
 // Null the WaitInstrs slot as well so the per-counter loop below does
 // not touch the erased instruction.
2047 (*WI)->eraseFromParent();
2048 *WI = nullptr;
2049 Modified = true;
2050 }
2051 }
2052
 // NOTE(review): listing line 2053 is missing. It must open a loop that
 // declares CT and is closed by the brace at listing line 2077; presumably
 // it iterates over every index of WaitInstrs - confirm upstream.
2054 if (!WaitInstrs[CT])
2055 continue;
2056
2057 unsigned NewCnt = Wait.get(CT);
2058 if (NewCnt != ~0u) {
2059 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
2060 AMDGPU::OpName::simm16, NewCnt);
2061 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2062
2063 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2064 Wait.clear(CT);
2065
2066 LLVM_DEBUG(It.isEnd()
2067 ? dbgs() << "applied pre-existing waitcnt\n"
2068 << "New Instr at block end: " << *WaitInstrs[CT]
2069 << '\n'
2070 : dbgs() << "applied pre-existing waitcnt\n"
2071 << "Old Instr: " << *It
2072 << "New Instr: " << *WaitInstrs[CT] << '\n');
2073 } else {
 // Nothing left to wait for on this counter: drop the instruction.
2074 WaitInstrs[CT]->eraseFromParent();
2075 Modified = true;
2076 }
2077 }
2078
2079 if (WaitcntDepctrInstr) {
2080 // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
2081 // subfields with the new required values.
2082 unsigned Enc =
2083 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2084 ->getImm();
 // NOTE(review): listing lines 2085-2086 are missing. Per the comment
 // above they should write the VA_VDST and VM_VSRC values from Wait into
 // Enc; as shown, Enc is still the instruction's original immediate.
2087
2088 ScoreBrackets.applyWaitcnt(AMDGPU::VA_VDST, Wait.get(AMDGPU::VA_VDST));
2089 ScoreBrackets.applyWaitcnt(AMDGPU::VM_VSRC, Wait.get(AMDGPU::VM_VSRC));
2090 Wait.set(AMDGPU::VA_VDST, ~0u);
2091 Wait.set(AMDGPU::VM_VSRC, ~0u);
2092
2093 // If that new encoded Depctr immediate would actually still wait
2094 // for anything, update the instruction's operand. Otherwise it can
2095 // just be deleted.
2096 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
2097 Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
2098 AMDGPU::OpName::simm16, Enc);
2099 LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
2100 << "New Instr at block end: "
2101 << *WaitcntDepctrInstr << '\n'
2102 : dbgs() << "applyPreexistingWaitcnt\n"
2103 << "Old Instr: " << *It << "New Instr: "
2104 << *WaitcntDepctrInstr << '\n');
2105 } else {
2106 WaitcntDepctrInstr->eraseFromParent();
2107 Modified = true;
2108 }
2109 }
2110
2111 return Modified;
2112}
2113
2114/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
///
/// New instructions are built before \p It in \p Block. With
/// ExpandWaitcntProfiling enabled, each requested counter is emitted on its
/// own (expanded through EmitExpandedWaitcnt() unless the counter is
/// out-of-order) and the function returns early. Otherwise a combined
/// S_WAIT_LOADCNT_DSCNT or S_WAIT_STORECNT_DSCNT is preferred when DS_CNT and
/// one of LOAD_CNT / STORE_CNT are both requested, the remaining counters get
/// one instruction each, and an S_WAITCNT_DEPCTR is added when
/// Wait.hasWaitDepctr() holds.
///
/// \p Wait is taken by value, so the resets to ~0u below are local
/// bookkeeping only. \returns true if any instruction was created.
///
/// NOTE(review): this text comes from a rendered source listing. The embedded
/// listing numbers skip 2126, 2185 and 2204-2205, so those statements are
/// absent below. Each gap is marked; restore them from the upstream file.
2115bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2116 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2117 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
2118 assert(!isNormalMode(MaxCounter));
2119
2120 bool Modified = false;
2121 const DebugLoc &DL = Block.findDebugLoc(It);
2122
2123 // For GFX12+, we use separate wait instructions, which makes expansion
2124 // simpler
2125 if (ExpandWaitcntProfiling) {
 // NOTE(review): listing line 2126 is missing. It must open the loop that
 // declares CT and is closed by the brace at listing line 2146.
2127 unsigned Count = Wait.get(CT);
2128 if (Count == ~0u)
2129 continue;
2130
2131 // Skip expansion for out-of-order counters - emit normal wait instead
2132 if (ScoreBrackets.counterOutOfOrder(CT)) {
2133 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2134 .addImm(Count);
2135 Modified = true;
2136 continue;
2137 }
2138
 // Clamp the outstanding count to the largest value the counter can hold.
2139 unsigned Outstanding =
2140 std::min(ScoreBrackets.getOutstanding(CT), getLimit(CT) - 1);
2141 EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
2142 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2143 .addImm(Val);
2144 });
2145 Modified = true;
2146 }
 // NOTE(review): this return happens before the hasWaitDepctr() handling
 // at the end of the function, so no S_WAITCNT_DEPCTR is built on the
 // profiling path unless the missing loop header above also covers those
 // counters - confirm this is intended when expert mode is enabled.
2147 return Modified;
2148 }
2149
2150 // Normal behavior (no expansion)
2151 // Check for opportunities to use combined wait instructions.
2152 if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
2153 MachineInstr *SWaitInst = nullptr;
2154
 // LOAD_CNT takes precedence: when LOAD_CNT, STORE_CNT and DS_CNT are all
 // requested, STORE_CNT is left for the per-counter loop below.
2155 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
2156 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2157
2158 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2159 .addImm(Enc);
2160
2161 Wait.set(AMDGPU::LOAD_CNT, ~0u);
2162 Wait.set(AMDGPU::DS_CNT, ~0u);
2163 } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
2164 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2165
2166 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_STORECNT_DSCNT))
2167 .addImm(Enc);
2168
2169 Wait.set(AMDGPU::STORE_CNT, ~0u);
2170 Wait.set(AMDGPU::DS_CNT, ~0u);
2171 }
2172
2173 if (SWaitInst) {
2174 Modified = true;
2175
2176 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2177 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2178 dbgs() << "New Instr: " << *SWaitInst << '\n');
2179 }
2180 }
2181
2182 // Generate an instruction for any remaining counter that needs
2183 // waiting for.
2184
 // NOTE(review): listing line 2185 is missing. It must open the loop that
 // declares CT and is closed by the brace at listing line 2199.
2186 unsigned Count = Wait.get(CT);
2187 if (Count == ~0u)
2188 continue;
2189
2190 [[maybe_unused]] auto SWaitInst =
2191 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2192 .addImm(Count);
2193
2194 Modified = true;
2195
2196 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2197 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2198 dbgs() << "New Instr: " << *SWaitInst << '\n');
2199 }
2200
2201 if (Wait.hasWaitDepctr()) {
 // A depctr wait is only expected in expert mode.
2202 assert(IsExpertMode);
2203 unsigned Enc =
 // NOTE(review): listing lines 2204-2205 (the initializer of Enc) are
 // missing; presumably they encode the VA_VDST / VM_VSRC values of Wait
 // into a depctr immediate - confirm upstream.
2206
2207 [[maybe_unused]] auto SWaitInst =
2208 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);
2209
2210 Modified = true;
2211
2212 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
2213 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2214 dbgs() << "New Instr: " << *SWaitInst << '\n');
2215 }
2216
2217 return Modified;
2218}
2219
2220/// Generate s_waitcnt instruction to be placed before cur_Inst.
2221/// Instructions of a given type are returned in order,
2222/// but instructions of different types can complete out of order.
2223/// We rely on this in-order completion
2224/// and simply assign a score to the memory access instructions.
2225/// We keep track of the active "score bracket" to determine
2226/// if an access of a memory read requires an s_waitcnt
2227/// and if so what the value of each counter is.
2228/// The "score bracket" is bound by the lower bound and upper bound
2229/// scores (*_score_LB and *_score_ub respectively).
2230/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
2231/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
2232/// (GFX12+ only, where DS_CNT is a separate counter).
///
/// Structure of the body:
///  1. A switch on the opcode computes the initial required wait (buffer
///     invalidates, returns, S_ENDPGM, S_SENDMSG, and a default case that
///     inspects EXEC writes, GDS, calls, memory operands and register
///     operands).
///  2. Barrier and VCCZ-bug adjustments are applied.
///  3. The wait is simplified against \p ScoreBrackets, then VA_VDST / X_CNT
///     are dropped where the instruction itself makes them unnecessary.
///  4. Force-emit options and \p FlushFlags are applied.
///  5. The result is handed to generateWaitcnt().
///
/// \param MI               Instruction the waits are generated for; must not
///                         be a non-waitcnt meta instruction.
/// \param ScoreBrackets    Pending-event state; queried and updated.
/// \param OldWaitcntInstr  Forwarded unchanged to generateWaitcnt().
/// \param FlushFlags       Requests to flush vmcnt / dscnt at this point.
/// \returns the result of generateWaitcnt().
///
/// NOTE(review): this text comes from a rendered source listing. The embedded
/// listing numbers skip 2295, 2369, 2434, 2515 and 2522-2523, so those
/// statements are absent below. Each gap is marked; restore them from the
/// upstream file before compiling.
2233bool SIInsertWaitcnts::generateWaitcntInstBefore(
2234 MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
2235 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2236 LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
2237
2238 assert(!isNonWaitcntMetaInst(MI));
2239
2240 AMDGPU::Waitcnt Wait;
2241 const unsigned Opc = MI.getOpcode();
2242
2243 switch (Opc) {
2244 case AMDGPU::BUFFER_WBINVL1:
2245 case AMDGPU::BUFFER_WBINVL1_SC:
2246 case AMDGPU::BUFFER_WBINVL1_VOL:
2247 case AMDGPU::BUFFER_GL0_INV:
2248 case AMDGPU::BUFFER_GL1_INV: {
2249 // FIXME: This should have already been handled by the memory legalizer.
2250 // Removing this currently doesn't affect any lit tests, but we need to
2251 // verify that nothing was relying on this. The number of buffer invalidates
2252 // being handled here should not be expanded.
2253 Wait.set(AMDGPU::LOAD_CNT, 0);
2254 break;
2255 }
2256 case AMDGPU::SI_RETURN_TO_EPILOG:
2257 case AMDGPU::SI_RETURN:
2258 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2259 case AMDGPU::S_SETPC_B64_return: {
2260 // All waits must be resolved at call return.
2261 // NOTE: this could be improved with knowledge of all call sites or
2262 // with knowledge of the called routines.
 // The return instruction is also recorded in ReturnInsts.
2263 ReturnInsts.insert(&MI);
2264 AMDGPU::Waitcnt AllZeroWait =
2265 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2266 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2267 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2268 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2269 // no need to wait for it at function boundaries.
2270 if (ST.hasExtendedWaitCounts() &&
2271 !ScoreBrackets.hasPendingEvent(HWEvents::VMEM_READ_ACCESS))
2272 AllZeroWait.set(AMDGPU::LOAD_CNT, ~0u);
2273 Wait = AllZeroWait;
2274 break;
2275 }
2276 case AMDGPU::S_ENDPGM:
2277 case AMDGPU::S_ENDPGM_SAVED: {
2278 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2279 // Technically the hardware will do this on its own if we don't, but that
2280 // might cost extra cycles compared to doing it explicitly.
2281 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2282 // have to wait for outstanding VMEM stores. In this case it can be useful
2283 // to send a message to explicitly release all VGPRs before the stores have
2284 // completed, but it is only safe to do this if there are no outstanding
2285 // scratch stores.
 // No wait is requested here; only the flag in EndPgmInsts is recorded.
2286 EndPgmInsts[&MI] =
2287 !ScoreBrackets.empty(AMDGPU::STORE_CNT) &&
2288 !ScoreBrackets.hasPendingEvent(HWEvents::SCRATCH_WRITE_ACCESS);
2289 break;
2290 }
2291 case AMDGPU::S_SENDMSG:
2292 case AMDGPU::S_SENDMSGHALT: {
2293 if (ST.hasLegacyGeometry() &&
2294 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
 // NOTE(review): listing line 2295 is missing. It holds the message ID the
 // masked operand is compared against and closes the condition; per the
 // comment below this is the "gs-done" message - confirm upstream.
2296 // Resolve vm waits before gs-done.
2297 Wait.set(AMDGPU::LOAD_CNT, 0);
2298 break;
2299 }
 // Any other message is handled like an ordinary instruction.
2300 [[fallthrough]];
2301 }
2302 default: {
2303
2304 // Export & GDS instructions do not read the EXEC mask until after the
2305 // export is granted (which can occur well after the instruction is issued).
2306 // The shader program must flush all EXP operations on the export-count
2307 // before overwriting the EXEC mask.
2308 if (MI.modifiesRegister(AMDGPU::EXEC, &TRI)) {
2309 // Export and GDS are tracked individually, either may trigger a waitcnt
2310 // for EXEC.
2311 if (ScoreBrackets.hasPendingEvent(HWEvents::EXP_GPR_LOCK) ||
2312 ScoreBrackets.hasPendingEvent(HWEvents::EXP_PARAM_ACCESS) ||
2313 ScoreBrackets.hasPendingEvent(HWEvents::EXP_POS_ACCESS) ||
2314 ScoreBrackets.hasPendingEvent(HWEvents::GDS_GPR_LOCK)) {
2315 Wait.set(AMDGPU::EXP_CNT, 0);
2316 }
2317 }
2318
2319 // Wait for any pending GDS instruction to complete before any
2320 // "Always GDS" instruction.
2321 if (TII.isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
2322 Wait.add(AMDGPU::DS_CNT, ScoreBrackets.getPendingGDSWait());
2323
2324 if (MI.isCall()) {
2325 // The function is going to insert a wait on everything in its prolog.
2326 // This still needs to be careful if the call target is a load (e.g. a GOT
2327 // load). We also need to check WAW dependency with saved PC.
2328 CallInsts.insert(&MI);
 // Discards any EXP_CNT / DS_CNT waits computed above for this call.
2329 Wait = AMDGPU::Waitcnt();
2330
2331 const MachineOperand &CallAddrOp = TII.getCalleeOperand(MI);
2332 if (CallAddrOp.isReg()) {
2333 ScoreBrackets.determineWaitForPhysReg(
2334 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait, MI);
2335
2336 if (const auto *RtnAddrOp =
2337 TII.getNamedOperand(MI, AMDGPU::OpName::dst)) {
2338 ScoreBrackets.determineWaitForPhysReg(
2339 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait, MI);
2340 }
2341 }
2342 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2343 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2344 } else {
2345 // FIXME: Should not be relying on memoperands.
2346 // Look at the source operands of every instruction to see if
2347 // any of them results from a previous memory operation that affects
2348 // its current usage. If so, an s_waitcnt instruction needs to be
2349 // emitted.
2350 // If the source operand was defined by a load, add the s_waitcnt
2351 // instruction.
2352 //
2353 // Two cases are handled for destination operands:
2354 // 1) If the destination operand was defined by a load, add the s_waitcnt
2355 // instruction to guarantee the right WAW order.
2356 // 2) If a destination operand that was used by a recent export/store ins,
2357 // add s_waitcnt on exp_cnt to guarantee the WAR order.
2358
2359 for (const MachineMemOperand *Memop : MI.memoperands()) {
2360 const Value *Ptr = Memop->getValue();
2361 if (Memop->isStore()) {
 // A store to an address recorded in SLoadAddresses waits for the
 // scalar-memory counter. The entry is dropped once this block
 // post-dominates the block recorded with it.
2362 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2363 Wait.add(SmemAccessCounter, 0);
2364 if (PDT.dominates(MI.getParent(), It->second))
2365 SLoadAddresses.erase(It);
2366 }
2367 }
2368 unsigned AS = Memop->getAddrSpace();
 // NOTE(review): listing line 2369 is missing. It holds the condition on
 // AS that guards the "continue" below; presumably it skips memory
 // operands that cannot address LDS - confirm upstream.
2370 continue;
2371 // No need to wait before load from VMEM to LDS.
2372 if (TII.mayWriteLDSThroughDMA(MI))
2373 continue;
2374
2375 // LOAD_CNT is only relevant to vgpr or LDS.
2376 unsigned TID = LDSDMA_BEGIN;
2377 if (Ptr && Memop->getAAInfo()) {
 // With alias info available, wait only on the tracked LDS DMA stores
 // that may alias this access.
2378 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2379 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2380 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2381 if ((I + 1) >= NUM_LDSDMA) {
2382 // We didn't have enough slot to track this LDS DMA store, it
2383 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2384 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID,
2385 Wait);
2386 break;
2387 }
2388
2389 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT,
2390 TID + I + 1, Wait);
2391 }
2392 }
2393 } else {
 // No alias info: wait on the common LDS DMA slot.
2394 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID, Wait);
2395 }
2396 if (Memop->isStore()) {
2397 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::EXP_CNT, TID, Wait);
2398 }
2399 }
2400
2401 // Loop over use and def operands.
2402 for (const MachineOperand &Op : MI.operands()) {
2403 if (!Op.isReg())
2404 continue;
2405
2406 // If the instruction does not read tied source, skip the operand.
2407 if (Op.isTied() && Op.isUse() && TII.doesNotReadTiedSource(MI))
2408 continue;
2409
2410 MCPhysReg Reg = Op.getReg().asMCReg();
2411
2412 const bool IsVGPR = TRI.isVectorRegister(MRI, Op.getReg());
2413 if (IsVGPR) {
2414 // Implicit VGPR defs and uses are never a part of the memory
2415 // instructions description and usually present to account for
2416 // super-register liveness.
2417 // TODO: Most of the other instructions also have implicit uses
2418 // for the liveness accounting only.
2419 if (Op.isImplicit() && MI.mayLoadOrStore())
2420 continue;
2421
2422 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VA_VDST, Reg, Wait, MI);
2423 if (Op.isDef())
2424 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VM_VSRC, Reg, Wait,
2425 MI);
2426 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2427 // previous write and this write are the same type of VMEM
2428 // instruction, in which case they are (in some architectures)
2429 // guaranteed to write their results in order anyway.
2430 // Additionally check instructions where Point Sample Acceleration
2431 // might be applied.
2432 if (Op.isUse() || !updateVMCntOnly(MI) ||
2433 ScoreBrackets.hasDifferentVGPRPendingEvents(
 // NOTE(review): listing line 2434 is missing. It holds the argument
 // list of hasDifferentVGPRPendingEvents(...) and the following "||" -
 // restore from upstream.
2435 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2436 !ST.hasVmemWriteVgprInOrder()) {
2437 ScoreBrackets.determineWaitForPhysReg(AMDGPU::LOAD_CNT, Reg, Wait,
2438 MI);
2439 ScoreBrackets.determineWaitForPhysReg(AMDGPU::SAMPLE_CNT, Reg, Wait,
2440 MI);
2441 ScoreBrackets.determineWaitForPhysReg(AMDGPU::BVH_CNT, Reg, Wait,
2442 MI);
2443 ScoreBrackets.clearVGPRPendingEvents(Reg);
2444 }
2445
2446 if (Op.isDef() ||
2447 ScoreBrackets.hasPendingEvent(HWEvents::EXP_LDS_ACCESS)) {
2448 ScoreBrackets.determineWaitForPhysReg(AMDGPU::EXP_CNT, Reg, Wait,
2449 MI);
2450 }
2451 ScoreBrackets.determineWaitForPhysReg(AMDGPU::DS_CNT, Reg, Wait, MI);
2452 } else if (Op.getReg() == AMDGPU::SCC) {
2453 ScoreBrackets.determineWaitForPhysReg(AMDGPU::KM_CNT, Reg, Wait, MI);
2454 } else {
 // Any other (non-vector, non-SCC) register is checked against the
 // scalar-memory counter.
2455 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait,
2456 MI);
2457 }
2458
2459 if (ST.hasWaitXcnt() && Op.isDef())
2460 ScoreBrackets.determineWaitForPhysReg(AMDGPU::X_CNT, Reg, Wait, MI);
2461 }
2462 }
2463 }
2464 }
2465
2466 // Ensure safety against exceptions from outstanding memory operations while
2467 // waiting for a barrier:
2468 //
2469 // * Some subtargets safely handle backing off the barrier in hardware
2470 // when an exception occurs.
2471 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2472 // there can be no outstanding memory operations during the wait.
2473 // * Subtargets with split barriers don't need to back off the barrier; it
2474 // is up to the trap handler to preserve the user barrier state correctly.
2475 //
2476 // In all other cases, ensure safety by ensuring that there are no outstanding
2477 // memory operations.
2478 if (Opc == AMDGPU::S_BARRIER && !ST.hasAutoWaitcntBeforeBarrier() &&
2479 !ST.hasBackOffBarrier()) {
2480 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2481 }
2482
2483 // TODO: Remove this work-around, enable the assert for Bug 457939
2484 // after fixing the scheduler. Also, the Shader Compiler code is
2485 // independent of target.
2486 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST.hasReadVCCZBug() &&
2487 ScoreBrackets.hasPendingEvent(HWEvents::SMEM_ACCESS)) {
2488 Wait.set(AMDGPU::DS_CNT, 0);
2489 }
2490
2491 // Verify that the wait is actually needed.
2492 ScoreBrackets.simplifyWaitcnt(Wait);
2493
2494 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2495 // waits on VA_VDST if the instruction it would precede is not a VALU
2496 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2497 // expert scheduling mode.
2498 if (TII.isVALU(MI, /*AllowLDSDMA=*/true) && !SIInstrInfo::isLDSDMA(MI))
2499 Wait.set(AMDGPU::VA_VDST, ~0u);
2500
2501 // Since the translation for VMEM addresses occur in-order, we can apply the
2502 // XCnt if the current instruction is of VMEM type and has a memory
2503 // dependency with another VMEM instruction in flight.
2504 if (Wait.get(AMDGPU::X_CNT) != ~0u && isVmemAccess(MI)) {
2505 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::X_CNT);
2506 Wait.set(AMDGPU::X_CNT, ~0u);
2507 }
2508
2509 // When forcing emit, we need to skip terminators because that would break the
2510 // terminators of the MBB if we emit a waitcnt between terminators.
 // Note this replaces, rather than combines with, the wait computed so far.
2511 if (ForceEmitZeroFlag && !MI.isTerminator())
2512 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2513
2514 // If we force waitcnt then update Wait accordingly.
 // NOTE(review): listing line 2515 is missing. It must open the loop that
 // declares T and is closed by the brace at listing line 2519.
2516 if (!ForceEmitWaitcnt[T])
2517 continue;
2518 Wait.set(T, 0);
2519 }
2520
2521 if (FlushFlags.FlushVmCnt) {
 // NOTE(review): listing lines 2522-2523 are missing. They declare T for
 // the statement below; presumably a loop over the VMEM-related counters
 // with a pending-event check - confirm upstream.
2524 Wait.set(T, 0);
2525 }
2526
2527 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
2528 Wait.set(AMDGPU::DS_CNT, 0);
2529
 // Tighten an already-requested LOAD_CNT wait to zero; never adds a new one.
2530 if (ForceEmitZeroLoadFlag && Wait.get(AMDGPU::LOAD_CNT) != ~0u)
2531 Wait.set(AMDGPU::LOAD_CNT, 0);
2532
2533 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2534 OldWaitcntInstr);
2535}
2536
2537bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2539 MachineBasicBlock &Block,
2540 WaitcntBrackets &ScoreBrackets,
2541 MachineInstr *OldWaitcntInstr) {
2542 bool Modified = false;
2543
2544 if (OldWaitcntInstr)
2545 // Try to merge the required wait with preexisting waitcnt instructions.
2546 // Also erase redundant waitcnt.
2547 Modified =
2548 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2549
2550 // ExpCnt can be merged into VINTERP.
2551 if (Wait.get(AMDGPU::EXP_CNT) != ~0u && It != Block.instr_end() &&
2553 MachineOperand *WaitExp = TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
2554 if (Wait.get(AMDGPU::EXP_CNT) < WaitExp->getImm()) {
2555 WaitExp->setImm(Wait.get(AMDGPU::EXP_CNT));
2556 Modified = true;
2557 }
2558 // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
2559 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
2560 Wait.set(AMDGPU::EXP_CNT, ~0u);
2561
2562 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2563 << "Update Instr: " << *It);
2564 }
2565
2566 if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
2567 Modified = true;
2568
2569 // Any counts that could have been applied to any existing waitcnt
2570 // instructions will have been done so, now deal with any remaining.
2571 ScoreBrackets.applyWaitcnt(Wait);
2572
2573 return Modified;
2574}
2575
2576bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2577 return (TII.isFLAT(MI) && TII.mayAccessVMEMThroughFlat(MI)) ||
2578 (TII.isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2579}
2580
2581// Return true if the next instruction is S_ENDPGM, following fallthrough
2582// blocks if necessary.
2583bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2584 MachineBasicBlock *Block) const {
2585 auto BlockEnd = Block->getParent()->end();
2586 auto BlockIter = Block->getIterator();
2587
2588 while (true) {
2589 if (It.isEnd()) {
2590 if (++BlockIter != BlockEnd) {
2591 It = BlockIter->instr_begin();
2592 continue;
2593 }
2594
2595 return false;
2596 }
2597
2598 if (!It->isMetaInstruction())
2599 break;
2600
2601 It++;
2602 }
2603
2604 assert(!It.isEnd());
2605
2606 return It->getOpcode() == AMDGPU::S_ENDPGM;
2607}
2608
2609// Add a wait after an instruction if architecture requirements mandate one.
//
// Two situations are handled:
//  * precise-memory mode is enabled and Inst may load or store: request an
//    all-zero wait;
//  * Inst is an "always GDS" instruction: request DS_CNT == 0.
// The wait is simplified against ScoreBrackets and emitted directly after
// Inst. Returns the result of generateWaitcnt().
//
// NOTE(review): this text comes from a rendered source listing. The embedded
// listing numbers skip 2618, so that line is absent below (marked).
2610bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2611 MachineBasicBlock &Block,
2612 WaitcntBrackets &ScoreBrackets) {
2613 AMDGPU::Waitcnt Wait;
2614 bool NeedsEndPGMCheck = false;
2615
2616 if (ST.isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2617 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
 // NOTE(review): listing line 2618 is missing. It holds the second operand
 // of the "&&" above and closes the call - restore from upstream.
2619
2620 if (TII.isAlwaysGDS(Inst.getOpcode())) {
2621 Wait.set(AMDGPU::DS_CNT, 0);
2622 NeedsEndPGMCheck = true;
2623 }
2624
2625 ScoreBrackets.simplifyWaitcnt(Wait);
2626
 // Insert before the instruction that follows Inst; there is no preexisting
 // wait to merge with at this position.
2627 auto SuccessorIt = std::next(Inst.getIterator());
2628 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2629 /*OldWaitcntInstr=*/nullptr);
2630
 // If a wait was emitted for an always-GDS instruction and the next real
 // instruction is S_ENDPGM, add an "S_NOP 0" before SuccessorIt, i.e. after
 // the wait that was just inserted.
2631 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2632 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII.get(AMDGPU::S_NOP))
2633 .addImm(0);
2634 }
2635
2636 return Result;
2637}
2638
2639void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2640 WaitcntBrackets *ScoreBrackets) {
2641
2642 HWEvents InstEvents = AMDGPU::getEventsFor(Inst, ST, IsExpertMode);
2643 for (HWEvents E : InstEvents)
2644 ScoreBrackets->updateByEvent(E, Inst);
2645
2646 if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
2647 if (TII.isAlwaysGDS(Inst.getOpcode()) ||
2648 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2649 ScoreBrackets->setPendingGDS();
2650 }
2651 } else if (TII.isFLAT(Inst)) {
2652 if (Inst.mayLoadOrStore() && TII.mayAccessVMEMThroughFlat(Inst) &&
2653 TII.mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst)) {
2654 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
2655 // pointers. They do have two operands that each access global and LDS,
2656 // thus making it appear at this point that they are using a flat pointer.
2657 // Filter them out, and for the rest, generate a dependency on flat
2658 // pointers so that both VM and LGKM counters are flushed.
2659 ScoreBrackets->setPendingFlat();
2660 }
2661 } else if (Inst.isCall()) {
2662 // Act as a wait on everything, but AsyncCnt and TensorCnt are never
2663 // included in such blanket waits.
2664 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2665 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2666 } else if (TII.isVINTERP(Inst)) {
2667 int64_t Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2668 ScoreBrackets->applyWaitcnt(AMDGPU::EXP_CNT, Imm);
2669 }
2670
2671 // Set XCNT to zero in the bracket for instructions that implicitly drain
2672 // XCNT.
2673 if (ST.hasWaitXcnt() && SIInstrInfo::isXcntDrain(Inst))
2674 ScoreBrackets->applyWaitcnt(AMDGPU::X_CNT, 0);
2675}
2676
2677bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2678 unsigned OtherScore) {
2679 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2680 unsigned OtherShifted =
2681 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2682 Score = std::max(MyShifted, OtherShifted);
2683 return OtherShifted > MyShifted;
2684}
2685
2686bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
2687 ArrayRef<CounterValueArray> OtherMarks) {
2688 bool StrictDom = false;
2689
2690 LLVM_DEBUG(dbgs() << "Merging async marks ...");
2691 // Early exit: nothing to merge when both sides are empty.
2692 if (AsyncMarks.empty() && OtherMarks.empty()) {
2693 LLVM_DEBUG(dbgs() << " nothing to merge\n");
2694 return false;
2695 }
2696 LLVM_DEBUG(dbgs() << '\n');
2697
2698 // Determine maximum length needed after merging
2699 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size());
2700 MaxSize = std::min(MaxSize, MaxAsyncMarks);
2701
2702 // Keep only the most recent marks within our limit.
2703 if (AsyncMarks.size() > MaxSize)
2704 AsyncMarks.erase(AsyncMarks.begin(),
2705 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
2706
2707 // Pad with zero-filled marks if our list is shorter. Zero represents "no
2708 // pending async operations at this checkpoint" and acts as the identity
2709 // element for max() during merging. We pad at the beginning since the marks
2710 // need to be aligned in most-recent order.
2711 constexpr CounterValueArray ZeroMark{};
2712 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
2713
2714 LLVM_DEBUG({
2715 dbgs() << "Before merge:\n";
2716 for (const auto &Mark : AsyncMarks) {
2717 llvm::interleaveComma(Mark, dbgs());
2718 dbgs() << '\n';
2719 }
2720 dbgs() << "Other marks:\n";
2721 for (const auto &Mark : OtherMarks) {
2722 llvm::interleaveComma(Mark, dbgs());
2723 dbgs() << '\n';
2724 }
2725 });
2726
2727 // Merge element-wise using the existing mergeScore function and the
2728 // appropriate MergeInfo for each counter type. Iterate only while we have
2729 // elements in both vectors.
2730 unsigned OtherSize = OtherMarks.size();
2731 unsigned OurSize = AsyncMarks.size();
2732 unsigned MergeCount = std::min(OtherSize, OurSize);
2733 // OtherMarks is empty -> OtherSize == 0 -> MergeCount == 0.
2734 // Our existing marks are the conservative result; return early to avoid
2735 // passing MergeCount == 0 to seq_inclusive which asserts Begin <= End.
2736 if (MergeCount == 0)
2737 return StrictDom;
2738 for (auto Idx : seq_inclusive<unsigned>(1, MergeCount)) {
2739 for (auto T : inst_counter_types(Context->MaxCounter)) {
2740 StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
2741 OtherMarks[OtherSize - Idx][T]);
2742 }
2743 }
2744
2745 LLVM_DEBUG({
2746 dbgs() << "After merge:\n";
2747 for (const auto &Mark : AsyncMarks) {
2748 llvm::interleaveComma(Mark, dbgs());
2749 dbgs() << '\n';
2750 }
2751 });
2752
2753 return StrictDom;
2754}
2755
2756/// Merge the pending events and associated score brackets of \p Other into
2757/// this brackets status.
2758///
2759/// Returns whether the merge resulted in a change that requires tighter waits
2760/// (i.e. the merged brackets strictly dominate the original brackets).
2761bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2762 bool StrictDom = false;
2763
2764 // Check if "other" has keys we don't have, and create default entries for
2765 // those. If they remain empty after merging, we will clean it up after.
2766 for (auto K : Other.VMem.keys())
2767 VMem.try_emplace(K);
2768 for (auto K : Other.SGPRs.keys())
2769 SGPRs.try_emplace(K);
2770
2771 // Array to store MergeInfo for each counter type
 // (filled in the loop below and reused afterwards for the async marks and
 // async scores).
2772 MergeInfo MergeInfos[AMDGPU::NUM_INST_CNTS];
2773
2774 for (auto T : inst_counter_types(Context->MaxCounter)) {
2775 // Merge event flags for this counter
 // Any event pending in Other but not here makes the result strictly
 // dominate the old state.
2776 const HWEvents &EventsForT = Context->getWaitEvents(T);
2777 const HWEvents OldEvents = PendingEvents & EventsForT;
2778 const HWEvents OtherEvents = Other.PendingEvents & EventsForT;
2779 if (!OldEvents.contains(OtherEvents))
2780 StrictDom = true;
2781 PendingEvents |= OtherEvents;
2782
2783 // Merge scores for this counter
 // The merged range keeps our lower bound and spans the larger of the two
 // pending counts; an unsigned wrap-around of the new upper bound is fatal.
2784 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2785 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2786 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2787 if (NewUB < ScoreLBs[T])
2788 report_fatal_error("waitcnt score overflow");
2789
 // Record the lower bounds and the offsets that rebase each side's scores
 // onto the new upper bound; mergeScore() consumes these.
2790 MergeInfo &M = MergeInfos[T];
2791 M.OldLB = ScoreLBs[T];
2792 M.OtherLB = Other.ScoreLBs[T];
2793 M.MyShift = NewUB - ScoreUBs[T];
2794 M.OtherShift = NewUB - Other.ScoreUBs[T];
2795
2796 ScoreUBs[T] = NewUB;
2797
 // Scalar scores that are tracked against one specific counter.
2798 if (T == AMDGPU::LOAD_CNT)
2799 StrictDom |= mergeScore(M, LastFlatLoadCnt, Other.LastFlatLoadCnt);
2800
2801 if (T == AMDGPU::DS_CNT) {
2802 StrictDom |= mergeScore(M, LastFlatDsCnt, Other.LastFlatDsCnt);
2803 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
2804 }
2805
2806 if (T == AMDGPU::KM_CNT) {
2807 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
 // If only Other has a pending SCC write, adopt its PendingSCCWrite; if
 // both sides have one and they differ, reset it to nullptr.
2808 if (Other.hasPendingEvent(HWEvents::SCC_WRITE)) {
2809 if (!(OldEvents & HWEvents::SCC_WRITE)) {
2810 PendingSCCWrite = Other.PendingSCCWrite;
2811 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
2812 PendingSCCWrite = nullptr;
2813 }
2814 }
2815 }
2816
 // Per-register scores. Entries missing in Other were default-created
 // above, so every key of Other is visited here.
2817 for (auto &[RegID, Info] : VMem)
2818 StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
2819
2820 if (isSmemCounter(T)) {
2821 for (auto &[RegID, Info] : SGPRs) {
 // An SGPR that Other does not track contributes a score of 0.
2822 auto It = Other.SGPRs.find(RegID);
2823 unsigned OtherScore = (It != Other.SGPRs.end()) ? It->second.get(T) : 0;
2824 StrictDom |= mergeScore(M, Info.get(T), OtherScore);
2825 }
2826 }
2827 }
2828
 // Union the per-register pending-event sets; any newly added event counts
 // as a strict change.
2829 for (auto &[TID, Info] : VMem) {
2830 if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
2831 HWEvents NewVGPRContext =
2832 Info.VGPRPendingEvents | It->second.VGPRPendingEvents;
2833 StrictDom |= NewVGPRContext != Info.VGPRPendingEvents;
2834 Info.VGPRPendingEvents = NewVGPRContext;
2835 }
2836 }
2837
 // Async state is merged with the same per-counter MergeInfo as above.
2838 StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks);
2839 for (auto T : inst_counter_types(Context->MaxCounter))
2840 StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]);
2841
2842 purgeEmptyTrackingData();
2843 return StrictDom;
2844}
2845
2846static bool isWaitInstr(MachineInstr &Inst) {
2847 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2848 return Opcode == AMDGPU::S_WAITCNT ||
2849 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2850 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2851 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2852 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2853 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2854 Opcode == AMDGPU::WAIT_ASYNCMARK ||
2855 AMDGPU::counterTypeForInstr(Opcode).has_value();
2856}
2857
// Emit an S_SETREG_IMM32_B32 in \p MBB at position `I` whose first immediate
// is 2 when \p ExpertMode is set and 0 otherwise, and whose second immediate
// is the hardware-register encoding computed into EncodedReg.
//
// NOTE(review): this listing appears truncated. `I` is used below but its
// parameter declaration is not visible, and the argument list of
// HwregEncoding::encode() is missing. Restore both from the upstream file
// before relying on this text; presumably `I` is a
// MachineBasicBlock::iterator - confirm.
2858void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
2860 bool ExpertMode) const {
2861 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
2863 BuildMI(MBB, I, DebugLoc(), TII.get(AMDGPU::S_SETREG_IMM32_B32))
2864 .addImm(ExpertMode ? 2 : 0)
2865 .addImm(EncodedReg);
2866}
2867
2868namespace {
2869// TODO: Remove this work-around after fixing the scheduler.
2870// There are two reasons why vccz might be incorrect; see ST.hasReadVCCZBug()
2871// and ST.partialVCCWritesUpdateVCCZ().
2872// i. VCCZBug: There is a hardware bug on CI/SI where SMRD instruction may
2873// corrupt vccz bit, so when we detect that an instruction may read from
2874// a corrupt vccz bit, we need to:
2875// 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2876// operations to complete.
2877// 2. Recompute the correct value of vccz by writing the current value
2878// of vcc back to vcc.
2879// ii. Partial writes to vcc don't update vccz, so we need to recompute the
2880// correct value of vccz by reading vcc and writing it back to vcc.
2881// No waitcnt is needed in this case.
2882class VCCZWorkaround {
2883 const WaitcntBrackets &ScoreBrackets;
2884 const GCNSubtarget &ST;
2885 const SIInstrInfo &TII;
2886 const SIRegisterInfo &TRI;
2887 bool VCCZCorruptionBug = false;
2888 bool VCCZNotUpdatedByPartialWrites = false;
2889 /// vccz could be incorrect at a basic block boundary if a predecessor wrote
2890 /// to vcc and then issued an smem load, so initialize to true.
2891 bool MustRecomputeVCCZ = true;
2892
2893public:
2894 VCCZWorkaround(const WaitcntBrackets &ScoreBrackets, const GCNSubtarget &ST,
2895 const SIInstrInfo &TII, const SIRegisterInfo &TRI)
2896 : ScoreBrackets(ScoreBrackets), ST(ST), TII(TII), TRI(TRI) {
2897 VCCZCorruptionBug = ST.hasReadVCCZBug();
2898 VCCZNotUpdatedByPartialWrites = !ST.partialVCCWritesUpdateVCCZ();
2899 }
2900 /// If \p MI reads vccz and we must recompute it based on MustRecomputeVCCZ,
2901 /// then emit a vccz recompute instruction before \p MI. This needs to be
2902 /// called on every instruction in the basic block because it also tracks the
2903 /// state and updates MustRecomputeVCCZ accordingly. Returns true if it
2904 /// modified the IR.
2905 bool tryRecomputeVCCZ(MachineInstr &MI) {
2906 // No need to run this if neither bug is present.
2907 if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
2908 return false;
2909
2910 // If MI is an SMEM and it can corrupt vccz on this target, then we need
2911 // both to emit a waitcnt and to recompute vccz.
2912 // But we don't actually emit a waitcnt here. This is done in
2913 // generateWaitcntInstBefore() because it tracks all the necessary waitcnt
2914 // state, and can either skip emitting a waitcnt if there is already one in
2915 // the IR, or emit an "optimized" combined waitcnt.
2916 // If this is an smem read, it could complete and clobber vccz at any time.
2917 MustRecomputeVCCZ |= VCCZCorruptionBug && TII.isSMRD(MI);
2918
2919 // If the target partial vcc writes don't update vccz, and MI is such an
2920 // instruction then we must recompute vccz.
2921 // Note: We are using PartiallyWritesToVCCOpt optional to avoid calling
2922 // `definesRegister()` more than needed, because it's not very cheap.
2923 std::optional<bool> PartiallyWritesToVCCOpt;
2924 auto PartiallyWritesToVCC = [](MachineInstr &MI) {
2925 return MI.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2926 MI.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr);
2927 };
2928 if (VCCZNotUpdatedByPartialWrites) {
2929 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
2930 // If this is a partial VCC write but won't update vccz, then we must
2931 // recompute vccz.
2932 MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
2933 }
2934
2935 // If MI is a vcc write with no pending smem, or there is a pending smem
2936 // but the target does not suffer from the vccz corruption bug, then we
2937 // don't need to recompute vccz as this write will recompute it anyway.
2938 if (!ScoreBrackets.hasPendingEvent(HWEvents::SMEM_ACCESS) ||
2939 !VCCZCorruptionBug) {
2940 // Compute PartiallyWritesToVCCOpt if we haven't done so already.
2941 if (!PartiallyWritesToVCCOpt)
2942 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
2943 bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
2944 MI.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr);
2945 // If we write to the full vcc or we write partially and the target
2946 // updates vccz on partial writes, then vccz will be updated correctly.
2947 bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
2948 *PartiallyWritesToVCCOpt);
2949 if (UpdatesVCCZ)
2950 MustRecomputeVCCZ = false;
2951 }
2952
2953 // If MI is a branch that reads VCCZ then emit a waitcnt and a vccz
2954 // restore instruction if either is needed.
2955 if (SIInstrInfo::isCBranchVCCZRead(MI) && MustRecomputeVCCZ) {
2956 // Recompute the vccz bit. Any time a value is written to vcc, the vccz
2957 // bit is updated, so we can restore the bit by reading the value of vcc
2958 // and then writing it back to the register.
2959 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2960 TII.get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2961 TRI.getVCC())
2962 .addReg(TRI.getVCC());
2963 MustRecomputeVCCZ = false;
2964 return true;
2965 }
2966 return false;
2967 }
2968};
2969
2970} // namespace
2971
2972// Generate s_waitcnt instructions where needed.
2973bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2974 MachineBasicBlock &Block,
2975 WaitcntBrackets &ScoreBrackets) {
2976 bool Modified = false;
2977
2978 LLVM_DEBUG({
2979 dbgs() << "*** Begin Block: ";
2980 Block.printName(dbgs());
2981 ScoreBrackets.dump();
2982 });
2983 VCCZWorkaround VCCZW(ScoreBrackets, ST, TII, TRI);
2984
2985 // Walk over the instructions.
2986 MachineInstr *OldWaitcntInstr = nullptr;
2987
2988 // NOTE: We may append instrs after Inst while iterating.
2989 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2990 E = Block.instr_end();
2991 Iter != E; ++Iter) {
2992 MachineInstr &Inst = *Iter;
2993 if (isNonWaitcntMetaInst(Inst))
2994 continue;
2995 // Track pre-existing waitcnts that were added in earlier iterations or by
2996 // the memory legalizer.
2997 if (isWaitInstr(Inst) ||
2998 (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
2999 if (!OldWaitcntInstr)
3000 OldWaitcntInstr = &Inst;
3001 continue;
3002 }
3003
3004 PreheaderFlushFlags FlushFlags;
3005 if (Block.getFirstTerminator() == Inst)
3006 FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3007
3008 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
3009 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3010 FlushFlags);
3011 OldWaitcntInstr = nullptr;
3012
3013 if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
3014 // Asyncmarks record the current wait state and so should not allow
3015 // waitcnts that occur after them to be merged into waitcnts that occur
3016 // before.
3017 ScoreBrackets.recordAsyncMark(Inst);
3018 continue;
3019 }
3020
3021 if (TII.isSMRD(Inst)) {
3022 for (const MachineMemOperand *Memop : Inst.memoperands()) {
3023 // No need to handle invariant loads when avoiding WAR conflicts, as
3024 // there cannot be a vector store to the same memory location.
3025 if (!Memop->isInvariant()) {
3026 const Value *Ptr = Memop->getValue();
3027 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
3028 }
3029 }
3030 }
3031
3032 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3033
3034 // Note: insertForcedWaitAfter() may add instrs after Iter that need to be
3035 // visited by the loop.
3036 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
3037
3038 LLVM_DEBUG({
3039 Inst.print(dbgs());
3040 ScoreBrackets.dump();
3041 });
3042
3043 // If the target suffers from the vccz bugs, this may emit the necessary
3044 // vccz recompute instruction before \p Inst if needed.
3045 Modified |= VCCZW.tryRecomputeVCCZ(Inst);
3046 }
3047
3048 // Flush counters at the end of the block if needed (for preheaders with no
3049 // terminator).
3050 AMDGPU::Waitcnt Wait;
3051 if (Block.getFirstTerminator() == Block.end()) {
3052 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3053 if (FlushFlags.FlushVmCnt) {
3054 if (ScoreBrackets.hasPendingEvent(AMDGPU::LOAD_CNT))
3055 Wait.set(AMDGPU::LOAD_CNT, 0);
3056 if (ScoreBrackets.hasPendingEvent(AMDGPU::SAMPLE_CNT))
3057 Wait.set(AMDGPU::SAMPLE_CNT, 0);
3058 if (ScoreBrackets.hasPendingEvent(AMDGPU::BVH_CNT))
3059 Wait.set(AMDGPU::BVH_CNT, 0);
3060 }
3061 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
3062 Wait.set(AMDGPU::DS_CNT, 0);
3063 }
3064
3065 // Combine or remove any redundant waitcnts at the end of the block.
3066 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
3067 OldWaitcntInstr);
3068
3069 LLVM_DEBUG({
3070 dbgs() << "*** End Block: ";
3071 Block.printName(dbgs());
3072 ScoreBrackets.dump();
3073 });
3074
3075 return Modified;
3076}
3077
3078bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
3079 if (Block.size() <= 1)
3080 return false;
3081 // The Memory Legalizer conservatively inserts a soft xcnt before each
3082 // atomic RMW operation. However, for sequences of back-to-back atomic
3083 // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
3084 // the redundant soft xcnts.
3085 bool Modified = false;
3086 // Remember the last atomic with a soft xcnt right before it.
3087 MachineInstr *LastAtomicWithSoftXcnt = nullptr;
3088
3089 for (MachineInstr &MI : drop_begin(Block)) {
3090 // Ignore last atomic if non-LDS VMEM and SMEM.
3091 bool IsLDS =
3092 TII.isDS(MI) || (TII.isFLAT(MI) && TII.mayAccessLDSThroughFlat(MI));
3093 if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
3094 LastAtomicWithSoftXcnt = nullptr;
3095
3096 bool IsAtomicRMW =
3097 SIInstrFlags::isMaybeAtomic(MI) && MI.mayLoad() && MI.mayStore();
3098 MachineInstr &PrevMI = *MI.getPrevNode();
3099 // This is an atomic with a soft xcnt.
3100 if (PrevMI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3101 // If we have already found an atomic with a soft xcnt, remove this soft
3102 // xcnt as it's redundant.
3103 if (LastAtomicWithSoftXcnt) {
3104 PrevMI.eraseFromParent();
3105 Modified = true;
3106 }
3107 LastAtomicWithSoftXcnt = &MI;
3108 }
3109 }
3110 return Modified;
3111}
3112
3113// Return flags indicating which counters should be flushed in the preheader.
3114PreheaderFlushFlags
3115SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
3116 const WaitcntBrackets &ScoreBrackets) {
3117 auto [Iterator, IsInserted] =
3118 PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags());
3119 if (!IsInserted)
3120 return Iterator->second;
3121
3122 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3123 if (!Succ)
3124 return PreheaderFlushFlags();
3125
3126 MachineLoop *Loop = MLI.getLoopFor(Succ);
3127 if (!Loop)
3128 return PreheaderFlushFlags();
3129
3130 if (Loop->getLoopPreheader() == &MBB) {
3131 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3132 return Iterator->second;
3133 }
3134
3135 return PreheaderFlushFlags();
3136}
3137
3138bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
3140 return TII.mayAccessVMEMThroughFlat(MI);
3141 return SIInstrInfo::isVMEM(MI);
3142}
3143
3144bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
3145 return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
3146}
3147
3148// Check if instruction is a store to LDS that is counted via DSCNT
3149// (where that counter exists).
3150bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3151 return MI.mayStore() && SIInstrInfo::isDS(MI);
3152}
3153
3154// Return flags indicating which counters should be flushed in the preheader of
3155// the given loop. We currently decide to flush in the following situations:
3156// For VMEM (FlushVmCnt):
3157// 1. The loop contains vmem store(s), no vmem load and at least one use of a
3158// vgpr containing a value that is loaded outside of the loop. (Only on
3159// targets with no vscnt counter).
3160// 2. The loop contains vmem load(s), but the loaded values are not used in the
3161// loop, and at least one use of a vgpr containing a value that is loaded
3162// outside of the loop.
3163// For DS (FlushDsCnt, GFX12+ only):
3164// 3. The loop contains no DS reads, and at least one use of a vgpr containing
3165// a value that is DS read outside of the loop.
3166// 4. The loop contains DS read(s), loaded values are not used in the same
3167// iteration but in the next iteration (prefetch pattern), and at least one
3168// use of a vgpr containing a value that is DS read outside of the loop.
3169// Flushing in preheader reduces wait overhead if the wait requirement in
3170// iteration 1 would otherwise be more strict (but unfortunately preheader
3171// flush decision is taken before knowing that).
3172// 5. (Single-block loops only) The loop has DS prefetch reads with flush point
3173// tracking. Some DS reads may be used in the same iteration (creating
3174// "flush points"), but others remain unflushed at the backedge. When a DS
3175// read is consumed in the same iteration, it and all prior reads are
3176// "flushed" (FIFO order). No DS writes are allowed in the loop.
3177// TODO: Find a way to extend to multi-block loops.
//
// \p Brackets is only queried (hasPendingVMEM) to find registers with a load
// still pending from outside the loop.
// NOTE(review): presumably this is the bracket state at the preheader -
// confirm against the callers.
// Returns default-constructed flags whenever an early exit is taken.
3178PreheaderFlushFlags
3179SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
3180 const WaitcntBrackets &Brackets) {
3181 PreheaderFlushFlags Flags;
3182 bool HasVMemLoad = false;
3183 bool HasVMemStore = false;
3184 bool UsesVgprVMEMLoadedOutside = false;
3185 bool UsesVgprDSReadOutside = false;
3186 bool VMemInvalidated = false;
3187 // DS optimization only applies to GFX12+ where DS_CNT is separate.
3188 // Tracking status for "no DS read in loop" or "pure DS prefetch
3189 // (use only in next iteration)".
3190 bool TrackSimpleDSOpt = ST.hasExtendedWaitCounts();
3191 DenseSet<MCRegUnit> VgprUse;
3192 DenseSet<MCRegUnit> VgprDefVMEM;
3193 DenseSet<MCRegUnit> VgprDefDS;
3194
3195 // Track DS reads for prefetch pattern with flush points (single-block only).
3196 // Keeps track of the last DS read (position counted from the top of the loop)
3197 // to each VGPR. Read is considered consumed (and thus needs flushing) if
3198 // the dest register has a use or is overwritten (by any later operations).
3199 DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
3200 unsigned DSReadPosition = 0;
3201 bool IsSingleBlock = ML->getNumBlocks() == 1;
3202 bool TrackDSFlushPoint = ST.hasExtendedWaitCounts() && IsSingleBlock;
3203 unsigned LastDSFlushPosition = 0;
3204
3205 for (MachineBasicBlock *MBB : ML->blocks()) {
3206 for (MachineInstr &MI : *MBB) {
3207 if (isVMEMOrFlatVMEM(MI)) {
3208 HasVMemLoad |= MI.mayLoad();
3209 HasVMemStore |= MI.mayStore();
3210 }
3211 // TODO: Can we relax DSStore check? There may be cases where
3212 // these DS stores are drained prior to the end of MBB (or loop).
3213 if (mayStoreIncrementingDSCNT(MI)) {
3214 // Early exit if none of the optimizations are feasible.
3215 // Otherwise, set tracking status appropriately and continue.
3216 if (VMemInvalidated)
3217 return Flags;
3218 TrackSimpleDSOpt = false;
3219 TrackDSFlushPoint = false;
3220 }
 // DS reads are numbered from 1 in the order they are visited.
3221 bool IsDSRead = isDSRead(MI);
3222 if (IsDSRead)
3223 ++DSReadPosition;
3224
3225 // Helper: if RU has a pending DS read, update LastDSFlushPosition
3226 auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3227 if (!TrackDSFlushPoint)
3228 return;
3229 if (auto It = LastDSReadPositionMap.find(RU);
3230 It != LastDSReadPositionMap.end()) {
3231 // RU defined by DSRead is used or overwritten. Need to complete
3232 // the read, if not already implied by a later DSRead (to any RU)
3233 // needing to complete in FIFO order.
3234 LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
3235 }
3236 };
3237
3238 for (const MachineOperand &Op : MI.all_uses()) {
3239 if (Op.isDebug() || !TRI.isVectorRegister(MRI, Op.getReg()))
3240 continue;
3241 // Vgpr use
3242 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3243 // If we find a register that is loaded inside the loop, 1. and 2.
3244 // are invalidated.
3245 if (VgprDefVMEM.contains(RU))
3246 VMemInvalidated = true;
3247
3248 // Check for DS reads used inside the loop
3249 if (VgprDefDS.contains(RU))
3250 TrackSimpleDSOpt = false;
3251
3252 // Early exit if all optimizations are invalidated
3253 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3254 return Flags;
3255
3256 // Check for flush points (DS read used in same iteration)
3257 updateDSReadFlushTracking(RU);
3258
3259 VgprUse.insert(RU);
3260 // Check if this register has a pending VMEM load from outside the
3261 // loop (value loaded outside and used inside).
3262 VMEMID ID = toVMEMID(RU);
3263 if (Brackets.hasPendingVMEM(ID, AMDGPU::LOAD_CNT) ||
3264 Brackets.hasPendingVMEM(ID, AMDGPU::SAMPLE_CNT) ||
3265 Brackets.hasPendingVMEM(ID, AMDGPU::BVH_CNT))
3266 UsesVgprVMEMLoadedOutside = true;
3267 // Check if loaded outside the loop via DS (not VMEM/FLAT).
3268 // Only consider it a DS read if there's no pending VMEM load for
3269 // this register, since FLAT can set both counters.
3270 else if (Brackets.hasPendingVMEM(ID, AMDGPU::DS_CNT))
3271 UsesVgprDSReadOutside = true;
3272 }
3273 }
3274
3275 // VMem load vgpr def
3276 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
3277 for (const MachineOperand &Op : MI.all_defs()) {
3278 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3279 // If a register that was already used earlier in this walk is now
3280 // written by a VMEM load in the loop, 1. and 2. are invalidated.
3281 if (VgprUse.contains(RU))
3282 VMemInvalidated = true;
3283 VgprDefVMEM.insert(RU);
3284 }
3285 }
3286 // Early exit if all optimizations are invalidated
3287 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3288 return Flags;
3289 }
3290
3291 // DS read vgpr def
3292 // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RU).
3293 // If USE comes before DEF, it's the prefetch pattern (use value from
3294 // previous iteration, read for next iteration). We should still flush
3295 // in preheader so iteration 1 doesn't need to wait inside the loop.
3296 // Only invalidate when DEF comes before USE (same-iteration consumption,
3297 // checked above when processing uses).
 // With flush-point tracking enabled the defs of every instruction are
 // examined, not just those of DS reads, so overwrites are seen too.
3298 if (IsDSRead || TrackDSFlushPoint) {
3299 for (const MachineOperand &Op : MI.all_defs()) {
3300 if (!TRI.isVectorRegister(MRI, Op.getReg()))
3301 continue;
3302 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3303 // Check for overwrite of pending DS read (flush point) by any
3304 // instruction
3305 updateDSReadFlushTracking(RU);
3306 if (IsDSRead) {
3307 VgprDefDS.insert(RU);
3308 if (TrackDSFlushPoint)
3309 LastDSReadPositionMap[RU] = DSReadPosition;
3310 }
3311 }
3312 }
3313 }
3314 }
3315 }
3316
3317 // VMEM flush decision
3318 if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3319 ((!ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3320 (HasVMemLoad && ST.hasVmemWriteVgprInOrder())))
3321 Flags.FlushVmCnt = true;
3322
3323 // DS flush decision:
3324 // Simple DS Opt: flush if loop uses DS read values from outside
3325 // and either has no DS reads in the loop, or DS reads whose results
3326 // are not used in the loop.
3327 bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3328 // Prefetch with flush points: some DS reads used in same iteration,
3329 // but unflushed reads remain at backedge
 // (i.e. at least one DS read was seen after the last flush position).
3330 bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3331 bool DSFlushPointPrefetch =
3332 TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3333
3334 if (SimpleDSOpt || DSFlushPointPrefetch)
3335 Flags.FlushDsCnt = true;
3336
3337 return Flags;
3338}
3339
3340bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3341 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3342 auto &PDT =
3343 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3344 AliasAnalysis *AA = nullptr;
3345 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3346 AA = &AAR->getAAResults();
3347
3348 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3349}
3350
3351PreservedAnalyses
3354 auto &MLI = MFAM.getResult<MachineLoopAnalysis>(MF);
3355 auto &PDT = MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
3357 .getManager()
3358 .getCachedResult<AAManager>(MF.getFunction());
3359
3360 if (!SIInsertWaitcnts(MLI, PDT, AA, MF).run())
3361 return PreservedAnalyses::all();
3362
3365 .preserve<AAManager>();
3366}
3367
3368bool SIInsertWaitcnts::run() {
3370
3372
3373 // Initialize hardware limits first, as they're needed by the generators.
3374 Limits = AMDGPU::HardwareLimits(IV);
3375
3376 if (ST.hasExtendedWaitCounts()) {
3377 IsExpertMode = ST.hasExpertSchedulingMode() &&
3378 (ExpertSchedulingModeFlag.getNumOccurrences()
3380 : MF.getFunction()
3381 .getFnAttribute("amdgpu-expert-scheduling-mode")
3382 .getValueAsBool());
3383 MaxCounter = IsExpertMode ? AMDGPU::NUM_EXPERT_INST_CNTS
3385 // Initialize WCG per MF. It contains state that depends on MF attributes.
3386 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,
3387 IsExpertMode);
3388 } else {
3389 MaxCounter = AMDGPU::NUM_NORMAL_INST_CNTS;
3390 // Initialize WCG per MF. It contains state that depends on MF attributes.
3391 WCG = std::make_unique<WaitcntGeneratorPreGFX12>(
3392 MF, AMDGPU::NUM_NORMAL_INST_CNTS, Limits);
3393 }
3394
3395 SmemAccessCounter = getCounterFromEvent(HWEvents::SMEM_ACCESS);
3396
3397 bool Modified = false;
3398
3399 MachineBasicBlock &EntryBB = MF.front();
3400
3401 if (!MFI->isEntryFunction() &&
3402 !MF.getFunction().hasFnAttribute(Attribute::Naked)) {
3403 // Wait for any outstanding memory operations that the input registers may
3404 // depend on. We can't track them and it's better to do the wait after the
3405 // costly call sequence.
3406
3407 // TODO: Could insert earlier and schedule more liberally with operations
3408 // that only use caller preserved registers.
3410 while (I != EntryBB.end() && I->isMetaInstruction())
3411 ++I;
3412
3413 if (ST.hasExtendedWaitCounts()) {
3414 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
3415 .addImm(0);
3417 if (CT == AMDGPU::LOAD_CNT || CT == AMDGPU::DS_CNT ||
3418 CT == AMDGPU::STORE_CNT || CT == AMDGPU::X_CNT ||
3420 continue;
3421
3422 if (!ST.hasImageInsts() &&
3423 (CT == AMDGPU::EXP_CNT || CT == AMDGPU::SAMPLE_CNT ||
3424 CT == AMDGPU::BVH_CNT))
3425 continue;
3426
3427 BuildMI(EntryBB, I, DebugLoc(),
3428 TII.get(instrsForExtendedCounterTypes[CT]))
3429 .addImm(0);
3430 }
3431 if (IsExpertMode) {
3432 unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, ST);
3434 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3435 .addImm(Enc);
3436 }
3437 } else {
3438 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT)).addImm(0);
3439 }
3440
3441 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
3442 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3443 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3444
3445 Modified = true;
3446 }
3447
3448 // Keep iterating over the blocks in reverse post order, inserting and
3449 // updating s_waitcnt where needed, until a fix point is reached.
3450 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3451 BlockInfos.try_emplace(MBB);
3452
3453 std::unique_ptr<WaitcntBrackets> Brackets;
3454 bool Repeat;
3455 do {
3456 Repeat = false;
3457
3458 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
3459 ++BII) {
3460 MachineBasicBlock *MBB = BII->first;
3461 BlockInfo &BI = BII->second;
3462 if (!BI.Dirty)
3463 continue;
3464
3465 if (BI.Incoming) {
3466 if (!Brackets)
3467 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3468 else
3469 *Brackets = *BI.Incoming;
3470 } else {
3471 if (!Brackets) {
3472 Brackets = std::make_unique<WaitcntBrackets>(this);
3473 } else {
3474 // Reinitialize in-place. N.B. do not do this by assigning from a
3475 // temporary because the WaitcntBrackets class is large and it could
3476 // cause this function to use an unreasonable amount of stack space.
3477 Brackets->~WaitcntBrackets();
3478 new (Brackets.get()) WaitcntBrackets(this);
3479 }
3480 }
3481
3482 if (ST.hasWaitXcnt())
3483 Modified |= removeRedundantSoftXcnts(*MBB);
3484 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
3485 BI.Dirty = false;
3486
3487 if (Brackets->hasPendingEvent()) {
3488 BlockInfo *MoveBracketsToSucc = nullptr;
3489 for (MachineBasicBlock *Succ : MBB->successors()) {
3490 auto *SuccBII = BlockInfos.find(Succ);
3491 BlockInfo &SuccBI = SuccBII->second;
3492 if (!SuccBI.Incoming) {
3493 SuccBI.Dirty = true;
3494 if (SuccBII <= BII) {
3495 LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");
3496 Repeat = true;
3497 }
3498 if (!MoveBracketsToSucc) {
3499 MoveBracketsToSucc = &SuccBI;
3500 } else {
3501 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3502 }
3503 } else {
3504 LLVM_DEBUG({
3505 dbgs() << "Try to merge ";
3506 MBB->printName(dbgs());
3507 dbgs() << " into ";
3508 Succ->printName(dbgs());
3509 dbgs() << '\n';
3510 });
3511 if (SuccBI.Incoming->merge(*Brackets)) {
3512 SuccBI.Dirty = true;
3513 if (SuccBII <= BII) {
3514 LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");
3515 Repeat = true;
3516 }
3517 }
3518 }
3519 }
3520 if (MoveBracketsToSucc)
3521 MoveBracketsToSucc->Incoming = std::move(Brackets);
3522 }
3523 }
3524 } while (Repeat);
3525
3526 if (ST.hasScalarStores()) {
3527 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3528 bool HaveScalarStores = false;
3529
3530 for (MachineBasicBlock &MBB : MF) {
3531 for (MachineInstr &MI : MBB) {
3532 if (!HaveScalarStores && TII.isScalarStore(MI))
3533 HaveScalarStores = true;
3534
3535 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
3536 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3537 EndPgmBlocks.push_back(&MBB);
3538 }
3539 }
3540
3541 if (HaveScalarStores) {
3542 // If scalar writes are used, the cache must be flushed or else the next
3543 // wave to reuse the same scratch memory can be clobbered.
3544 //
3545 // Insert s_dcache_wb at wave termination points if there were any scalar
3546 // stores, and only if the cache hasn't already been flushed. This could
3547 // be improved by looking across blocks for flushes in postdominating
3548 // blocks from the stores but an explicitly requested flush is probably
3549 // very rare.
3550 for (MachineBasicBlock *MBB : EndPgmBlocks) {
3551 bool SeenDCacheWB = false;
3552
3553 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
3554 I != E; ++I) {
3555 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
3556 SeenDCacheWB = true;
3557 else if (TII.isScalarStore(*I))
3558 SeenDCacheWB = false;
3559
3560 // FIXME: It would be better to insert this before a waitcnt if any.
3561 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
3562 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3563 !SeenDCacheWB) {
3564 Modified = true;
3565 BuildMI(*MBB, I, I->getDebugLoc(), TII.get(AMDGPU::S_DCACHE_WB));
3566 }
3567 }
3568 }
3569 }
3570 }
3571
3572 if (IsExpertMode) {
3573 // Enable expert scheduling on function entry. To satisfy ABI requirements
3574 // and to allow calls between functions with different expert scheduling
3575 // settings, disable it around calls and before returns.
3576
3578 while (I != EntryBB.end() && I->isMetaInstruction())
3579 ++I;
3580 setSchedulingMode(EntryBB, I, true);
3581
3582 for (MachineInstr *MI : CallInsts) {
3583 MachineBasicBlock &MBB = *MI->getParent();
3584 setSchedulingMode(MBB, MI, false);
3585 setSchedulingMode(MBB, std::next(MI->getIterator()), true);
3586 }
3587
3588 for (MachineInstr *MI : ReturnInsts)
3589 setSchedulingMode(*MI->getParent(), MI, false);
3590
3591 Modified = true;
3592 }
3593
3594 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
3595 // This is done in different ways depending on how the VGPRs were allocated
3596 // (i.e. whether we're in dynamic VGPR mode or not).
3597 // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
3598 // waveslot limited kernel runs slower with the deallocation.
3599 if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
3600 for (auto [MI, _] : EndPgmInsts) {
3601 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3602 TII.get(AMDGPU::S_ALLOC_VGPR))
3603 .addImm(0);
3604 Modified = true;
3605 }
3606 } else if (!WCG->isOptNone() &&
3607 ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
3608 (MF.getFrameInfo().hasCalls() ||
3609 ST.getOccupancyWithNumVGPRs(
3610 TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),
3611 /*IsDynamicVGPR=*/false) <
3613 for (auto [MI, Flag] : EndPgmInsts) {
3614 if (Flag) {
3615 if (ST.requiresNopBeforeDeallocVGPRs()) {
3616 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3617 TII.get(AMDGPU::S_NOP))
3618 .addImm(0);
3619 }
3620 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3621 TII.get(AMDGPU::S_SENDMSG))
3623 Modified = true;
3624 }
3625 }
3626 }
3627
3628 return Modified;
3629}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
#define _
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
AMDGPU::HWEvents HWEvents
Provides some synthesis utilities to produce sequences of values.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
Bit mask of hardware events.
constexpr unsigned size() const
constexpr bool contains(HWEvents Other) const
constexpr bool any() const
unsigned get(InstCounterType T) const
void set(InstCounterType T, unsigned Val)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:275
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:223
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:299
bool erase(const KeyT &Val)
Definition DenseMap.h:377
iterator end()
Definition DenseMap.h:141
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:284
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:758
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:723
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as:
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of passes that operate on the MachineFunction representation.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
mop_range operands()
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
iterator begin()
Definition MapVector.h:67
iterator find(const KeyT &Key)
Definition MapVector.h:156
iterator end()
Definition MapVector.h:69
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
Definition Pass.cpp:140
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isXcntDrain(const MachineInstr &MI)
True if MI implicitly drains XCNT.
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool usesTENSOR_CNT(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool usesASYNC_CNT(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
void push_back(const T &Elt)
Target - Wrapper for Target specific information.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:209
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:182
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned getMaxWavesPerEU(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getHasMatrixScale(unsigned Opc)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded)
HWEvents getSimplifiedVMEMEventsFor(const MachineInstr &Inst, const SIInstrInfo &TII)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
std::optional< AMDGPU::InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is waiting on.
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
HWEvents getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST, bool IsExpertMode)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
constexpr bool isMaybeAtomic(const T &...O)
Definition SIDefines.h:315
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
bool empty() const
Definition BasicBlock.h:101
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
Definition Sequence.h:325
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Wait
Definition Threading.h:60
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2313
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition STLExtras.h:633
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
Definition InstrProf.h:143
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ Async
"Asynchronous" unwind tables (instr precise)
Definition CodeGen.h:157
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
Definition MCRegister.h:21
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Count
Definition InstrProf.h:145
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
MCRegisterClass TargetRegisterClass
Definition FastISel.h:58
#define N
static constexpr ValueType Default
static constexpr uint64_t encode(Fields... Values)
Represents the hardware counter limits for different wait count types.
Instruction set architecture version.