1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39#include "llvm/IR/Dominators.h"
43
44using namespace llvm;
45
46#define DEBUG_TYPE "si-insert-waitcnts"
47
48DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
49 "Force emit s_waitcnt expcnt(0) instrs");
50DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
51 "Force emit s_waitcnt lgkmcnt(0) instrs");
52DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
53 "Force emit s_waitcnt vmcnt(0) instrs");
54
55static cl::opt<bool>
56 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
57 cl::desc("Force all waitcnt instrs to be emitted as "
58 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
59 cl::init(false), cl::Hidden);
60
62 "amdgpu-waitcnt-load-forcezero",
63 cl::desc("Force all waitcnt load counters to wait until 0"),
64 cl::init(false), cl::Hidden);
65
67 "amdgpu-expert-scheduling-mode",
68 cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
69 cl::init(false), cl::Hidden);
70
71namespace {
72// Class of object that encapsulates latest instruction counter score
73// associated with the operand. Used for determining whether
74 // an s_waitcnt instruction needs to be emitted.
75
76enum InstCounterType {
77 LOAD_CNT = 0, // VMcnt prior to gfx12.
78 DS_CNT, // LGKMcnt prior to gfx12.
79 EXP_CNT, //
80 STORE_CNT, // VScnt in gfx10/gfx11.
81 NUM_NORMAL_INST_CNTS,
82 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
83 BVH_CNT, // gfx12+ only.
84 KM_CNT, // gfx12+ only.
85 X_CNT, // gfx1250.
86 NUM_EXTENDED_INST_CNTS,
87 VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only.
88 VM_VSRC, // gfx12+ expert mode only.
89 NUM_EXPERT_INST_CNTS,
90 NUM_INST_CNTS = NUM_EXPERT_INST_CNTS
91};
92} // namespace
93
94namespace llvm {
95template <> struct enum_iteration_traits<InstCounterType> {
96 static constexpr bool is_iterable = true;
97};
98} // namespace llvm
99
100namespace {
101// Return an iterator over all counters between LOAD_CNT (the first counter)
102// and \c MaxCounter (exclusive, default value yields an enumeration over
103// all counters).
104auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
105 return enum_seq(LOAD_CNT, MaxCounter);
106}
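// Usage sketch (illustrative only):
//
//   // Iterate just the counters that exist in "normal" (pre-gfx12) mode:
//   for (InstCounterType T : inst_counter_types(NUM_NORMAL_INST_CNTS)) {
//     // T takes the values LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT in order.
//   }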
107
108// Get the maximum wait count value for a given counter type.
109static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
110 InstCounterType T) {
111 switch (T) {
112 case LOAD_CNT:
113 return Limits.LoadcntMax;
114 case DS_CNT:
115 return Limits.DscntMax;
116 case EXP_CNT:
117 return Limits.ExpcntMax;
118 case STORE_CNT:
119 return Limits.StorecntMax;
120 case SAMPLE_CNT:
121 return Limits.SamplecntMax;
122 case BVH_CNT:
123 return Limits.BvhcntMax;
124 case KM_CNT:
125 return Limits.KmcntMax;
126 case X_CNT:
127 return Limits.XcntMax;
128 case VA_VDST:
129 return Limits.VaVdstMax;
130 case VM_VSRC:
131 return Limits.VmVsrcMax;
132 default:
133 return 0;
134 }
135}
136
137/// Integer IDs used to track vector memory locations we may have to wait on.
138/// Encoded as u16 chunks:
139///
140/// [0, REGUNITS_END ): MCRegUnit
141/// [LDSDMA_BEGIN, LDSDMA_END ) : LDS DMA IDs
142///
143/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
144/// It gives (2 << 16) - 1 entries per category which is more than enough
145/// for all register units. MCPhysReg is u16 so we don't even support >u16
146/// physical register numbers at this time, let alone >u16 register units.
147/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
148/// is enough for all register units.
149using VMEMID = uint32_t;
150
151enum : VMEMID {
152 TRACKINGID_RANGE_LEN = (1 << 16),
153
154 // Important: MCRegUnits must always be tracked starting from 0, as we
155 // need to be able to convert between a MCRegUnit and a VMEMID freely.
156 REGUNITS_BEGIN = 0,
157 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
158
159 // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
160 // entry, which is updated for all LDS DMA operations encountered.
161 // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
162 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
163 LDSDMA_BEGIN = REGUNITS_END,
164 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
165};
166
167/// Convert a MCRegUnit to a VMEMID.
168static constexpr VMEMID toVMEMID(MCRegUnit RU) {
169 return static_cast<unsigned>(RU);
170}
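// Sanity checks on the encoding above (purely illustrative; these follow
// directly from the enum definitions):
static_assert(REGUNITS_BEGIN == 0,
              "MCRegUnit values must convert to VMEMIDs unchanged");
static_assert(LDSDMA_BEGIN == REGUNITS_END,
              "LDS DMA IDs occupy the u16 chunk right after register units");
static_assert(LDSDMA_END - LDSDMA_BEGIN == NUM_LDSDMA,
              "each category spans exactly one u16 chunk");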
171
172#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
173 DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \
174 DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
175 DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
176 DECL(GLOBAL_INV_ACCESS) /* GLOBAL_INV (gfx12+ only) */ \
177 DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
178 DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
179 DECL(VMEM_GROUP) /* vmem group */ \
180 DECL(LDS_ACCESS) /* lds read & write */ \
181 DECL(GDS_ACCESS) /* gds read & write */ \
182 DECL(SQ_MESSAGE) /* send message */ \
183 DECL(SCC_WRITE) /* write to SCC from barrier */ \
184 DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
185 DECL(SMEM_GROUP) /* scalar-memory group */ \
186 DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
187 DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
188 DECL(EXP_POS_ACCESS) /* write to export position */ \
189 DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \
190 DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \
191 DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */ \
192 DECL(VGPR_CSMACC_WRITE) /* write VGPR dest in Core/Side-MACC VALU */ \
193 DECL(VGPR_DPMACC_WRITE) /* write VGPR dest in DPMACC VALU */ \
194 DECL(VGPR_TRANS_WRITE) /* write VGPR dest in TRANS VALU */ \
195 DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */ \
196 DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \
197 DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \
198 DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */
199
200// clang-format off
201#define AMDGPU_EVENT_ENUM(Name) Name,
202enum WaitEventType {
203 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
204 NUM_WAIT_EVENTS
205};
206#undef AMDGPU_EVENT_ENUM
207
208#define AMDGPU_EVENT_NAME(Name) #Name,
209static constexpr StringLiteral WaitEventTypeName[] = {
210 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
211};
212#undef AMDGPU_EVENT_NAME
213// clang-format on
214
215// Enumerate different types of result-returning VMEM operations. Although
216// s_waitcnt orders them all with a single vmcnt counter, in the absence of
217// s_waitcnt only instructions of the same VmemType are guaranteed to write
218// their results in order -- so there is no need to insert an s_waitcnt between
219// two instructions of the same type that write the same vgpr.
220enum VmemType {
221 // BUF instructions and MIMG instructions without a sampler.
222 VMEM_NOSAMPLER,
223 // MIMG instructions with a sampler.
224 VMEM_SAMPLER,
225 // BVH instructions
226 VMEM_BVH,
227 NUM_VMEM_TYPES
228};
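// Worked example (illustrative): two buffer loads (both VMEM_NOSAMPLER) that
// write the same vgpr complete in issue order, so no s_waitcnt is needed
// between them; a buffer load followed by an image-sample load (VMEM_SAMPLER)
// into the same vgpr does need a wait, since their results may return out of
// order with respect to each other.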
229
230// Maps values of InstCounterType to the instruction that waits on that
231// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
232// returns true, and does not cover VA_VDST or VM_VSRC.
233static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
234 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
235 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
236 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
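// Lookup sketch (illustrative; the pass indexes this table directly):
//
//   unsigned Opcode = instrsForExtendedCounterTypes[KM_CNT];
//   // Opcode == AMDGPU::S_WAIT_KMCNT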
237
238static bool updateVMCntOnly(const MachineInstr &Inst) {
239 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
240 SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst);
241}
242
243#ifndef NDEBUG
244static bool isNormalMode(InstCounterType MaxCounter) {
245 return MaxCounter == NUM_NORMAL_INST_CNTS;
246}
247#endif // NDEBUG
248
249VmemType getVmemType(const MachineInstr &Inst) {
250 assert(updateVMCntOnly(Inst));
251 if (!SIInstrInfo::isImage(Inst))
252 return VMEM_NOSAMPLER;
253 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
254 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
255 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
256
257 if (BaseInfo->BVH)
258 return VMEM_BVH;
259
260 // We have to make an additional check for isVSAMPLE here since some
261 // instructions don't have a sampler, but are still classified as sampler
262 // instructions for the purposes of e.g. waitcnt.
263 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
264 return VMEM_SAMPLER;
265
266 return VMEM_NOSAMPLER;
267}
268
269unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
270 switch (T) {
271 case LOAD_CNT:
272 return Wait.LoadCnt;
273 case EXP_CNT:
274 return Wait.ExpCnt;
275 case DS_CNT:
276 return Wait.DsCnt;
277 case STORE_CNT:
278 return Wait.StoreCnt;
279 case SAMPLE_CNT:
280 return Wait.SampleCnt;
281 case BVH_CNT:
282 return Wait.BvhCnt;
283 case KM_CNT:
284 return Wait.KmCnt;
285 case X_CNT:
286 return Wait.XCnt;
287 case VA_VDST:
288 return Wait.VaVdst;
289 case VM_VSRC:
290 return Wait.VmVsrc;
291 default:
292 llvm_unreachable("bad InstCounterType");
293 }
294}
295
296void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
297 unsigned &WC = getCounterRef(Wait, T);
298 WC = std::min(WC, Count);
299}
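// Example (illustrative): addWait keeps the strictest requirement. Since all
// counters in a default-constructed AMDGPU::Waitcnt are "no wait" (~0u):
//
//   AMDGPU::Waitcnt W;
//   addWait(W, LOAD_CNT, 3); // W.LoadCnt == 3
//   addWait(W, LOAD_CNT, 1); // W.LoadCnt == 1 (the smaller, stricter value)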
300
301void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
302 getCounterRef(Wait, T) = ~0u;
303}
304
305unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
306 return getCounterRef(Wait, T);
307}
308
309// Mapping from event to counter according to the table masks.
310InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
311 for (auto T : inst_counter_types()) {
312 if (masks[T] & (1 << E))
313 return T;
314 }
315 llvm_unreachable("event type has no associated counter");
316}
317
318class WaitcntBrackets;
319
320// This abstracts the logic for generating and updating S_WAIT* instructions
321// away from the analysis that determines where they are needed. This was
322// done because the set of counters and instructions for waiting on them
323// underwent a major shift with gfx12, sufficiently so that having this
324// abstraction allows the main analysis logic to be simpler than it would
325// otherwise have had to become.
326class WaitcntGenerator {
327protected:
328 const GCNSubtarget *ST = nullptr;
329 const SIInstrInfo *TII = nullptr;
330 AMDGPU::IsaVersion IV;
331 InstCounterType MaxCounter;
332 bool OptNone;
333 bool ExpandWaitcntProfiling = false;
334 const AMDGPU::HardwareLimits *Limits = nullptr;
335
336public:
337 WaitcntGenerator() = default;
338 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
339 const AMDGPU::HardwareLimits *Limits)
340 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
341 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
342 OptNone(MF.getFunction().hasOptNone() ||
343 MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
344 ExpandWaitcntProfiling(
345 MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
346 Limits(Limits) {}
347
348 // Return true if the current function should be compiled with no
349 // optimization.
350 bool isOptNone() const { return OptNone; }
351
352 const AMDGPU::HardwareLimits &getLimits() const { return *Limits; }
353
354 // Edits an existing sequence of wait count instructions according
355 // to an incoming Waitcnt value, which is itself updated to reflect
356 // any new wait count instructions which may need to be generated by
357 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
358 // were made.
359 //
360 // This editing will usually just update operands, but it may also
361 // delete instructions if the incoming Wait value indicates they are not
362 // needed. It may also remove existing instructions for which a wait
363 // is needed if it can be determined that it is better to generate new
364 // instructions later, as can happen on gfx12.
365 virtual bool
366 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
367 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
368 MachineBasicBlock::instr_iterator It) const = 0;
369
370 // Transform a soft waitcnt into a normal one.
371 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
372
373 // Generates new wait count instructions according to the value of
374 // Wait, returning true if any new instructions were created.
375 // If ScoreBrackets is provided, it can be used for profiling expansion.
376 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
377 MachineBasicBlock::instr_iterator It,
378 AMDGPU::Waitcnt Wait,
379 WaitcntBrackets *ScoreBrackets = nullptr) = 0;
380
381 // Returns an array of bit masks which can be used to map values in
382 // WaitEventType to corresponding counter values in InstCounterType.
383 virtual const unsigned *getWaitEventMask() const = 0;
384
385 // Returns a new waitcnt with all counters except VScnt set to 0. If
386 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
387 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
388
389 virtual ~WaitcntGenerator() = default;
390
391 // Create a mask value from the initializer list of wait event types.
392 static constexpr unsigned
393 eventMask(std::initializer_list<WaitEventType> Events) {
394 unsigned Mask = 0;
395 for (auto &E : Events)
396 Mask |= 1 << E;
397
398 return Mask;
399 }
400};
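// Illustrative check (follows from the definition of eventMask above):
static_assert(WaitcntGenerator::eventMask({LDS_ACCESS, GDS_ACCESS}) ==
                  ((1u << LDS_ACCESS) | (1u << GDS_ACCESS)),
              "eventMask sets exactly one bit per listed event");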
401
402class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
403public:
404 WaitcntGeneratorPreGFX12() = default;
405 WaitcntGeneratorPreGFX12(const MachineFunction &MF,
406 const AMDGPU::HardwareLimits *Limits)
407 : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS, Limits) {}
408
409 bool
410 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
411 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
412 MachineBasicBlock::instr_iterator It) const override;
413
414 bool createNewWaitcnt(MachineBasicBlock &Block,
415 MachineBasicBlock::instr_iterator It,
416 AMDGPU::Waitcnt Wait,
417 WaitcntBrackets *ScoreBrackets = nullptr) override;
418
419 const unsigned *getWaitEventMask() const override {
420 assert(ST);
421
422 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
423 eventMask(
424 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
425 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
426 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
427 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
428 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
429 0,
430 0,
431 0,
432 0,
433 0,
434 0};
435
436 return WaitEventMaskForInstPreGFX12;
437 }
438
439 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
440};
441
442class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
443protected:
444 bool IsExpertMode;
445
446public:
447 WaitcntGeneratorGFX12Plus() = default;
448 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
449 InstCounterType MaxCounter,
450 const AMDGPU::HardwareLimits *Limits,
451 bool IsExpertMode)
452 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
453
454 bool
455 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
456 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
457 MachineBasicBlock::instr_iterator It) const override;
458
459 bool createNewWaitcnt(MachineBasicBlock &Block,
460 MachineBasicBlock::instr_iterator It,
461 AMDGPU::Waitcnt Wait,
462 WaitcntBrackets *ScoreBrackets = nullptr) override;
463
464 const unsigned *getWaitEventMask() const override {
465 assert(ST);
466
467 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
468 eventMask({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
469 eventMask({LDS_ACCESS, GDS_ACCESS}),
470 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
471 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
472 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
473 eventMask({VMEM_SAMPLER_READ_ACCESS}),
474 eventMask({VMEM_BVH_READ_ACCESS}),
475 eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
476 eventMask({VMEM_GROUP, SMEM_GROUP}),
477 eventMask({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
478 VGPR_XDL_WRITE}),
479 eventMask({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
480
481 return WaitEventMaskForInstGFX12Plus;
482 }
483
484 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
485};
486
487class SIInsertWaitcnts {
488public:
489 const GCNSubtarget *ST;
490 const SIInstrInfo *TII = nullptr;
491 const SIRegisterInfo *TRI = nullptr;
492 const MachineRegisterInfo *MRI = nullptr;
493 InstCounterType SmemAccessCounter;
494 InstCounterType MaxCounter;
495 bool IsExpertMode = false;
496 const unsigned *WaitEventMaskForInst;
497
498private:
499 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
500 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
501 MachineLoopInfo *MLI;
502 MachinePostDominatorTree *PDT;
503 AliasAnalysis *AA = nullptr;
504
505 struct BlockInfo {
506 std::unique_ptr<WaitcntBrackets> Incoming;
507 bool Dirty = true;
508 };
509
510 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
511
512 bool ForceEmitWaitcnt[NUM_INST_CNTS];
513
514 // In any given run of this pass, WCG will point to one of these two
515 // generator objects, which must have been re-initialised before use
516 // from a value made using a subtarget constructor.
517 WaitcntGeneratorPreGFX12 WCGPreGFX12;
518 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
519
520 WaitcntGenerator *WCG = nullptr;
521
522 // Remember call and return instructions in the function.
523 DenseSet<MachineInstr *> CallInsts;
524 DenseSet<MachineInstr *> ReturnInsts;
525
526 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
527 // message.
528 DenseSet<MachineInstr *> ReleaseVGPRInsts;
529
530 AMDGPU::HardwareLimits Limits;
531
532public:
533 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
534 AliasAnalysis *AA)
535 : MLI(MLI), PDT(PDT), AA(AA) {
536 (void)ForceExpCounter;
537 (void)ForceLgkmCounter;
538 (void)ForceVMCounter;
539 }
540
541 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
542
543 bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
544 bool isPreheaderToFlush(MachineBasicBlock &MBB,
545 const WaitcntBrackets &ScoreBrackets);
546 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
547 bool run(MachineFunction &MF);
548
549 void setForceEmitWaitcnt() {
550// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
551// For debug builds, get the debug counter info and adjust if need be
552#ifndef NDEBUG
553 if (DebugCounter::isCounterSet(ForceExpCounter) &&
554 DebugCounter::shouldExecute(ForceExpCounter)) {
555 ForceEmitWaitcnt[EXP_CNT] = true;
556 } else {
557 ForceEmitWaitcnt[EXP_CNT] = false;
558 }
559
560 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
561 DebugCounter::shouldExecute(ForceLgkmCounter)) {
562 ForceEmitWaitcnt[DS_CNT] = true;
563 ForceEmitWaitcnt[KM_CNT] = true;
564 } else {
565 ForceEmitWaitcnt[DS_CNT] = false;
566 ForceEmitWaitcnt[KM_CNT] = false;
567 }
568
569 if (DebugCounter::isCounterSet(ForceVMCounter) &&
570 DebugCounter::shouldExecute(ForceVMCounter)) {
571 ForceEmitWaitcnt[LOAD_CNT] = true;
572 ForceEmitWaitcnt[SAMPLE_CNT] = true;
573 ForceEmitWaitcnt[BVH_CNT] = true;
574 } else {
575 ForceEmitWaitcnt[LOAD_CNT] = false;
576 ForceEmitWaitcnt[SAMPLE_CNT] = false;
577 ForceEmitWaitcnt[BVH_CNT] = false;
578 }
579
580 ForceEmitWaitcnt[VA_VDST] = false;
581 ForceEmitWaitcnt[VM_VSRC] = false;
582#endif // NDEBUG
583 }
584
585 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
586 // instruction.
587 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
588 switch (Inst.getOpcode()) {
589 // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
590 case AMDGPU::GLOBAL_INV:
591 return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
592 // VGPRs
593 case AMDGPU::GLOBAL_WB:
594 case AMDGPU::GLOBAL_WBINV:
595 return VMEM_WRITE_ACCESS; // tracked using storecnt
596 default:
597 break;
598 }
599
600 // Maps VMEM access types to their corresponding WaitEventType.
601 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
602 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
603
605 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
606 // these should use VM_CNT.
607 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
608 return VMEM_ACCESS;
609 if (Inst.mayStore() &&
610 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
611 if (TII->mayAccessScratch(Inst))
612 return SCRATCH_WRITE_ACCESS;
613 return VMEM_WRITE_ACCESS;
614 }
615 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
616 return VMEM_ACCESS;
617 return VmemReadMapping[getVmemType(Inst)];
618 }
619
620 std::optional<WaitEventType>
621 getExpertSchedulingEventType(const MachineInstr &Inst) const;
622
623 bool isVmemAccess(const MachineInstr &MI) const;
624 bool generateWaitcntInstBefore(MachineInstr &MI,
625 WaitcntBrackets &ScoreBrackets,
626 MachineInstr *OldWaitcntInstr,
627 bool FlushVmCnt);
628 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
629 MachineBasicBlock::instr_iterator It,
630 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
631 MachineInstr *OldWaitcntInstr);
632 void updateEventWaitcntAfter(MachineInstr &Inst,
633 WaitcntBrackets *ScoreBrackets);
634 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
635 MachineBasicBlock *Block) const;
636 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
637 WaitcntBrackets &ScoreBrackets);
638 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
639 WaitcntBrackets &ScoreBrackets);
640 void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
641 bool ExpertMode) const;
642};
643
644// This object maintains the current score brackets of each wait counter, and
645// a per-register scoreboard for each wait counter.
646//
647// We also maintain the latest score for every event type that can change the
648// waitcnt in order to know if there are multiple types of events within
649// the brackets. When multiple types of event happen in the bracket,
650// wait count may get decreased out of order, therefore we need to put in
651// "s_waitcnt 0" before use.
652class WaitcntBrackets {
653public:
654 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
655 assert(Context->TRI->getNumRegUnits() < REGUNITS_END);
656 }
657
658#ifndef NDEBUG
659 ~WaitcntBrackets() {
660 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
661 for (auto &[ID, Val] : VMem) {
662 if (Val.empty())
663 ++NumUnusedVmem;
664 }
665 for (auto &[ID, Val] : SGPRs) {
666 if (Val.empty())
667 ++NumUnusedSGPRs;
668 }
669
670 if (NumUnusedVmem || NumUnusedSGPRs) {
671 errs() << "WaitcntBracket had unused entries at destruction time: "
672 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
673 << " SGPR unused entries\n";
674 std::abort();
675 }
676 }
677#endif
678
679 bool isSmemCounter(InstCounterType T) const {
680 return T == Context->SmemAccessCounter || T == X_CNT;
681 }
682
683 unsigned getSgprScoresIdx(InstCounterType T) const {
684 assert(isSmemCounter(T) && "Invalid SMEM counter");
685 return T == X_CNT ? 1 : 0;
686 }
687
688 unsigned getScoreLB(InstCounterType T) const {
689 assert(T < NUM_INST_CNTS);
690 return ScoreLBs[T];
691 }
692
693 unsigned getScoreUB(InstCounterType T) const {
694 assert(T < NUM_INST_CNTS);
695 return ScoreUBs[T];
696 }
697
698 unsigned getScoreRange(InstCounterType T) const {
699 return getScoreUB(T) - getScoreLB(T);
700 }
701
702 unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
703 auto It = SGPRs.find(RU);
704 return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0;
705 }
706
707 unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
708 auto It = VMem.find(TID);
709 return It != VMem.end() ? It->second.Scores[T] : 0;
710 }
711
712 bool merge(const WaitcntBrackets &Other);
713
714 bool counterOutOfOrder(InstCounterType T) const;
715 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
716 simplifyWaitcnt(Wait, Wait);
717 }
718 void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
719 AMDGPU::Waitcnt &UpdateWait) const;
720 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
721 void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
722 AMDGPU::Waitcnt &UpdateWait) const;
723 void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
724 AMDGPU::Waitcnt &UpdateWait) const;
725
726 void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
727 AMDGPU::Waitcnt &Wait) const;
728 void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
729 AMDGPU::Waitcnt &Wait) const;
730 void tryClearSCCWriteEvent(MachineInstr *Inst);
731
732 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
733 void applyWaitcnt(InstCounterType T, unsigned Count);
734 void updateByEvent(WaitEventType E, MachineInstr &MI);
735
736 unsigned hasPendingEvent() const { return PendingEvents; }
737 unsigned hasPendingEvent(WaitEventType E) const {
738 return PendingEvents & (1 << E);
739 }
740 unsigned hasPendingEvent(InstCounterType T) const {
741 unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
742 assert((HasPending != 0) == (getScoreRange(T) != 0));
743 return HasPending;
744 }
745
746 bool hasMixedPendingEvents(InstCounterType T) const {
747 unsigned Events = hasPendingEvent(T);
748 // Return true if more than one bit is set in Events.
749 return Events & (Events - 1);
750 }
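  // Worked example for the check above (illustrative): Events = 0b0110 (two
  // event types pending) gives 0b0110 & 0b0101 = 0b0100 != 0, i.e. "mixed";
  // a single pending type such as Events = 0b0100 gives 0b0100 & 0b0011 = 0.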
751
752 bool hasPendingFlat() const {
753 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
754 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
755 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
756 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
757 }
758
759 void setPendingFlat() {
760 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
761 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
762 }
763
764 bool hasPendingGDS() const {
765 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
766 }
767
768 unsigned getPendingGDSWait() const {
769 return std::min(getScoreUB(DS_CNT) - LastGDS,
770 getWaitCountMax(Context->getLimits(), DS_CNT) - 1);
771 }
772
773 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
774
775 // Return true if there might be pending writes to any register unit of
776 // \p Reg by VMEM instructions with types different from V.
777 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
778 for (MCRegUnit RU : regunits(Reg)) {
779 auto It = VMem.find(toVMEMID(RU));
780 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
781 return true;
782 }
783 return false;
784 }
785
786 void clearVgprVmemTypes(MCPhysReg Reg) {
787 for (MCRegUnit RU : regunits(Reg)) {
788 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
789 It->second.VMEMTypes = 0;
790 if (It->second.empty())
791 VMem.erase(It);
792 }
793 }
794 }
795
796 void setStateOnFunctionEntryOrReturn() {
797 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) +
798 getWaitCountMax(Context->getLimits(), STORE_CNT));
799 PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
800 }
801
802 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
803 return LDSDMAStores;
804 }
805
806 bool hasPointSampleAccel(const MachineInstr &MI) const;
807 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
808 MCPhysReg Reg) const;
809
810 void print(raw_ostream &) const;
811 void dump() const { print(dbgs()); }
812
813 // Free up memory by removing empty entries from the DenseMaps that track
814 // scores.
815 void purgeEmptyTrackingData();
816
817private:
818 struct MergeInfo {
819 unsigned OldLB;
820 unsigned OtherLB;
821 unsigned MyShift;
822 unsigned OtherShift;
823 };
824
825 void determineWaitForScore(InstCounterType T, unsigned Score,
826 AMDGPU::Waitcnt &Wait) const;
827
828 static bool mergeScore(const MergeInfo &M, unsigned &Score,
829 unsigned OtherScore);
830
831 iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const {
832 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
833 if (!Context->TRI->isInAllocatableClass(Reg))
834 return {{}, {}};
835 const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
836 unsigned Size = Context->TRI->getRegSizeInBits(*RC);
837 if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
838 Reg = Context->TRI->get32BitRegister(Reg);
839 return Context->TRI->regunits(Reg);
840 }
841
842 void setScoreLB(InstCounterType T, unsigned Val) {
843 assert(T < NUM_INST_CNTS);
844 ScoreLBs[T] = Val;
845 }
846
847 void setScoreUB(InstCounterType T, unsigned Val) {
848 assert(T < NUM_INST_CNTS);
849 ScoreUBs[T] = Val;
850
851 if (T != EXP_CNT)
852 return;
853
854 if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT))
855 ScoreLBs[EXP_CNT] =
856 ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT);
857 }
858
859 void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
860 const SIRegisterInfo *TRI = Context->TRI;
861 if (Reg == AMDGPU::SCC) {
862 SCCScore = Val;
863 } else if (TRI->isVectorRegister(*Context->MRI, Reg)) {
864 for (MCRegUnit RU : regunits(Reg))
865 VMem[toVMEMID(RU)].Scores[T] = Val;
866 } else if (TRI->isSGPRReg(*Context->MRI, Reg)) {
867 auto STy = getSgprScoresIdx(T);
868 for (MCRegUnit RU : regunits(Reg))
869 SGPRs[RU].Scores[STy] = Val;
870 } else {
871 llvm_unreachable("Register cannot be tracked/unknown register!");
872 }
873 }
874
875 void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
876 VMem[TID].Scores[T] = Val;
877 }
878
879 void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,
880 unsigned Val);
881
882 const SIInsertWaitcnts *Context;
883
884 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
885 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
886 unsigned PendingEvents = 0;
887 // Remember the last flat memory operation.
888 unsigned LastFlat[NUM_INST_CNTS] = {0};
889 // Remember the last GDS operation.
890 unsigned LastGDS = 0;
891
892 // The score tracking logic is fragmented as follows:
893 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
894 // - SGPRs: SGPR RegUnits
895 // - SCC: Non-allocatable and not general purpose: not a SGPR.
896 //
897 // For the VMem case, if the key is within the range of LDS DMA IDs,
898 // then the corresponding index into the `LDSDMAStores` vector below is:
899 // Key - LDSDMA_BEGIN - 1
900 // This is because LDSDMA_BEGIN is a generic entry and does not have an
901 // associated MachineInstr.
902 //
903 // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?
904
905 struct VMEMInfo {
906 // Scores for all instruction counters.
907 std::array<unsigned, NUM_INST_CNTS> Scores = {0};
908 // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
909 unsigned VMEMTypes = 0;
910
911 bool empty() const {
912 return all_of(Scores, [](unsigned K) { return K == 0; }) && !VMEMTypes;
913 }
914 };
915
916 struct SGPRInfo {
917 // Wait cnt scores for every SGPR; only DS_CNT (corresponding to LGKMcnt
918 // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
919 // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps
920 // the X_CNT score.
921 std::array<unsigned, 2> Scores = {0};
922
923 bool empty() const { return !Scores[0] && !Scores[1]; }
924 };
925
926 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
927 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
928
929 // Reg score for SCC.
930 unsigned SCCScore = 0;
931 // The unique instruction that has an SCC write pending, if there is one.
932 const MachineInstr *PendingSCCWrite = nullptr;
933
934 // Store representative LDS DMA operations. The only useful info here is
935 // alias info. One store is kept per unique AAInfo.
936 SmallVector<const MachineInstr *> LDSDMAStores;
937};
938
939class SIInsertWaitcntsLegacy : public MachineFunctionPass {
940public:
941 static char ID;
942 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
943
944 bool runOnMachineFunction(MachineFunction &MF) override;
945
946 StringRef getPassName() const override {
947 return "SI insert wait instructions";
948 }
949
950 void getAnalysisUsage(AnalysisUsage &AU) const override {
951 AU.setPreservesCFG();
952 AU.addRequired<MachineLoopInfoWrapperPass>();
953 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
954 AU.addUsedIfAvailable<AAResultsWrapperPass>();
955 AU.addPreserved<AAResultsWrapperPass>();
956 MachineFunctionPass::getAnalysisUsage(AU);
957 }
958};
959
960} // end anonymous namespace
961
962void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
963 InstCounterType CntTy, unsigned Score) {
964 setRegScore(Op.getReg().asMCReg(), CntTy, Score);
965}
966
967// Return true if the subtarget is one that enables Point Sample Acceleration
968// and the MachineInstr passed in is one to which it might be applied (the
969// hardware makes this decision based on several factors, but we can't determine
970// this at compile time, so we have to assume it might be applied if the
971// instruction supports it).
972bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
973 if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
974 return false;
975
976 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
977 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
978 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
979 return BaseInfo->PointSampleAccel;
980}
981
982// Return true if the subtarget enables Point Sample Acceleration, the supplied
983// MachineInstr is one to which it might be applied and the supplied interval is
984// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
985// (this is the type that a point sample accelerated instruction effectively
986// becomes)
987bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
988 MCPhysReg Reg) const {
989 if (!hasPointSampleAccel(MI))
990 return false;
991
992 return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
993}
994
995void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
996 InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
997 assert(T < Context->MaxCounter);
998
999 unsigned UB = getScoreUB(T);
1000 unsigned CurrScore = UB + 1;
1001 if (CurrScore == 0)
1002 report_fatal_error("InsertWaitcnt score wraparound");
1003 // PendingEvents and ScoreUB need to be updated regardless of whether this
1004 // event changes the score of a register or not. Examples include vm_cnt
1005 // for a buffer store or lgkm_cnt for a send-message.
1006 PendingEvents |= 1 << E;
1007 setScoreUB(T, CurrScore);
1008
1009 const SIRegisterInfo *TRI = Context->TRI;
1010 const MachineRegisterInfo *MRI = Context->MRI;
1011 const SIInstrInfo *TII = Context->TII;
1012
1013 if (T == EXP_CNT) {
1014 // Put score on the source vgprs. If this is a store, just use those
1015 // specific register(s).
1016 if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
1017 // All GDS operations must protect their address register (same as
1018 // export.)
1019 if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
1020 setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);
1021
1022 if (Inst.mayStore()) {
1023 if (const auto *Data0 =
1024 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
1025 setScoreByOperand(*Data0, EXP_CNT, CurrScore);
1026 if (const auto *Data1 =
1027 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
1028 setScoreByOperand(*Data1, EXP_CNT, CurrScore);
1029 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
1030 Inst.getOpcode() != AMDGPU::DS_APPEND &&
1031 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
1032 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
1033 for (const MachineOperand &Op : Inst.all_uses()) {
1034 if (TRI->isVectorRegister(*MRI, Op.getReg()))
1035 setScoreByOperand(Op, EXP_CNT, CurrScore);
1036 }
1037 }
1038 } else if (TII->isFLAT(Inst)) {
1039 if (Inst.mayStore()) {
1040 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1041 EXP_CNT, CurrScore);
1042 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1043 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1044 EXP_CNT, CurrScore);
1045 }
1046 } else if (TII->isMIMG(Inst)) {
1047 if (Inst.mayStore()) {
1048 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
1049 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1050 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1051 EXP_CNT, CurrScore);
1052 }
1053 } else if (TII->isMTBUF(Inst)) {
1054 if (Inst.mayStore())
1055 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
1056 } else if (TII->isMUBUF(Inst)) {
1057 if (Inst.mayStore()) {
1058 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
1059 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1060 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1061 EXP_CNT, CurrScore);
1062 }
1063 } else if (TII->isLDSDIR(Inst)) {
1064 // LDSDIR instructions attach the score to the destination.
1065 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
1066 EXP_CNT, CurrScore);
1067 } else {
1068 if (TII->isEXP(Inst)) {
1069 // For export the destination registers are really temps that
1070 // can be used as the actual source after export patching, so
1071 // we need to treat them like sources and set the EXP_CNT
1072 // score.
1073 for (MachineOperand &DefMO : Inst.all_defs()) {
1074 if (TRI->isVGPR(*MRI, DefMO.getReg())) {
1075 setScoreByOperand(DefMO, EXP_CNT, CurrScore);
1076 }
1077 }
1078 }
1079 for (const MachineOperand &Op : Inst.all_uses()) {
1080 if (TRI->isVectorRegister(*MRI, Op.getReg()))
1081 setScoreByOperand(Op, EXP_CNT, CurrScore);
1082 }
1083 }
1084 } else if (T == X_CNT) {
1085 WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1086 if (PendingEvents & (1 << OtherEvent)) {
1087 // Hardware inserts an implicit xcnt between interleaved
1088 // SMEM and VMEM operations. So there will never be
1089 // outstanding address translations for both SMEM and
1090 // VMEM at the same time.
1091 setScoreLB(T, getScoreUB(T) - 1);
1092 PendingEvents &= ~(1 << OtherEvent);
1093 }
1094 for (const MachineOperand &Op : Inst.all_uses())
1095 setScoreByOperand(Op, T, CurrScore);
1096 } else if (T == VA_VDST || T == VM_VSRC) {
1097 // Match the score to the VGPR destination or source registers as
1098 // appropriate
1099 for (const MachineOperand &Op : Inst.operands()) {
1100 if (!Op.isReg() || (T == VA_VDST && Op.isUse()) ||
1101 (T == VM_VSRC && Op.isDef()))
1102 continue;
1103 if (TRI->isVectorRegister(*Context->MRI, Op.getReg()))
1104 setScoreByOperand(Op, T, CurrScore);
1105 }
1106 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1107 // Match the score to the destination registers.
1108 //
1109 // Check only explicit operands. Stores, especially spill stores, include
1110 // implicit uses and defs of their super registers which would create an
1111 // artificial dependency, while these are there only for register liveness
1112 // accounting purposes.
1113 //
1114 // There are special cases where implicit register defs exist, such as M0
1115 // or VCC, but none occur with memory instructions.
1116 for (const MachineOperand &Op : Inst.defs()) {
1117 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
1118 if (!TRI->isVectorRegister(*MRI, Op.getReg())) // TODO: add wrapper
1119 continue;
1120 if (updateVMCntOnly(Inst)) {
1121 // updateVMCntOnly should only leave us with VGPRs:
1122 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1123 // defs. That's required for sane tracking in `VMem[...].VMEMTypes` below.
1124 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
1125 VmemType V = getVmemType(Inst);
1126 unsigned char TypesMask = 1 << V;
1127 // If instruction can have Point Sample Accel applied, we have to flag
1128 // this with another potential dependency
1129 if (hasPointSampleAccel(Inst))
1130 TypesMask |= 1 << VMEM_NOSAMPLER;
1131 for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
1132 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1133 }
1134 }
1135 setScoreByOperand(Op, T, CurrScore);
1136 }
1137 if (Inst.mayStore() &&
1138 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
1139 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1140 // written can be accessed. A load from LDS to VMEM does not need a wait.
1141 //
1142 // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1143 // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1144 // store. The "Slot" is the index into LDSDMAStores + 1.
1145 unsigned Slot = 0;
1146 for (const auto *MemOp : Inst.memoperands()) {
1147 if (!MemOp->isStore() ||
1148 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1149 continue;
1150 // Comparing just AA info does not guarantee memoperands are equal
1151 // in general, but this is so for LDS DMA in practice.
1152 auto AAI = MemOp->getAAInfo();
1153 // Alias scope information gives a way to definitively identify the
1154 // original memory object; in practice it is produced by the module LDS
1155 // lowering pass. If there is no scope available we will not be able
1156 // to disambiguate LDS aliasing, as after the module lowering all LDS
1157 // is squashed into a single big object.
1158 if (!AAI || !AAI.Scope)
1159 break;
1160 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1161 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1162 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1163 Slot = I + 1;
1164 break;
1165 }
1166 }
1167 }
1168 if (Slot)
1169 break;
1170 // The slot may not be valid because it can be >= NUM_LDSDMA which
1171 // means the scoreboard cannot track it. We still want to preserve the
1172 // MI in order to check alias information, though.
1173 LDSDMAStores.push_back(&Inst);
1174 Slot = LDSDMAStores.size();
1175 break;
1176 }
1177 setVMemScore(LDSDMA_BEGIN, T, CurrScore);
1178 if (Slot && Slot < NUM_LDSDMA)
1179 setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
1180 }
1181
1182 if (E == SCC_WRITE) {
1183 setRegScore(AMDGPU::SCC, T, CurrScore);
1184 PendingSCCWrite = &Inst;
1185 }
1186 }
1187}
1188
1189void WaitcntBrackets::print(raw_ostream &OS) const {
1190 const GCNSubtarget *ST = Context->ST;
1191
1192 OS << '\n';
1193 for (auto T : inst_counter_types(Context->MaxCounter)) {
1194 unsigned SR = getScoreRange(T);
1195
1196 switch (T) {
1197 case LOAD_CNT:
1198 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1199 << SR << "):";
1200 break;
1201 case DS_CNT:
1202 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1203 << SR << "):";
1204 break;
1205 case EXP_CNT:
1206 OS << " EXP_CNT(" << SR << "):";
1207 break;
1208 case STORE_CNT:
1209 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1210 << SR << "):";
1211 break;
1212 case SAMPLE_CNT:
1213 OS << " SAMPLE_CNT(" << SR << "):";
1214 break;
1215 case BVH_CNT:
1216 OS << " BVH_CNT(" << SR << "):";
1217 break;
1218 case KM_CNT:
1219 OS << " KM_CNT(" << SR << "):";
1220 break;
1221 case X_CNT:
1222 OS << " X_CNT(" << SR << "):";
1223 break;
1224 case VA_VDST:
1225 OS << " VA_VDST(" << SR << "): ";
1226 break;
1227 case VM_VSRC:
1228 OS << " VM_VSRC(" << SR << "): ";
1229 break;
1230 default:
1231 OS << " UNKNOWN(" << SR << "):";
1232 break;
1233 }
1234
1235 if (SR != 0) {
1236 // Print vgpr scores.
1237 unsigned LB = getScoreLB(T);
1238
1239 SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1240 sort(SortedVMEMIDs);
1241
1242 for (auto ID : SortedVMEMIDs) {
1243 unsigned RegScore = VMem.at(ID).Scores[T];
1244 if (RegScore <= LB)
1245 continue;
1246 unsigned RelScore = RegScore - LB - 1;
1247 if (ID < REGUNITS_END) {
1248 OS << ' ' << RelScore << ":vRU" << ID;
1249 } else {
1250 assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1251 "Unhandled/unexpected ID value!");
1252 OS << ' ' << RelScore << ":LDSDMA" << ID;
1253 }
1254 }
1255
1256 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1257 if (isSmemCounter(T)) {
1258 SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1259 sort(SortedSMEMIDs);
1260 for (auto ID : SortedSMEMIDs) {
1261 unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)];
1262 if (RegScore <= LB)
1263 continue;
1264 unsigned RelScore = RegScore - LB - 1;
1265 OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
1266 }
1267 }
1268
1269 if (T == KM_CNT && SCCScore > 0)
1270 OS << ' ' << SCCScore << ":scc";
1271 }
1272 OS << '\n';
1273 }
1274
1275 OS << "Pending Events: ";
1276 if (hasPendingEvent()) {
1277 ListSeparator LS;
1278 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1279 if (hasPendingEvent((WaitEventType)I)) {
1280 OS << LS << WaitEventTypeName[I];
1281 }
1282 }
1283 } else {
1284 OS << "none";
1285 }
1286 OS << '\n';
1287
1288 OS << '\n';
1289}
1290
1291/// Simplify \p UpdateWait by removing waits that are redundant based on the
1292/// current WaitcntBrackets and any other waits specified in \p CheckWait.
1293void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
1294 AMDGPU::Waitcnt &UpdateWait) const {
1295 simplifyWaitcnt(LOAD_CNT, UpdateWait.LoadCnt);
1296 simplifyWaitcnt(EXP_CNT, UpdateWait.ExpCnt);
1297 simplifyWaitcnt(DS_CNT, UpdateWait.DsCnt);
1298 simplifyWaitcnt(STORE_CNT, UpdateWait.StoreCnt);
1299 simplifyWaitcnt(SAMPLE_CNT, UpdateWait.SampleCnt);
1300 simplifyWaitcnt(BVH_CNT, UpdateWait.BvhCnt);
1301 simplifyWaitcnt(KM_CNT, UpdateWait.KmCnt);
1302 simplifyXcnt(CheckWait, UpdateWait);
1303 simplifyWaitcnt(VA_VDST, UpdateWait.VaVdst);
1304 simplifyVmVsrc(CheckWait, UpdateWait);
1305}
1306
1307void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1308 unsigned &Count) const {
1309 // The number of outstanding events for this type, T, can be calculated
1310 // as (UB - LB). If the current Count is greater than or equal to the number
1311 // of outstanding events, then the wait for this counter is redundant.
1312 if (Count >= getScoreRange(T))
1313 Count = ~0u;
1314}
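// Worked example (illustrative): with ScoreLB(T) = 4 and ScoreUB(T) = 7 there
// are three outstanding events of type T, so any requested wait of 3 or more
// (e.g. loadcnt(3)) is already guaranteed and is dropped by resetting Count
// to ~0u.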
1315
1316void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1317 AMDGPU::Waitcnt &UpdateWait) const {
1318 // Waiting for some counters implies waiting for VM_VSRC, since an
1319 // instruction that decrements a counter on completion would have
1320 // decremented VM_VSRC once its VGPR operands had been read.
1321 if (CheckWait.VmVsrc >=
1322 std::min({CheckWait.LoadCnt, CheckWait.StoreCnt, CheckWait.SampleCnt,
1323 CheckWait.BvhCnt, CheckWait.DsCnt}))
1324 UpdateWait.VmVsrc = ~0u;
1325 simplifyWaitcnt(VM_VSRC, UpdateWait.VmVsrc);
1326}
1327
1328void WaitcntBrackets::purgeEmptyTrackingData() {
1329 for (auto &[K, V] : make_early_inc_range(VMem)) {
1330 if (V.empty())
1331 VMem.erase(K);
1332 }
1333 for (auto &[K, V] : make_early_inc_range(SGPRs)) {
1334 if (V.empty())
1335 SGPRs.erase(K);
1336 }
1337}
1338
1339void WaitcntBrackets::determineWaitForScore(InstCounterType T,
1340 unsigned ScoreToWait,
1341 AMDGPU::Waitcnt &Wait) const {
1342 const unsigned LB = getScoreLB(T);
1343 const unsigned UB = getScoreUB(T);
1344
1345 // If the score falls within the bracket, we need a waitcnt.
1346 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1347 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1348 !Context->ST->hasFlatLgkmVMemCountInOrder()) {
1349 // If there is a pending FLAT operation, and this is a VMem or LGKM
1350 // waitcnt and the target can report early completion, then we need
1351 // to force a waitcnt 0.
1352 addWait(Wait, T, 0);
1353 } else if (counterOutOfOrder(T)) {
1354 // The counter can get decremented out-of-order when there are multiple
1355 // types of event in the bracket, so emit an s_wait with a conservative
1356 // value of 0 for this counter.
1357 addWait(Wait, T, 0);
1358 } else {
1359 // If a counter has been maxed out avoid overflow by waiting for
1360 // MAX(CounterType) - 1 instead.
1361 unsigned NeededWait = std::min(
1362 UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
1363 addWait(Wait, T, NeededWait);
1364 }
1365 }
1366}
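// Worked example (illustrative): with LB = 4, UB = 7 and ScoreToWait = 6 (the
// second-most-recent event), the in-order case computes UB - ScoreToWait = 1,
// i.e. wait until at most one event of this type is still outstanding, which
// guarantees the event that produced score 6 has completed.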
1367
1368void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
1369 AMDGPU::Waitcnt &Wait) const {
1370 if (Reg == AMDGPU::SCC) {
1371 determineWaitForScore(T, SCCScore, Wait);
1372 } else {
1373 bool IsVGPR = Context->TRI->isVectorRegister(*Context->MRI, Reg);
1374 for (MCRegUnit RU : regunits(Reg))
1375 determineWaitForScore(
1376 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1377 Wait);
1378 }
1379}
1380
1381void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
1382 AMDGPU::Waitcnt &Wait) const {
1383 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1384 determineWaitForScore(T, getVMemScore(TID, T), Wait);
1385}
1386
1387void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1388 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1389 // SCC has landed
1390 if (PendingSCCWrite &&
1391 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1392 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1393 unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
1394 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1395 if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
1396 SCC_WRITE_PendingEvent) {
1397 setScoreLB(KM_CNT, getScoreUB(KM_CNT));
1398 }
1399
1400 PendingEvents &= ~SCC_WRITE_PendingEvent;
1401 PendingSCCWrite = nullptr;
1402 }
1403}
1404
1405void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1406 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1407 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1408 applyWaitcnt(DS_CNT, Wait.DsCnt);
1409 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1410 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1411 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1412 applyWaitcnt(KM_CNT, Wait.KmCnt);
1413 applyWaitcnt(X_CNT, Wait.XCnt);
1414 applyWaitcnt(VA_VDST, Wait.VaVdst);
1415 applyWaitcnt(VM_VSRC, Wait.VmVsrc);
1416}
1417
1418void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1419 const unsigned UB = getScoreUB(T);
1420 if (Count >= UB)
1421 return;
1422 if (Count != 0) {
1423 if (counterOutOfOrder(T))
1424 return;
1425 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1426 } else {
1427 setScoreLB(T, UB);
1428 PendingEvents &= ~Context->WaitEventMaskForInst[T];
1429 }
1430
1431 if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
1432 if (!hasMixedPendingEvents(X_CNT))
1433 applyWaitcnt(X_CNT, 0);
1434 else
1435 PendingEvents &= ~(1 << SMEM_GROUP);
1436 }
1437 if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
1438 !hasPendingEvent(STORE_CNT)) {
1439 if (!hasMixedPendingEvents(X_CNT))
1440 applyWaitcnt(X_CNT, Count);
1441 else if (Count == 0)
1442 PendingEvents &= ~(1 << VMEM_GROUP);
1443 }
1444}
1445
1446void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
1447 AMDGPU::Waitcnt &UpdateWait) const {
1448 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1449 // optimizations. On entry to a block with multiple predecessors, there may
1450 // be pending SMEM and VMEM events active at the same time.
1451 // In such cases, only clear one active event at a time.
1452 // TODO: Revisit xcnt optimizations for gfx1250.
1453 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1454 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1455 // zero.
1456 if (CheckWait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
1457 UpdateWait.XCnt = ~0u;
1458 // If we have a pending store we cannot optimize XCnt because we do not wait
1459 // for stores. VMEM loads return in order, so if we only have loads XCnt is
1460 // decremented to the same number as LOADCnt.
1461 if (CheckWait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1462 !hasPendingEvent(STORE_CNT) && CheckWait.XCnt >= CheckWait.LoadCnt)
1463 UpdateWait.XCnt = ~0u;
1464 simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
1465}
1466
1467// Where there are multiple types of event in the bracket of a counter,
1468// the decrement may go out of order.
1469bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1470 // Scalar memory reads can always go out of order.
1471 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1472 (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
1473 return true;
1474
1475 // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
1476 // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
1477 // out-of-order completion.
1478 if (T == LOAD_CNT) {
1479 unsigned Events = hasPendingEvent(T);
1480 // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
1481 // events
1482 Events &= ~(1 << GLOBAL_INV_ACCESS);
1483 // Return true only if there are still multiple event types after removing
1484 // GLOBAL_INV
1485 return Events & (Events - 1);
1486 }
1487
1488 return hasMixedPendingEvents(T);
1489}
1490
1491INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1492 false, false)
1493INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
1494INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1495INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1496 false, false)
1497
1498char SIInsertWaitcntsLegacy::ID = 0;
1499
1500char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1501
1502FunctionPass *llvm::createSIInsertWaitcntsPass() {
1503 return new SIInsertWaitcntsLegacy();
1504}
1505
1506static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1507 unsigned NewEnc) {
1508 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1509 assert(OpIdx >= 0);
1510
1511 MachineOperand &MO = MI.getOperand(OpIdx);
1512
1513 if (NewEnc == MO.getImm())
1514 return false;
1515
1516 MO.setImm(NewEnc);
1517 return true;
1518}
1519
1520/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1521/// and if so, which counter it is waiting on.
1522static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1523 switch (Opcode) {
1524 case AMDGPU::S_WAIT_LOADCNT:
1525 return LOAD_CNT;
1526 case AMDGPU::S_WAIT_EXPCNT:
1527 return EXP_CNT;
1528 case AMDGPU::S_WAIT_STORECNT:
1529 return STORE_CNT;
1530 case AMDGPU::S_WAIT_SAMPLECNT:
1531 return SAMPLE_CNT;
1532 case AMDGPU::S_WAIT_BVHCNT:
1533 return BVH_CNT;
1534 case AMDGPU::S_WAIT_DSCNT:
1535 return DS_CNT;
1536 case AMDGPU::S_WAIT_KMCNT:
1537 return KM_CNT;
1538 case AMDGPU::S_WAIT_XCNT:
1539 return X_CNT;
1540 default:
1541 return {};
1542 }
1543}
1544
1545bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1546 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1547 if (Opcode == Waitcnt->getOpcode())
1548 return false;
1549
1550 Waitcnt->setDesc(TII->get(Opcode));
1551 return true;
1552}
1553
1554/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1555/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1556/// from \p Wait that were added by previous passes. Currently this pass
1557/// conservatively assumes that these preexisting waits are required for
1558/// correctness.
1559bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1560 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1561 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1562 assert(ST);
1563 assert(isNormalMode(MaxCounter));
1564
1565 bool Modified = false;
1566 MachineInstr *WaitcntInstr = nullptr;
1567 MachineInstr *WaitcntVsCntInstr = nullptr;
1568
1569 LLVM_DEBUG({
1570 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1571 if (It.isEnd())
1572 dbgs() << "end of block\n";
1573 else
1574 dbgs() << *It;
1575 });
1576
1577 for (auto &II :
1578 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1579 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1580 if (II.isMetaInstruction()) {
1581 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1582 continue;
1583 }
1584
1585 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1586 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1587
1588 // Update required wait count. If this is a soft waitcnt (= it was added
1589 // by an earlier pass), it may be entirely removed.
1590 if (Opcode == AMDGPU::S_WAITCNT) {
1591 unsigned IEnc = II.getOperand(0).getImm();
1592 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1593 if (TrySimplify)
1594 ScoreBrackets.simplifyWaitcnt(OldWait);
1595 Wait = Wait.combined(OldWait);
1596
1597 // Merge consecutive waitcnt of the same type by erasing multiples.
1598 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1599 II.eraseFromParent();
1600 Modified = true;
1601 } else
1602 WaitcntInstr = &II;
1603 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1604 assert(ST->hasVMemToLDSLoad());
1605 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1606 << "Before: " << Wait << '\n';);
1607 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait);
1608 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1609
1610 // It is possible (but unlikely) that this is the only wait instruction,
1611 // in which case, we exit this loop without a WaitcntInstr to consume
1612 // `Wait`. But that works because `Wait` was passed in by reference, and
1613 // the caller eventually calls createNewWaitcnt on it. We test this
1614 // possibility in an artificial MIR test since such a situation cannot be
1615 // recreated by running the memory legalizer.
1616 II.eraseFromParent();
1617 } else {
1618 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1619 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1620
1621 unsigned OldVSCnt =
1622 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1623 if (TrySimplify)
1624 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1625 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1626
1627 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1628 II.eraseFromParent();
1629 Modified = true;
1630 } else
1631 WaitcntVsCntInstr = &II;
1632 }
1633 }
1634
1635 if (WaitcntInstr) {
1636 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1637 AMDGPU::encodeWaitcnt(IV, Wait));
1638 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1639
1640 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1641 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1642 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1643 Wait.LoadCnt = ~0u;
1644 Wait.ExpCnt = ~0u;
1645 Wait.DsCnt = ~0u;
1646
1647 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1648 << "New Instr at block end: "
1649 << *WaitcntInstr << '\n'
1650 : dbgs() << "applied pre-existing waitcnt\n"
1651 << "Old Instr: " << *It
1652 << "New Instr: " << *WaitcntInstr << '\n');
1653 }
1654
1655 if (WaitcntVsCntInstr) {
1656 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1657 AMDGPU::OpName::simm16, Wait.StoreCnt);
1658 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1659
1660 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1661 Wait.StoreCnt = ~0u;
1662
1663 LLVM_DEBUG(It.isEnd()
1664 ? dbgs() << "applied pre-existing waitcnt\n"
1665 << "New Instr at block end: " << *WaitcntVsCntInstr
1666 << '\n'
1667 : dbgs() << "applied pre-existing waitcnt\n"
1668 << "Old Instr: " << *It
1669 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1670 }
1671
1672 return Modified;
1673}
1674
1675/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1676/// required counters in \p Wait
1677bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1678 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1679 AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
1680 assert(ST);
1681 assert(isNormalMode(MaxCounter));
1682
1683 bool Modified = false;
1684 const DebugLoc &DL = Block.findDebugLoc(It);
1685
1686 // Helper to emit expanded waitcnt sequence for profiling.
1687 // Emits waitcnts from (Outstanding-1) down to Target, or just Target if
1688 // nothing to expand. The EmitWaitcnt callback emits a single waitcnt.
1689 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
1690 auto EmitWaitcnt) {
1691 if (Outstanding > Target) {
1692 for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
1693 EmitWaitcnt(i);
1694 Modified = true;
1695 }
1696 } else {
1697 EmitWaitcnt(Target);
1698 Modified = true;
1699 }
1700 };
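// Illustrative example: with three loads outstanding and a required wait of
// vmcnt(0), the callback is invoked with 2, 1 and 0, producing
//   s_waitcnt vmcnt(2)
//   s_waitcnt vmcnt(1)
//   s_waitcnt vmcnt(0)
// so that profiling can attribute stall cycles to individual outstanding
// operations rather than to a single combined wait.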
1701
1702 // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
1703 // single instruction while VScnt has its own instruction.
1704 if (Wait.hasWaitExceptStoreCnt()) {
1705 // If profiling expansion is enabled and we have score brackets,
1706 // emit an expanded sequence
1707 if (ExpandWaitcntProfiling && ScoreBrackets) {
1708 // Check if any of the counters to be waited on are out-of-order.
1709 // If so, fall back to normal (non-expanded) behavior since expansion
1710 // would provide misleading profiling information.
1711 bool AnyOutOfOrder = false;
1712 for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
1713 unsigned &WaitCnt = getCounterRef(Wait, CT);
1714 if (WaitCnt != ~0u && ScoreBrackets->counterOutOfOrder(CT)) {
1715 AnyOutOfOrder = true;
1716 break;
1717 }
1718 }
1719
1720 if (AnyOutOfOrder) {
1721 // Fall back to non-expanded wait
1722 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1723 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1724 Modified = true;
1725 } else {
1726 // All counters are in-order, safe to expand
1727 for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
1728 unsigned &WaitCnt = getCounterRef(Wait, CT);
1729 if (WaitCnt == ~0u)
1730 continue;
1731
1732 unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
1733 ScoreBrackets->getScoreLB(CT),
1734 getWaitCountMax(getLimits(), CT) - 1);
1735 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
1736 AMDGPU::Waitcnt W;
1737 getCounterRef(W, CT) = Count;
1738 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
1739 .addImm(AMDGPU::encodeWaitcnt(IV, W));
1740 });
1741 }
1742 }
1743 } else {
1744 // Normal behavior: emit single combined waitcnt
1745 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1746 [[maybe_unused]] auto SWaitInst =
1747 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1748 Modified = true;
1749
1750 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1751 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1752 dbgs() << "New Instr: " << *SWaitInst << '\n');
1753 }
1754 }
1755
1756 if (Wait.hasWaitStoreCnt()) {
1757 assert(ST->hasVscnt());
1758
1759 if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u &&
1760 !ScoreBrackets->counterOutOfOrder(STORE_CNT)) {
1761 // Only expand if counter is not out-of-order
1762 unsigned Outstanding =
1763 std::min(ScoreBrackets->getScoreUB(STORE_CNT) -
1764 ScoreBrackets->getScoreLB(STORE_CNT),
1765 getWaitCountMax(getLimits(), STORE_CNT) - 1);
1766 EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) {
1767 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1768 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1769 .addImm(Count);
1770 });
1771 } else {
1772 [[maybe_unused]] auto SWaitInst =
1773 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1774 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1775 .addImm(Wait.StoreCnt);
1776 Modified = true;
1777
1778 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1779 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1780 dbgs() << "New Instr: " << *SWaitInst << '\n');
1781 }
1782 }
1783
1784 return Modified;
1785}
1786
1787AMDGPU::Waitcnt
1788WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1789 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1790}
1791
1792AMDGPU::Waitcnt
1793WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1794 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
1795 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1796 ~0u /* XCNT */, ExpertVal, ExpertVal);
1797}
1798
1799/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1800/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1801/// were added by previous passes. Currently this pass conservatively
1802/// assumes that these preexisting waits are required for correctness.
1803bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1804 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1805 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1806 assert(ST);
1807 assert(!isNormalMode(MaxCounter));
1808
1809 bool Modified = false;
1810 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1811 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1812 MachineInstr *WaitcntDepctrInstr = nullptr;
1813 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1814
1815 LLVM_DEBUG({
1816 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1817 if (It.isEnd())
1818 dbgs() << "end of block\n";
1819 else
1820 dbgs() << *It;
1821 });
1822
1823 // Accumulate waits that should not be simplified.
1824 AMDGPU::Waitcnt RequiredWait;
1825
1826 for (auto &II :
1827 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1828 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1829 if (II.isMetaInstruction()) {
1830 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1831 continue;
1832 }
1833
1834 MachineInstr **UpdatableInstr;
1835
1836 // Update required wait count. If this is a soft waitcnt (= it was added
1837 // by an earlier pass), it may be entirely removed.
1838
1839 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1840 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1841
1842 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1843 // attempt to do more than that either.
1844 if (Opcode == AMDGPU::S_WAITCNT)
1845 continue;
1846
1847 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1848 unsigned OldEnc =
1849 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1850 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1851 if (TrySimplify)
1852 Wait = Wait.combined(OldWait);
1853 else
1854 RequiredWait = RequiredWait.combined(OldWait);
1855 UpdatableInstr = &CombinedLoadDsCntInstr;
1856 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1857 unsigned OldEnc =
1858 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1859 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1860 if (TrySimplify)
1861 Wait = Wait.combined(OldWait);
1862 else
1863 RequiredWait = RequiredWait.combined(OldWait);
1864 UpdatableInstr = &CombinedStoreDsCntInstr;
1865 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
1866 unsigned OldEnc =
1867 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1868 AMDGPU::Waitcnt OldWait;
1869 OldWait.VaVdst = AMDGPU::DepCtr::decodeFieldVaVdst(OldEnc);
1870 OldWait.VmVsrc = AMDGPU::DepCtr::decodeFieldVmVsrc(OldEnc);
1871 if (TrySimplify)
1872 ScoreBrackets.simplifyWaitcnt(OldWait);
1873 Wait = Wait.combined(OldWait);
1874 UpdatableInstr = &WaitcntDepctrInstr;
1875 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1876 // Architectures higher than GFX10 do not have direct loads to
1877 // LDS, so no work is required here yet.
1878 II.eraseFromParent();
1879 continue;
1880 } else {
1881 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1882 assert(CT.has_value());
1883 unsigned OldCnt =
1884 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1885 if (TrySimplify)
1886 addWait(Wait, CT.value(), OldCnt);
1887 else
1888 addWait(RequiredWait, CT.value(), OldCnt);
1889 UpdatableInstr = &WaitInstrs[CT.value()];
1890 }
1891
1892 // Merge consecutive waitcnt of the same type by erasing multiples.
1893 if (!*UpdatableInstr) {
1894 *UpdatableInstr = &II;
1895 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
1896 // S_WAITCNT_DEPCTR requires special care. Don't remove a
1897 // duplicate if it is waiting on things other than VA_VDST or
1898 // VM_VSRC. If that is the case, just make sure the VA_VDST and
1899 // VM_VSRC subfields of the operand are set to the "no wait"
1900 // values.
1901
1902 unsigned Enc = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1903 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
1904 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
1905
1906 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST)) {
1907 Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
1908 Modified |= promoteSoftWaitCnt(&II);
1909 } else {
1910 II.eraseFromParent();
1911 Modified = true;
1912 }
1913 } else {
1914 II.eraseFromParent();
1915 Modified = true;
1916 }
1917 }
1918
1919 ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
1920 Wait = Wait.combined(RequiredWait);
1921
1922 if (CombinedLoadDsCntInstr) {
1923 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1924 // to be waited for. Otherwise, let the instruction be deleted so
1925 // the appropriate single counter wait instruction can be inserted
1926 // instead, when new S_WAIT_*CNT instructions are inserted by
1927 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1928 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1929 // the loop below that deals with single counter instructions.
1930 //
1931 // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
1932 // instructions that have decremented LOAD_CNT or DS_CNT on completion
1933 // will have needed to wait for their register sources to be available
1934 // first.
1935 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1936 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1937 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1938 AMDGPU::OpName::simm16, NewEnc);
1939 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1940 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1941 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1942 Wait.LoadCnt = ~0u;
1943 Wait.DsCnt = ~0u;
1944
1945 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1946 << "New Instr at block end: "
1947 << *CombinedLoadDsCntInstr << '\n'
1948 : dbgs() << "applied pre-existing waitcnt\n"
1949 << "Old Instr: " << *It << "New Instr: "
1950 << *CombinedLoadDsCntInstr << '\n');
1951 } else {
1952 CombinedLoadDsCntInstr->eraseFromParent();
1953 Modified = true;
1954 }
1955 }
1956
1957 if (CombinedStoreDsCntInstr) {
1958 // Similarly for S_WAIT_STORECNT_DSCNT.
1959 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1960 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1961 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1962 AMDGPU::OpName::simm16, NewEnc);
1963 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1964 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1965 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1966 Wait.StoreCnt = ~0u;
1967 Wait.DsCnt = ~0u;
1968
1969 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1970 << "New Instr at block end: "
1971 << *CombinedStoreDsCntInstr << '\n'
1972 : dbgs() << "applied pre-existing waitcnt\n"
1973 << "Old Instr: " << *It << "New Instr: "
1974 << *CombinedStoreDsCntInstr << '\n');
1975 } else {
1976 CombinedStoreDsCntInstr->eraseFromParent();
1977 Modified = true;
1978 }
1979 }
1980
1981 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1982 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1983 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1984 // instructions so that createNewWaitcnt() will create new combined
1985 // instructions to replace them.
1986
1987 if (Wait.DsCnt != ~0u) {
1988 // This is a vector of addresses in WaitInstrs pointing to instructions
1989 // that should be removed if they are present.
1990 SmallVector<MachineInstr **, 2> WaitsToErase;
1991
1992 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1993 // both) need to be waited for, ensure that there are no existing
1994 // individual wait count instructions for these.
1995
1996 if (Wait.LoadCnt != ~0u) {
1997 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1998 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1999 } else if (Wait.StoreCnt != ~0u) {
2000 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
2001 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
2002 }
2003
2004 for (MachineInstr **WI : WaitsToErase) {
2005 if (!*WI)
2006 continue;
2007
2008 (*WI)->eraseFromParent();
2009 *WI = nullptr;
2010 Modified = true;
2011 }
2012 }
2013
2014 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2015 if (!WaitInstrs[CT])
2016 continue;
2017
2018 unsigned NewCnt = getWait(Wait, CT);
2019 if (NewCnt != ~0u) {
2020 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
2021 AMDGPU::OpName::simm16, NewCnt);
2022 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2023
2024 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2025 setNoWait(Wait, CT);
2026
2027 LLVM_DEBUG(It.isEnd()
2028 ? dbgs() << "applied pre-existing waitcnt\n"
2029 << "New Instr at block end: " << *WaitInstrs[CT]
2030 << '\n'
2031 : dbgs() << "applied pre-existing waitcnt\n"
2032 << "Old Instr: " << *It
2033 << "New Instr: " << *WaitInstrs[CT] << '\n');
2034 } else {
2035 WaitInstrs[CT]->eraseFromParent();
2036 Modified = true;
2037 }
2038 }
2039
2040 if (WaitcntDepctrInstr) {
2041 // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
2042 // subfields with the new required values.
2043 unsigned Enc =
2044 TII->getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2045 ->getImm();
2046 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, Wait.VmVsrc);
2047 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst);
2048
2049 ScoreBrackets.applyWaitcnt(VA_VDST, Wait.VaVdst);
2050 ScoreBrackets.applyWaitcnt(VM_VSRC, Wait.VmVsrc);
2051 Wait.VaVdst = ~0u;
2052 Wait.VmVsrc = ~0u;
2053
2054 // If that new encoded Depctr immediate would actually still wait
2055 // for anything, update the instruction's operand. Otherwise it can
2056 // just be deleted.
2057 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST)) {
2058 Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
2059 AMDGPU::OpName::simm16, Enc);
2060 LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
2061 << "New Instr at block end: "
2062 << *WaitcntDepctrInstr << '\n'
2063 : dbgs() << "applyPreexistingWaitcnt\n"
2064 << "Old Instr: " << *It << "New Instr: "
2065 << *WaitcntDepctrInstr << '\n');
2066 } else {
2067 WaitcntDepctrInstr->eraseFromParent();
2068 Modified = true;
2069 }
2070 }
2071
2072 return Modified;
2073}
2074
2075/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
2076bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2077 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2078 AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
2079 assert(ST);
2080 assert(!isNormalMode(MaxCounter));
2081
2082 bool Modified = false;
2083 const DebugLoc &DL = Block.findDebugLoc(It);
2084
2085 // Helper to emit expanded waitcnt sequence for profiling.
2086 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
2087 auto EmitWaitcnt) {
2088 if (Outstanding > Target) {
2089 for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
2090 EmitWaitcnt(i);
2091 Modified = true;
2092 }
2093 } else {
2094 EmitWaitcnt(Target);
2095 Modified = true;
2096 }
2097 };
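// Same expansion as in the pre-GFX12 generator, but each step is a separate
// single-counter instruction, e.g. s_wait_loadcnt 2, s_wait_loadcnt 1,
// s_wait_loadcnt 0 for three outstanding loads.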
2098
2099 // For GFX12+, we use separate wait instructions, which makes expansion
2100 // simpler
2101 if (ExpandWaitcntProfiling && ScoreBrackets) {
2102 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2103 unsigned Count = getWait(Wait, CT);
2104 if (Count == ~0u)
2105 continue;
2106
2107 // Skip expansion for out-of-order counters - emit normal wait instead
2108 if (ScoreBrackets->counterOutOfOrder(CT)) {
2109 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
2110 .addImm(Count);
2111 Modified = true;
2112 continue;
2113 }
2114
2115 unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
2116 ScoreBrackets->getScoreLB(CT),
2117 getWaitCountMax(getLimits(), CT) - 1);
2118 EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
2119 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
2120 .addImm(Val);
2121 });
2122 }
2123 return Modified;
2124 }
2125
2126 // Normal behavior (no expansion)
2127 // Check for opportunities to use combined wait instructions.
2128 if (Wait.DsCnt != ~0u) {
2129 MachineInstr *SWaitInst = nullptr;
2130
2131 if (Wait.LoadCnt != ~0u) {
2132 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2133
2134 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2135 .addImm(Enc);
2136
2137 Wait.LoadCnt = ~0u;
2138 Wait.DsCnt = ~0u;
2139 } else if (Wait.StoreCnt != ~0u) {
2140 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2141
2142 SWaitInst =
2143 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
2144 .addImm(Enc);
2145
2146 Wait.StoreCnt = ~0u;
2147 Wait.DsCnt = ~0u;
2148 }
2149
2150 if (SWaitInst) {
2151 Modified = true;
2152
2153 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2154 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2155 dbgs() << "New Instr: " << *SWaitInst << '\n');
2156 }
2157 }
2158
2159 // Generate an instruction for any remaining counter that needs
2160 // waiting for.
2161
2162 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2163 unsigned Count = getWait(Wait, CT);
2164 if (Count == ~0u)
2165 continue;
2166
2167 [[maybe_unused]] auto SWaitInst =
2168 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
2169 .addImm(Count);
2170
2171 Modified = true;
2172
2173 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2174 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2175 dbgs() << "New Instr: " << *SWaitInst << '\n');
2176 }
2177
2178 if (Wait.hasWaitDepctr()) {
2179 assert(IsExpertMode);
2180 unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Wait.VmVsrc, *ST);
2181 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst);
2182
2183 [[maybe_unused]] auto SWaitInst =
2184 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);
2185
2186 Modified = true;
2187
2188 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
2189 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2190 dbgs() << "New Instr: " << *SWaitInst << '\n');
2191 }
2192
2193 return Modified;
2194}
2195
2196 /// Generate an s_waitcnt instruction to be placed before \p MI.
2197/// Instructions of a given type are returned in order,
2198/// but instructions of different types can complete out of order.
2199/// We rely on this in-order completion
2200/// and simply assign a score to the memory access instructions.
2201/// We keep track of the active "score bracket" to determine
2202 /// whether a memory access requires an s_waitcnt
2203/// and if so what the value of each counter is.
2204/// The "score bracket" is bound by the lower bound and upper bound
2205/// scores (*_score_LB and *_score_ub respectively).
2206 /// If FlushVmCnt is true, we want to generate an s_waitcnt to
2207/// flush the vmcnt counter here.
2208bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
2209 WaitcntBrackets &ScoreBrackets,
2210 MachineInstr *OldWaitcntInstr,
2211 bool FlushVmCnt) {
2212 setForceEmitWaitcnt();
2213
2214 assert(!MI.isMetaInstruction());
2215
2216 AMDGPU::Waitcnt Wait;
2217 const unsigned Opc = MI.getOpcode();
2218
2219 // FIXME: This should have already been handled by the memory legalizer.
2220 // Removing this currently doesn't affect any lit tests, but we need to
2221 // verify that nothing was relying on this. The number of buffer invalidates
2222 // being handled here should not be expanded.
2223 if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
2224 Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
2225 Opc == AMDGPU::BUFFER_GL1_INV) {
2226 Wait.LoadCnt = 0;
2227 }
2228
2229 // All waits must be resolved at call return.
2230 // NOTE: this could be improved with knowledge of all call sites or
2231 // with knowledge of the called routines.
2232 if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
2233 Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
2234 Opc == AMDGPU::S_SETPC_B64_return) {
2235 ReturnInsts.insert(&MI);
2236 AMDGPU::Waitcnt AllZeroWait =
2237 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2238 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2239 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2240 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2241 // no need to wait for it at function boundaries.
2242 if (ST->hasExtendedWaitCounts() &&
2243 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2244 AllZeroWait.LoadCnt = ~0u;
2245 Wait = Wait.combined(AllZeroWait);
2246 }
2247 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2248 // Technically the hardware will do this on its own if we don't, but that
2249 // might cost extra cycles compared to doing it explicitly.
2250 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2251 // have to wait for outstanding VMEM stores. In this case it can be useful to
2252 // send a message to explicitly release all VGPRs before the stores have
2253 // completed, but it is only safe to do this if there are no outstanding
2254 // scratch stores.
2255 else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
2256 if (!WCG->isOptNone() &&
2257 (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
2258 (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
2259 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
2260 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
2261 ReleaseVGPRInsts.insert(&MI);
2262 }
2263 // Resolve vm waits before gs-done.
2264 else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
2265 ST->hasLegacyGeometry() &&
2266 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
2267 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
2268 Wait.LoadCnt = 0;
2269 }
2270
2271 // Export & GDS instructions do not read the EXEC mask until after the export
2272 // is granted (which can occur well after the instruction is issued).
2273 // The shader program must flush all EXP operations on the export-count
2274 // before overwriting the EXEC mask.
2275 else {
2276 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
2277 // Export and GDS are tracked individually, either may trigger a waitcnt
2278 // for EXEC.
2279 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2280 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2281 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2282 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2283 Wait.ExpCnt = 0;
2284 }
2285 }
2286
2287 // Wait for any pending GDS instruction to complete before any
2288 // "Always GDS" instruction.
2289 if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
2290 addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
2291
2292 if (MI.isCall()) {
2293 // The function is going to insert a wait on everything in its prolog.
2294 // This still needs to be careful if the call target is a load (e.g. a GOT
2295 // load). We also need to check WAW dependency with saved PC.
2296 CallInsts.insert(&MI);
2297 Wait = AMDGPU::Waitcnt();
2298
2299 const MachineOperand &CallAddrOp = TII->getCalleeOperand(MI);
2300 if (CallAddrOp.isReg()) {
2301 ScoreBrackets.determineWaitForPhysReg(
2302 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait);
2303
2304 if (const auto *RtnAddrOp =
2305 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
2306 ScoreBrackets.determineWaitForPhysReg(
2307 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
2308 }
2309 }
2310 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2311 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2312 } else {
2313 // FIXME: Should not be relying on memoperands.
2314 // Look at the source operands of every instruction to see if
2315 // any of them results from a previous memory operation that affects
2316 // its current usage. If so, an s_waitcnt instruction needs to be
2317 // emitted.
2318 // If the source operand was defined by a load, add the s_waitcnt
2319 // instruction.
2320 //
2321 // Two cases are handled for destination operands:
2322 // 1) If the destination operand was defined by a load, add the s_waitcnt
2323 // instruction to guarantee the right WAW order.
2324 // 2) If a destination operand was used by a recent export/store instruction,
2325 // add s_waitcnt on exp_cnt to guarantee the WAR order.
2326
2327 for (const MachineMemOperand *Memop : MI.memoperands()) {
2328 const Value *Ptr = Memop->getValue();
2329 if (Memop->isStore()) {
2330 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2331 addWait(Wait, SmemAccessCounter, 0);
2332 if (PDT->dominates(MI.getParent(), It->second))
2333 SLoadAddresses.erase(It);
2334 }
2335 }
2336 unsigned AS = Memop->getAddrSpace();
2337 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
2338 continue;
2339 // No need to wait before load from VMEM to LDS.
2340 if (TII->mayWriteLDSThroughDMA(MI))
2341 continue;
2342
2343 // LOAD_CNT is only relevant to vgpr or LDS.
2344 unsigned TID = LDSDMA_BEGIN;
2345 if (Ptr && Memop->getAAInfo()) {
2346 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2347 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2348 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2349 if ((I + 1) >= NUM_LDSDMA) {
2350 // We didn't have enough slot to track this LDS DMA store, it
2351 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2352 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2353 break;
2354 }
2355
2356 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait);
2357 }
2358 }
2359 } else {
2360 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2361 }
2362 if (Memop->isStore()) {
2363 ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait);
2364 }
2365 }
2366
2367 // Loop over use and def operands.
2368 for (const MachineOperand &Op : MI.operands()) {
2369 if (!Op.isReg())
2370 continue;
2371
2372 // If the instruction does not read tied source, skip the operand.
2373 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
2374 continue;
2375
2376 MCPhysReg Reg = Op.getReg().asMCReg();
2377
2378 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
2379 if (IsVGPR) {
2380 // Implicit VGPR defs and uses are never a part of the memory
2381 // instructions description and usually present to account for
2382 // super-register liveness.
2383 // TODO: Most of the other instructions also have implicit uses
2384 // for the liveness accounting only.
2385 if (Op.isImplicit() && MI.mayLoadOrStore())
2386 continue;
2387
2388 ScoreBrackets.determineWaitForPhysReg(VA_VDST, Reg, Wait);
2389 if (Op.isDef())
2390 ScoreBrackets.determineWaitForPhysReg(VM_VSRC, Reg, Wait);
2391 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2392 // previous write and this write are the same type of VMEM
2393 // instruction, in which case they are (in some architectures)
2394 // guaranteed to write their results in order anyway.
2395 // Additionally check instructions where Point Sample Acceleration
2396 // might be applied.
2397 if (Op.isUse() || !updateVMCntOnly(MI) ||
2398 ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
2399 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2400 !ST->hasVmemWriteVgprInOrder()) {
2401 ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
2402 ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
2403 ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
2404 ScoreBrackets.clearVgprVmemTypes(Reg);
2405 }
2406
2407 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2408 ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait);
2409 }
2410 ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait);
2411 } else if (Op.getReg() == AMDGPU::SCC) {
2412 ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait);
2413 } else {
2414 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
2415 }
2416
2417 if (ST->hasWaitXCnt() && Op.isDef())
2418 ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);
2419 }
2420 }
2421 }
2422
2423 // Ensure safety against exceptions from outstanding memory operations while
2424 // waiting for a barrier:
2425 //
2426 // * Some subtargets safely handle backing off the barrier in hardware
2427 // when an exception occurs.
2428 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2429 // there can be no outstanding memory operations during the wait.
2430 // * Subtargets with split barriers don't need to back off the barrier; it
2431 // is up to the trap handler to preserve the user barrier state correctly.
2432 //
2433 // In all other cases, ensure safety by ensuring that there are no outstanding
2434 // memory operations.
2435 if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
2436 !ST->supportsBackOffBarrier()) {
2437 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2438 }
2439
2440 // TODO: Remove this work-around, enable the assert for Bug 457939
2441 // after fixing the scheduler. Also, the Shader Compiler code is
2442 // independent of target.
2443 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() &&
2444 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2445 Wait.DsCnt = 0;
2446 }
2447
2448 // Verify that the wait is actually needed.
2449 ScoreBrackets.simplifyWaitcnt(Wait);
2450
2451 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2452 // waits on VA_VDST if the instruction it would precede is not a VALU
2453 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2454 // expert scheduling mode.
2455 if (TII->isVALU(MI))
2456 Wait.VaVdst = ~0u;
2457
2458 // Since the translation for VMEM addresses occur in-order, we can apply the
2459 // XCnt if the current instruction is of VMEM type and has a memory
2460 // dependency with another VMEM instruction in flight.
2461 if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
2462 ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
2463 Wait.XCnt = ~0u;
2464 }
2465
2466 // When forcing emit, skip terminators: emitting a waitcnt between
2467 // terminators would break the MBB's terminator sequence.
2468 if (ForceEmitZeroFlag && !MI.isTerminator())
2469 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2470
2471 if (ForceEmitWaitcnt[LOAD_CNT])
2472 Wait.LoadCnt = 0;
2473 if (ForceEmitWaitcnt[EXP_CNT])
2474 Wait.ExpCnt = 0;
2475 if (ForceEmitWaitcnt[DS_CNT])
2476 Wait.DsCnt = 0;
2477 if (ForceEmitWaitcnt[SAMPLE_CNT])
2478 Wait.SampleCnt = 0;
2479 if (ForceEmitWaitcnt[BVH_CNT])
2480 Wait.BvhCnt = 0;
2481 if (ForceEmitWaitcnt[KM_CNT])
2482 Wait.KmCnt = 0;
2483 if (ForceEmitWaitcnt[X_CNT])
2484 Wait.XCnt = 0;
2485 // Only force emit VA_VDST and VM_VSRC if expert mode is enabled.
2486 if (IsExpertMode) {
2487 if (ForceEmitWaitcnt[VA_VDST])
2488 Wait.VaVdst = 0;
2489 if (ForceEmitWaitcnt[VM_VSRC])
2490 Wait.VmVsrc = 0;
2491 }
2492
2493 if (FlushVmCnt) {
2494 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2495 Wait.LoadCnt = 0;
2496 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2497 Wait.SampleCnt = 0;
2498 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2499 Wait.BvhCnt = 0;
2500 }
2501
2502 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
2503 Wait.LoadCnt = 0;
2504
2505 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2506 OldWaitcntInstr);
2507}
2508
2509bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2510 MachineBasicBlock::instr_iterator It,
2511 MachineBasicBlock &Block,
2512 WaitcntBrackets &ScoreBrackets,
2513 MachineInstr *OldWaitcntInstr) {
2514 bool Modified = false;
2515
2516 if (OldWaitcntInstr)
2517 // Try to merge the required wait with preexisting waitcnt instructions.
2518 // Also erase redundant waitcnt.
2519 Modified =
2520 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2521
2522 AMDGPU::Waitcnt WaitForScore = Wait;
2523
2524 // ExpCnt can be merged into VINTERP.
2525 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
2526 SIInstrInfo::isVINTERP(*It)) {
2527 MachineOperand *WaitExp =
2528 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2529 if (Wait.ExpCnt < WaitExp->getImm()) {
2530 WaitExp->setImm(Wait.ExpCnt);
2531 Modified = true;
2532 }
2533 Wait.ExpCnt = ~0u;
2534
2535 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2536 << "Update Instr: " << *It);
2537 }
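// Example: if the required wait is expcnt(0) and the VINTERP instruction
// currently encodes a larger waitexp value, its waitexp operand is lowered to
// 0 and no separate wait instruction has to be emitted for EXP_CNT.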
2538
2539 if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets))
2540 Modified = true;
2541
2542 // Any counts that could have been applied to any existing waitcnt
2543 // instructions will have been done so, now deal with any remaining.
2544 ScoreBrackets.applyWaitcnt(WaitForScore);
2545
2546 return Modified;
2547}
2548
2549std::optional<WaitEventType>
2550SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
2551 if (TII->isVALU(Inst)) {
2552 // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
2553 // out-of-order with respect to each other, so each of these classes
2554 // has its own event.
2555
2556 if (TII->isXDL(Inst))
2557 return VGPR_XDL_WRITE;
2558
2559 if (TII->isTRANS(Inst))
2560 return VGPR_TRANS_WRITE;
2561
2563 return VGPR_DPMACC_WRITE;
2564
2565 return VGPR_CSMACC_WRITE;
2566 }
2567
2568 // FLAT and LDS instructions may read their VGPR sources out-of-order
2569 // with respect to each other and all other VMEM instructions, so
2570 // each of these also has a separate event.
2571
2572 if (TII->isFLAT(Inst))
2573 return VGPR_FLAT_READ;
2574
2575 if (TII->isDS(Inst))
2576 return VGPR_LDS_READ;
2577
2578 if (TII->isVMEM(Inst) || TII->isVIMAGE(Inst) || TII->isVSAMPLE(Inst))
2579 return VGPR_VMEM_READ;
2580
2581 // Otherwise, no hazard.
2582
2583 return {};
2584}
2585
2586bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2587 return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
2588 (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2589}
2590
2591// Return true if the next instruction is S_ENDPGM, following fallthrough
2592// blocks if necessary.
2593bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2594 MachineBasicBlock *Block) const {
2595 auto BlockEnd = Block->getParent()->end();
2596 auto BlockIter = Block->getIterator();
2597
2598 while (true) {
2599 if (It.isEnd()) {
2600 if (++BlockIter != BlockEnd) {
2601 It = BlockIter->instr_begin();
2602 continue;
2603 }
2604
2605 return false;
2606 }
2607
2608 if (!It->isMetaInstruction())
2609 break;
2610
2611 It++;
2612 }
2613
2614 assert(!It.isEnd());
2615
2616 return It->getOpcode() == AMDGPU::S_ENDPGM;
2617}
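// Used by insertForcedWaitAfter(): when a wait was forced after an always-GDS
// instruction and the program ends immediately afterwards, an extra S_NOP is
// inserted ahead of the S_ENDPGM.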
2618
2619// Add a wait after an instruction if architecture requirements mandate one.
2620bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2621 MachineBasicBlock &Block,
2622 WaitcntBrackets &ScoreBrackets) {
2623 AMDGPU::Waitcnt Wait;
2624 bool NeedsEndPGMCheck = false;
2625
2626 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2627 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2628 !SIInstrInfo::isAtomicRet(Inst));
2629
2630 if (TII->isAlwaysGDS(Inst.getOpcode())) {
2631 Wait.DsCnt = 0;
2632 NeedsEndPGMCheck = true;
2633 }
2634
2635 ScoreBrackets.simplifyWaitcnt(Wait);
2636
2637 auto SuccessorIt = std::next(Inst.getIterator());
2638 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2639 /*OldWaitcntInstr=*/nullptr);
2640
2641 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2642 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
2643 .addImm(0);
2644 }
2645
2646 return Result;
2647}
2648
2649void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2650 WaitcntBrackets *ScoreBrackets) {
2651 // Now look at the instruction opcode. If it is a memory access
2652 // instruction, update the upper-bound of the appropriate counter's
2653 // bracket and the destination operand scores.
2654 // For architectures with X_CNT, mark the source address operands
2655 // with the appropriate counter values.
2656 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
2657
2658 bool IsVMEMAccess = false;
2659 bool IsSMEMAccess = false;
2660
2661 if (IsExpertMode) {
2662 if (const auto ET = getExpertSchedulingEventType(Inst))
2663 ScoreBrackets->updateByEvent(*ET, Inst);
2664 }
2665
2666 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
2667 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2668 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2669 ScoreBrackets->updateByEvent(GDS_ACCESS, Inst);
2670 ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst);
2671 ScoreBrackets->setPendingGDS();
2672 } else {
2673 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2674 }
2675 } else if (TII->isFLAT(Inst)) {
2677 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2678 return;
2679 }
2680
2681 assert(Inst.mayLoadOrStore());
2682
2683 int FlatASCount = 0;
2684
2685 if (TII->mayAccessVMEMThroughFlat(Inst)) {
2686 ++FlatASCount;
2687 IsVMEMAccess = true;
2688 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2689 }
2690
2691 if (TII->mayAccessLDSThroughFlat(Inst)) {
2692 ++FlatASCount;
2693 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2694 }
2695
2696 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
2697 // pointers. They do have two operands that each access global and LDS, thus
2698 // making it appear at this point that they are using a flat pointer. Filter
2699 // them out, and for the rest, generate a dependency on flat pointers so
2700 // that both VM and LGKM counters are flushed.
2701 if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1)
2702 ScoreBrackets->setPendingFlat();
2703 } else if (SIInstrInfo::isVMEM(Inst) &&
2704 !AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
2705 IsVMEMAccess = true;
2706 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2707
2708 if (ST->vmemWriteNeedsExpWaitcnt() &&
2709 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2710 ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst);
2711 }
2712 } else if (TII->isSMRD(Inst)) {
2713 IsSMEMAccess = true;
2714 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2715 } else if (Inst.isCall()) {
2716 // Act as a wait on everything
2717 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2718 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2719 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2720 ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
2721 } else if (TII->isVINTERP(Inst)) {
2722 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2723 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2724 } else if (SIInstrInfo::isEXP(Inst)) {
2725 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2726 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2727 ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst);
2728 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2729 ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst);
2730 else
2731 ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst);
2732 } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
2733 ScoreBrackets->updateByEvent(SCC_WRITE, Inst);
2734 } else {
2735 switch (Inst.getOpcode()) {
2736 case AMDGPU::S_SENDMSG:
2737 case AMDGPU::S_SENDMSG_RTN_B32:
2738 case AMDGPU::S_SENDMSG_RTN_B64:
2739 case AMDGPU::S_SENDMSGHALT:
2740 ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst);
2741 break;
2742 case AMDGPU::S_MEMTIME:
2743 case AMDGPU::S_MEMREALTIME:
2744 case AMDGPU::S_GET_BARRIER_STATE_M0:
2745 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2746 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2747 break;
2748 }
2749 }
2750
2751 if (!ST->hasWaitXCnt())
2752 return;
2753
2754 if (IsVMEMAccess)
2755 ScoreBrackets->updateByEvent(VMEM_GROUP, Inst);
2756
2757 if (IsSMEMAccess)
2758 ScoreBrackets->updateByEvent(SMEM_GROUP, Inst);
2759}
2760
2761bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2762 unsigned OtherScore) {
2763 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2764 unsigned OtherShifted =
2765 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2766 Score = std::max(MyShifted, OtherShifted);
2767 return OtherShifted > MyShifted;
2768}
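// Illustrative example of the rebasing: suppose this bracket has LB=4, UB=7
// (3 pending events) and the incoming bracket has LB=10, UB=15 (5 pending).
// merge() picks NewUB = 4 + max(3, 5) = 9, so MyShift = +2 and
// OtherShift = -6; a local score of 6 becomes 8, a score of 12 from the other
// bracket becomes 6, and any score at or below its old lower bound is treated
// as already waited for and collapses to 0.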
2769
2770 /// Merge the pending events and associated score brackets of \p Other into
2771 /// this bracket's status.
2772///
2773/// Returns whether the merge resulted in a change that requires tighter waits
2774/// (i.e. the merged brackets strictly dominate the original brackets).
2775bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2776 bool StrictDom = false;
2777
2778 // Check if "other" has keys we don't have, and create default entries for
2779 // those. If they remain empty after merging, we will clean it up after.
2780 for (auto K : Other.VMem.keys())
2781 VMem.try_emplace(K);
2782 for (auto K : Other.SGPRs.keys())
2783 SGPRs.try_emplace(K);
2784
2785 for (auto T : inst_counter_types(Context->MaxCounter)) {
2786 // Merge event flags for this counter
2787 const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
2788 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2789 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2790 if (OtherEvents & ~OldEvents)
2791 StrictDom = true;
2792 PendingEvents |= OtherEvents;
2793
2794 // Merge scores for this counter
2795 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2796 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2797 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2798 if (NewUB < ScoreLBs[T])
2799 report_fatal_error("waitcnt score overflow");
2800
2801 MergeInfo M;
2802 M.OldLB = ScoreLBs[T];
2803 M.OtherLB = Other.ScoreLBs[T];
2804 M.MyShift = NewUB - ScoreUBs[T];
2805 M.OtherShift = NewUB - Other.ScoreUBs[T];
2806
2807 ScoreUBs[T] = NewUB;
2808
2809 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2810
2811 if (T == DS_CNT)
2812 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
2813
2814 if (T == KM_CNT) {
2815 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
2816 if (Other.hasPendingEvent(SCC_WRITE)) {
2817 unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
2818 if (!OldEventsHasSCCWrite) {
2819 PendingSCCWrite = Other.PendingSCCWrite;
2820 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
2821 PendingSCCWrite = nullptr;
2822 }
2823 }
2824 }
2825
2826 for (auto &[RegID, Info] : VMem)
2827 StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
2828
2829 if (isSmemCounter(T)) {
2830 unsigned Idx = getSgprScoresIdx(T);
2831 for (auto &[RegID, Info] : SGPRs) {
2832 auto It = Other.SGPRs.find(RegID);
2833 unsigned OtherScore =
2834 (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
2835 StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore);
2836 }
2837 }
2838 }
2839
2840 for (auto &[TID, Info] : VMem) {
2841 if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
2842 unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
2843 StrictDom |= NewVmemTypes != Info.VMEMTypes;
2844 Info.VMEMTypes = NewVmemTypes;
2845 }
2846 }
2847
2848 purgeEmptyTrackingData();
2849 return StrictDom;
2850}
2851
2852static bool isWaitInstr(MachineInstr &Inst) {
2853 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2854 return Opcode == AMDGPU::S_WAITCNT ||
2855 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2856 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2857 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2858 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2859 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2860 counterTypeForInstr(Opcode).has_value();
2861}
2862
2863void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
2865 bool ExpertMode) const {
2866 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
2868 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SETREG_IMM32_B32))
2869 .addImm(ExpertMode ? 2 : 0)
2870 .addImm(EncodedReg);
2871}
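// Emits an S_SETREG_IMM32_B32 that writes 2 (expert scheduling) or 0 (the
// default scheduling mode) to the hardware scheduling-mode register at the
// given insertion point.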
2872
2873// Generate s_waitcnt instructions where needed.
2874bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2875 MachineBasicBlock &Block,
2876 WaitcntBrackets &ScoreBrackets) {
2877 bool Modified = false;
2878
2879 LLVM_DEBUG({
2880 dbgs() << "*** Begin Block: ";
2881 Block.printName(dbgs());
2882 ScoreBrackets.dump();
2883 });
2884
2885 // Track the correctness of vccz through this basic block. There are two
2886 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2887 // ST->partialVCCWritesUpdateVCCZ().
2888 bool VCCZCorrect = true;
2889 if (ST->hasReadVCCZBug()) {
2890 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2891 // to vcc and then issued an smem load.
2892 VCCZCorrect = false;
2893 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2894 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2895 // to vcc_lo or vcc_hi.
2896 VCCZCorrect = false;
2897 }
2898
2899 // Walk over the instructions.
2900 MachineInstr *OldWaitcntInstr = nullptr;
2901
2902 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2903 E = Block.instr_end();
2904 Iter != E;) {
2905 MachineInstr &Inst = *Iter;
2906 if (Inst.isMetaInstruction()) {
2907 ++Iter;
2908 continue;
2909 }
2910
2911 // Track pre-existing waitcnts that were added in earlier iterations or by
2912 // the memory legalizer.
2913 if (isWaitInstr(Inst) ||
2914 (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
2915 if (!OldWaitcntInstr)
2916 OldWaitcntInstr = &Inst;
2917 ++Iter;
2918 continue;
2919 }
2920
2921 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2922 isPreheaderToFlush(Block, ScoreBrackets);
2923
2924 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2925 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2926 FlushVmCnt);
2927 OldWaitcntInstr = nullptr;
2928
2929 // Restore vccz if it's not known to be correct already.
2930 bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst);
2931
2932 // Don't examine operands unless we need to track vccz correctness.
2933 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2934 if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2935 Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2936 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2937 if (!ST->partialVCCWritesUpdateVCCZ())
2938 VCCZCorrect = false;
2939 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
2940 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
2941 // vccz bit, so when we detect that an instruction may read from a
2942 // corrupt vccz bit, we need to:
2943 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2944 // operations to complete.
2945 // 2. Restore the correct value of vccz by writing the current value
2946 // of vcc back to vcc.
2947 if (ST->hasReadVCCZBug() &&
2948 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2949 // Writes to vcc while there's an outstanding smem read may get
2950 // clobbered as soon as any read completes.
2951 VCCZCorrect = false;
2952 } else {
2953 // Writes to vcc will fix any incorrect value in vccz.
2954 VCCZCorrect = true;
2955 }
2956 }
2957 }
2958
2959 if (TII->isSMRD(Inst)) {
2960 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2961 // No need to handle invariant loads when avoiding WAR conflicts, as
2962 // there cannot be a vector store to the same memory location.
2963 if (!Memop->isInvariant()) {
2964 const Value *Ptr = Memop->getValue();
2965 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2966 }
2967 }
2968 if (ST->hasReadVCCZBug()) {
2969 // This smem read could complete and clobber vccz at any time.
2970 VCCZCorrect = false;
2971 }
2972 }
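// The (pointer, block) pairs recorded above are consumed in
// generateWaitcntInstBefore(): a later store to the same address forces a
// wait on the SMEM counter to avoid a scalar-load/vector-store WAR hazard.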
2973
2974 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2975
2976 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
2977
2978 LLVM_DEBUG({
2979 Inst.print(dbgs());
2980 ScoreBrackets.dump();
2981 });
2982
2983 // TODO: Remove this work-around after fixing the scheduler and enable the
2984 // assert above.
2985 if (RestoreVCCZ) {
2986 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2987 // bit is updated, so we can restore the bit by reading the value of
2988 // vcc and then writing it back to the register.
2989 BuildMI(Block, Inst, Inst.getDebugLoc(),
2990 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2991 TRI->getVCC())
2992 .addReg(TRI->getVCC());
2993 VCCZCorrect = true;
2994 Modified = true;
2995 }
2996
2997 ++Iter;
2998 }
2999
3000 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
3001 // needed.
3002 AMDGPU::Waitcnt Wait;
3003 if (Block.getFirstTerminator() == Block.end() &&
3004 isPreheaderToFlush(Block, ScoreBrackets)) {
3005 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
3006 Wait.LoadCnt = 0;
3007 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
3008 Wait.SampleCnt = 0;
3009 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
3010 Wait.BvhCnt = 0;
3011 }
3012
3013 // Combine or remove any redundant waitcnts at the end of the block.
3014 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
3015 OldWaitcntInstr);
3016
3017 LLVM_DEBUG({
3018 dbgs() << "*** End Block: ";
3019 Block.printName(dbgs());
3020 ScoreBrackets.dump();
3021 });
3022
3023 return Modified;
3024}
3025
3026// Return true if the given machine basic block is a preheader of a loop in
3027// which we want to flush the vmcnt counter, and false otherwise.
3028bool SIInsertWaitcnts::isPreheaderToFlush(
3029 MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
3030 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
3031 if (!IsInserted)
3032 return Iterator->second;
3033
3034 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3035 if (!Succ)
3036 return false;
3037
3038 MachineLoop *Loop = MLI->getLoopFor(Succ);
3039 if (!Loop)
3040 return false;
3041
3042 if (Loop->getLoopPreheader() == &MBB &&
3043 shouldFlushVmCnt(Loop, ScoreBrackets)) {
3044 Iterator->second = true;
3045 return true;
3046 }
3047
3048 return false;
3049}
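// The result is memoized in PreheadersToFlush so each candidate preheader is
// analyzed at most once; insertWaitcntInBlock() queries it both at the first
// terminator and, for terminator-less blocks, at the block end.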
3050
3051bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
3052 if (SIInstrInfo::isFLAT(MI))
3053 return TII->mayAccessVMEMThroughFlat(MI);
3054 return SIInstrInfo::isVMEM(MI);
3055}
3056
3057// Return true if it is better to flush the vmcnt counter in the preheader of
3058// the given loop. We currently decide to flush in two situations:
3059// 1. The loop contains vmem store(s), no vmem load and at least one use of a
3060// vgpr containing a value that is loaded outside of the loop. (Only on
3061// targets with no vscnt counter).
3062// 2. The loop contains vmem load(s), but the loaded values are not used in the
3063// loop, and at least one use of a vgpr containing a value that is loaded
3064// outside of the loop.
3065bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
3066 const WaitcntBrackets &Brackets) {
3067 bool HasVMemLoad = false;
3068 bool HasVMemStore = false;
3069 bool UsesVgprLoadedOutside = false;
3070 DenseSet<MCRegUnit> VgprUse;
3071 DenseSet<MCRegUnit> VgprDef;
3072
3073 for (MachineBasicBlock *MBB : ML->blocks()) {
3074 for (MachineInstr &MI : *MBB) {
3075 if (isVMEMOrFlatVMEM(MI)) {
3076 HasVMemLoad |= MI.mayLoad();
3077 HasVMemStore |= MI.mayStore();
3078 }
3079
3080 for (const MachineOperand &Op : MI.all_uses()) {
3081 if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
3082 continue;
3083 // Vgpr use
3084 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
3085 // If we find a register that is loaded inside the loop, 1. and 2.
3086 // are invalidated and we can exit.
3087 if (VgprDef.contains(RU))
3088 return false;
3089 VgprUse.insert(RU);
3090 // If at least one of Op's registers is in the score brackets, the
3091 // value is likely loaded outside of the loop.
3092 VMEMID ID = toVMEMID(RU);
3093 if (Brackets.getVMemScore(ID, LOAD_CNT) >
3094 Brackets.getScoreLB(LOAD_CNT) ||
3095 Brackets.getVMemScore(ID, SAMPLE_CNT) >
3096 Brackets.getScoreLB(SAMPLE_CNT) ||
3097 Brackets.getVMemScore(ID, BVH_CNT) >
3098 Brackets.getScoreLB(BVH_CNT)) {
3099 UsesVgprLoadedOutside = true;
3100 break;
3101 }
3102 }
3103 }
3104
3105 // VMem load vgpr def
3106 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
3107 for (const MachineOperand &Op : MI.all_defs()) {
3108 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
3109 // If we find a register that is loaded inside the loop, 1. and 2.
3110 // are invalidated and we can exit.
3111 if (VgprUse.contains(RU))
3112 return false;
3113 VgprDef.insert(RU);
3114 }
3115 }
3116 }
3117 }
3118 }
3119 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
3120 return true;
3121 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
3122}
3123
3124bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3125 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3126 auto *PDT =
3127 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3128 AliasAnalysis *AA = nullptr;
3129 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3130 AA = &AAR->getAAResults();
3131
3132 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
3133}
3134
3135 PreservedAnalyses
3136 SIInsertWaitcntsPass::run(MachineFunction &MF,
3137 MachineFunctionAnalysisManager &MFAM) {
3138 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
3139 auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
3141 .getManager()
3142 .getCachedResult<AAManager>(MF.getFunction());
3143
3144 if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
3145 return PreservedAnalyses::all();
3146
3147  return getMachineFunctionPassPreservedAnalyses()
3148      .preserveSet<CFGAnalyses>()
3149      .preserve<AAManager>();
3150}
3151
3152bool SIInsertWaitcnts::run(MachineFunction &MF) {
3153 ST = &MF.getSubtarget<GCNSubtarget>();
3154 TII = ST->getInstrInfo();
3155 TRI = &TII->getRegisterInfo();
3156  MRI = &MF.getRegInfo();
3157  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3158
3159  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
3160
3161 // Initialize hardware limits first, as they're needed by the generators.
3162 Limits = AMDGPU::HardwareLimits(IV, ST->hasExtendedWaitCounts());
3163
3164 if (ST->hasExtendedWaitCounts()) {
3165 IsExpertMode = ST->hasExpertSchedulingMode() &&
3166                       (ExpertSchedulingModeFlag.getNumOccurrences()
3167                            ? ExpertSchedulingModeFlag
3168                            : MF.getFunction()
3169 .getFnAttribute("amdgpu-expert-scheduling-mode")
3170 .getValueAsBool());
3171 MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
3172 WCGGFX12Plus =
3173 WaitcntGeneratorGFX12Plus(MF, MaxCounter, &Limits, IsExpertMode);
3174 WCG = &WCGGFX12Plus;
3175 } else {
3176 MaxCounter = NUM_NORMAL_INST_CNTS;
3177 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, &Limits);
3178 WCG = &WCGPreGFX12;
3179 }
3180
3181 for (auto T : inst_counter_types())
3182 ForceEmitWaitcnt[T] = false;
3183
3184 WaitEventMaskForInst = WCG->getWaitEventMask();
3185
3186 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
3187
3188 BlockInfos.clear();
3189 bool Modified = false;
3190
3191 MachineBasicBlock &EntryBB = MF.front();
3192
3193 if (!MFI->isEntryFunction()) {
3194 // Wait for any outstanding memory operations that the input registers may
3195 // depend on. We can't track them and it's better to do the wait after the
3196 // costly call sequence.
3197
3198 // TODO: Could insert earlier and schedule more liberally with operations
3199 // that only use caller preserved registers.
3200    MachineBasicBlock::iterator I = EntryBB.begin();
3201    while (I != EntryBB.end() && I->isMetaInstruction())
3202 ++I;
3203
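    // On subtargets with separate (extended) wait counters, emit a combined
    // loadcnt/dscnt wait of zero followed by an explicit zero wait for each
    // remaining relevant counter; older subtargets cover vmcnt, expcnt and
    // lgkmcnt with a single legacy s_waitcnt 0.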
3204 if (ST->hasExtendedWaitCounts()) {
3205 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
3206 .addImm(0);
3207 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
3208 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
3209 continue;
3210
3211 if (!ST->hasImageInsts() &&
3212 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
3213 continue;
3214
3215 BuildMI(EntryBB, I, DebugLoc(),
3216 TII->get(instrsForExtendedCounterTypes[CT]))
3217 .addImm(0);
3218 }
3219 if (IsExpertMode) {
3220 unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, *ST);
3221        Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, 0, *ST);
3222        BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3223 .addImm(Enc);
3224 }
3225 } else {
3226 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
3227 }
3228
3229 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
3230 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3231 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3232
3233 Modified = true;
3234 }
3235
3236 // Keep iterating over the blocks in reverse post order, inserting and
3237 // updating s_waitcnt where needed, until a fix point is reached.
3238 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3239 BlockInfos.try_emplace(MBB);
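  // BlockInfos is a MapVector, so the iteration below visits blocks in the
  // reverse post order established here; comparing map iterators
  // (SuccBII <= BII) is then a cheap way to detect backedges.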
3240
3241 std::unique_ptr<WaitcntBrackets> Brackets;
3242 bool Repeat;
3243 do {
3244 Repeat = false;
3245
3246 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
3247 ++BII) {
3248 MachineBasicBlock *MBB = BII->first;
3249 BlockInfo &BI = BII->second;
3250 if (!BI.Dirty)
3251 continue;
3252
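      // Seed the bracket state for this block: start from the recorded
      // incoming state if there is one, otherwise from a default-constructed
      // (empty) state, reusing the existing allocation when possible.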
3253 if (BI.Incoming) {
3254 if (!Brackets)
3255 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3256 else
3257 *Brackets = *BI.Incoming;
3258 } else {
3259 if (!Brackets)
3260 Brackets = std::make_unique<WaitcntBrackets>(this);
3261 else
3262 *Brackets = WaitcntBrackets(this);
3263 }
3264
3265 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
3266 BI.Dirty = false;
3267
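      // Propagate the resulting state to the successors. A successor whose
      // incoming state is created or changed by the merge is marked dirty;
      // if it appears earlier in the traversal order, the update crossed a
      // backedge and another fixpoint iteration is needed.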
3268 if (Brackets->hasPendingEvent()) {
3269 BlockInfo *MoveBracketsToSucc = nullptr;
3270 for (MachineBasicBlock *Succ : MBB->successors()) {
3271 auto *SuccBII = BlockInfos.find(Succ);
3272 BlockInfo &SuccBI = SuccBII->second;
3273 if (!SuccBI.Incoming) {
3274 SuccBI.Dirty = true;
3275 if (SuccBII <= BII) {
3276 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
3277 Repeat = true;
3278 }
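          // Move (rather than copy) the brackets into the first successor
          // that has no incoming state yet; any further such successors
          // receive their own copy.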
3279 if (!MoveBracketsToSucc) {
3280 MoveBracketsToSucc = &SuccBI;
3281 } else {
3282 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3283 }
3284 } else if (SuccBI.Incoming->merge(*Brackets)) {
3285 SuccBI.Dirty = true;
3286 if (SuccBII <= BII) {
3287 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
3288 Repeat = true;
3289 }
3290 }
3291 }
3292 if (MoveBracketsToSucc)
3293 MoveBracketsToSucc->Incoming = std::move(Brackets);
3294 }
3295 }
3296 } while (Repeat);
3297
3298 if (ST->hasScalarStores()) {
3299 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3300 bool HaveScalarStores = false;
3301
3302 for (MachineBasicBlock &MBB : MF) {
3303 for (MachineInstr &MI : MBB) {
3304 if (!HaveScalarStores && TII->isScalarStore(MI))
3305 HaveScalarStores = true;
3306
3307 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
3308 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3309 EndPgmBlocks.push_back(&MBB);
3310 }
3311 }
3312
3313 if (HaveScalarStores) {
3314 // If scalar writes are used, the cache must be flushed or else the next
3315 // wave to reuse the same scratch memory can be clobbered.
3316 //
3317 // Insert s_dcache_wb at wave termination points if there were any scalar
3318 // stores, and only if the cache hasn't already been flushed. This could
3319 // be improved by looking across blocks for flushes in postdominating
3320 // blocks from the stores but an explicitly requested flush is probably
3321 // very rare.
3322 for (MachineBasicBlock *MBB : EndPgmBlocks) {
3323 bool SeenDCacheWB = false;
3324
3325 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
3326 I != E; ++I) {
3327 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
3328 SeenDCacheWB = true;
3329 else if (TII->isScalarStore(*I))
3330 SeenDCacheWB = false;
3331
3332 // FIXME: It would be better to insert this before a waitcnt if any.
3333 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
3334 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3335 !SeenDCacheWB) {
3336 Modified = true;
3337 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
3338 }
3339 }
3340 }
3341 }
3342 }
3343
3344 if (IsExpertMode) {
3345 // Enable expert scheduling on function entry. To satisfy ABI requirements
3346    // and to allow calls between functions with different expert scheduling
3347 // settings, disable it around calls and before returns.
3348
3349    MachineBasicBlock::iterator I = EntryBB.begin();
3350    while (I != EntryBB.end() && I->isMetaInstruction())
3351 ++I;
3352 setSchedulingMode(EntryBB, I, true);
3353
3354 for (MachineInstr *MI : CallInsts) {
3355 MachineBasicBlock &MBB = *MI->getParent();
3356 setSchedulingMode(MBB, MI, false);
3357 setSchedulingMode(MBB, std::next(MI->getIterator()), true);
3358 }
3359
3360 for (MachineInstr *MI : ReturnInsts)
3361 setSchedulingMode(*MI->getParent(), MI, false);
3362
3363 Modified = true;
3364 }
3365
3366 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
3367 // This is done in different ways depending on how the VGPRs were allocated
3368 // (i.e. whether we're in dynamic VGPR mode or not).
3369 // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
3370 // waveslot limited kernel runs slower with the deallocation.
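  // In dynamic VGPR mode the VGPRs are released by requesting an allocation
  // of zero VGPR blocks (S_ALLOC_VGPR 0); otherwise a DEALLOC_VGPRS message
  // is sent, preceded by an S_NOP on subtargets that require one.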
3371 if (MFI->isDynamicVGPREnabled()) {
3372 for (MachineInstr *MI : ReleaseVGPRInsts) {
3373 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3374 TII->get(AMDGPU::S_ALLOC_VGPR))
3375 .addImm(0);
3376 Modified = true;
3377 }
3378 } else {
3379 if (!ReleaseVGPRInsts.empty() &&
3380 (MF.getFrameInfo().hasCalls() ||
3381 ST->getOccupancyWithNumVGPRs(
3382 TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
3383 /*IsDynamicVGPR=*/false) <
3384             AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
3385      for (MachineInstr *MI : ReleaseVGPRInsts) {
3386 if (ST->requiresNopBeforeDeallocVGPRs()) {
3387 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3388 TII->get(AMDGPU::S_NOP))
3389 .addImm(0);
3390 }
3391 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3392 TII->get(AMDGPU::S_SENDMSG))
3393            .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
3394        Modified = true;
3395 }
3396 }
3397 }
3398
3399 CallInsts.clear();
3400 ReturnInsts.clear();
3401 ReleaseVGPRInsts.clear();
3402 PreheadersToFlush.clear();
3403 SLoadAddresses.clear();
3404
3405 return Modified;
3406}