LLVM 23.0.0git
SIMemoryLegalizer.cpp
Go to the documentation of this file.
//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//
15
16#include "AMDGPU.h"
18#include "GCNSubtarget.h"
27#include "llvm/IR/PassManager.h"
30#include "llvm/Support/Debug.h"
32
33using namespace llvm;
34using namespace llvm::AMDGPU;
35
36#define DEBUG_TYPE "si-memory-legalizer"
37#define PASS_NAME "SI Memory Legalizer"
38
40 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
41 cl::desc("Use this to skip inserting cache invalidating instructions."));
42
43namespace {
44
46
47/// Memory operation flags. Can be ORed together.
48enum class SIMemOp {
49 NONE = 0u,
50 LOAD = 1u << 0,
51 STORE = 1u << 1,
52 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
53};
54
/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};
61
/// The atomic synchronization scopes supported by the AMDGPU target.
///
/// The enumerators are declared from narrowest to widest scope; code in this
/// file compares and clamps scopes with std::min, so the declaration order is
/// significant.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
  AGENT,
  SYSTEM
};
72
73/// The distinct address spaces supported by the AMDGPU target for
74/// atomic memory operation. Can be ORed together.
75enum class SIAtomicAddrSpace {
76 NONE = 0u,
77 GLOBAL = 1u << 0,
78 LDS = 1u << 1,
79 SCRATCH = 1u << 2,
80 GDS = 1u << 3,
81 OTHER = 1u << 4,
82
83 /// The address spaces that can be accessed by a FLAT instruction.
84 FLAT = GLOBAL | LDS | SCRATCH,
85
86 /// The address spaces that support atomic instructions.
87 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
88
89 /// All address spaces.
90 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
91
92 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
93};
94
95#ifndef NDEBUG
96static StringRef toString(SIAtomicScope S) {
97 switch (S) {
98 case SIAtomicScope::NONE:
99 return "none";
100 case SIAtomicScope::SINGLETHREAD:
101 return "singlethread";
102 case SIAtomicScope::WAVEFRONT:
103 return "wavefront";
104 case SIAtomicScope::WORKGROUP:
105 return "workgroup";
106 case SIAtomicScope::CLUSTER:
107 return "cluster";
108 case SIAtomicScope::AGENT:
109 return "agent";
110 case SIAtomicScope::SYSTEM:
111 return "system";
112 }
113 llvm_unreachable("unknown atomic scope");
114}
115
116static raw_ostream &operator<<(raw_ostream &OS, SIAtomicAddrSpace AS) {
117 if (AS == SIAtomicAddrSpace::NONE) {
118 OS << "none";
119 return OS;
120 }
121 ListSeparator LS("|");
122 if ((AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE)
123 OS << LS << "global";
124 if ((AS & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE)
125 OS << LS << "lds";
126 if ((AS & SIAtomicAddrSpace::SCRATCH) != SIAtomicAddrSpace::NONE)
127 OS << LS << "scratch";
128 if ((AS & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE)
129 OS << LS << "gds";
130 if ((AS & SIAtomicAddrSpace::OTHER) != SIAtomicAddrSpace::NONE)
131 OS << LS << "other";
132 return OS;
133}
134#endif
135
136class SIMemOpInfo final {
137private:
138
139 friend class SIMemOpAccess;
140
141 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
142 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
143 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
144 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
145 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
146 bool IsCrossAddressSpaceOrdering = false;
147 bool IsVolatile = false;
148 bool IsNonTemporal = false;
149 bool IsLastUse = false;
150 bool IsCooperative = false;
151 bool IsAVNone = false;
152
153 // TODO: Should we assume Cooperative=true if no MMO is present?
154 SIMemOpInfo(
155 const GCNSubtarget &ST,
156 AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
157 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
158 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
159 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
160 bool IsCrossAddressSpaceOrdering = true,
161 AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
162 bool IsVolatile = false, bool IsNonTemporal = false,
163 bool IsLastUse = false, bool IsCooperative = false,
164 bool CanDemoteWorkgroupToWavefront = false, bool IsAVNone = false)
165 : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
166 OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
167 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
168 IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
169 IsLastUse(IsLastUse), IsCooperative(IsCooperative), IsAVNone(IsAVNone) {
170
171 if (Ordering == AtomicOrdering::NotAtomic) {
172 assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
173 assert(Scope == SIAtomicScope::NONE &&
174 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
175 !IsCrossAddressSpaceOrdering &&
176 FailureOrdering == AtomicOrdering::NotAtomic);
177 return;
178 }
179
180 assert(Scope != SIAtomicScope::NONE &&
181 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
182 SIAtomicAddrSpace::NONE &&
183 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
184 SIAtomicAddrSpace::NONE);
185
186 // There is also no cross address space ordering if the ordering
187 // address space is the same as the instruction address space and
188 // only contains a single address space.
189 if ((OrderingAddrSpace == InstrAddrSpace) &&
190 isPowerOf2_32(uint32_t(InstrAddrSpace)))
191 this->IsCrossAddressSpaceOrdering = false;
192
193 // Limit the scope to the maximum supported by the instruction's address
194 // spaces.
195 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
196 SIAtomicAddrSpace::NONE) {
197 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
198 } else if ((InstrAddrSpace &
199 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
200 SIAtomicAddrSpace::NONE) {
201 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
202 } else if ((InstrAddrSpace &
203 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
204 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
205 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
206 }
207
208 // On targets that have no concept of a workgroup cluster, use
209 // AGENT scope as a conservatively correct alternative.
210 if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
211 this->Scope = SIAtomicScope::AGENT;
212
213 // When max flat work-group size is at most the wavefront size, the
214 // work-group fits in a single wave, so LLVM workgroup scope matches
215 // wavefront scope. Demote workgroup → wavefront here for fences and for
216 // atomics with ordering stronger than monotonic.
217 if (CanDemoteWorkgroupToWavefront &&
218 this->Scope == SIAtomicScope::WORKGROUP &&
219 (llvm::isStrongerThan(this->Ordering, AtomicOrdering::Monotonic) ||
220 llvm::isStrongerThan(this->FailureOrdering,
221 AtomicOrdering::Monotonic)))
222 this->Scope = SIAtomicScope::WAVEFRONT;
223 }
224
225public:
226 /// \returns Atomic synchronization scope of the machine instruction used to
227 /// create this SIMemOpInfo.
228 SIAtomicScope getScope() const {
229 return Scope;
230 }
231
232 /// \returns Ordering constraint of the machine instruction used to
233 /// create this SIMemOpInfo.
234 AtomicOrdering getOrdering() const {
235 return Ordering;
236 }
237
238 /// \returns Failure ordering constraint of the machine instruction used to
239 /// create this SIMemOpInfo.
240 AtomicOrdering getFailureOrdering() const {
241 return FailureOrdering;
242 }
243
244 /// \returns The address spaces be accessed by the machine
245 /// instruction used to create this SIMemOpInfo.
246 SIAtomicAddrSpace getInstrAddrSpace() const {
247 return InstrAddrSpace;
248 }
249
250 /// \returns The address spaces that must be ordered by the machine
251 /// instruction used to create this SIMemOpInfo.
252 SIAtomicAddrSpace getOrderingAddrSpace() const {
253 return OrderingAddrSpace;
254 }
255
256 /// \returns Return true iff memory ordering of operations on
257 /// different address spaces is required.
258 bool getIsCrossAddressSpaceOrdering() const {
259 return IsCrossAddressSpaceOrdering;
260 }
261
262 /// \returns True if memory access of the machine instruction used to
263 /// create this SIMemOpInfo is volatile, false otherwise.
264 bool isVolatile() const {
265 return IsVolatile;
266 }
267
268 /// \returns True if memory access of the machine instruction used to
269 /// create this SIMemOpInfo is nontemporal, false otherwise.
270 bool isNonTemporal() const {
271 return IsNonTemporal;
272 }
273
274 /// \returns True if memory access of the machine instruction used to
275 /// create this SIMemOpInfo is last use, false otherwise.
276 bool isLastUse() const { return IsLastUse; }
277
278 /// \returns True if this is a cooperative load or store atomic.
279 bool isCooperative() const { return IsCooperative; }
280
281 /// \returns True if MakeAvailable/MakeVisible should be suppressed.
282 bool isAVNone() const { return IsAVNone; }
283
284 /// \returns True if ordering constraint of the machine instruction used to
285 /// create this SIMemOpInfo is unordered or higher, false otherwise.
286 bool isAtomic() const {
287 return Ordering != AtomicOrdering::NotAtomic;
288 }
289
290};
291
292class SIMemOpAccess final {
293private:
294 const AMDGPUMachineModuleInfo *MMI = nullptr;
295 const GCNSubtarget &ST;
296 const bool CanDemoteWorkgroupToWavefront;
297
298 /// Reports unsupported message \p Msg for \p MI to LLVM context.
299 void reportUnsupported(const MachineBasicBlock::iterator &MI,
300 const char *Msg) const;
301
302 /// Inspects the target synchronization scope \p SSID and determines
303 /// the SI atomic scope it corresponds to, the address spaces it
304 /// covers, and whether the memory ordering applies between address
305 /// spaces.
306 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
307 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
308
309 /// \return Return a bit set of the address spaces accessed by \p AS.
310 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
311
312 /// \returns Info constructed from \p MI, which has at least machine memory
313 /// operand.
314 std::optional<SIMemOpInfo>
315 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
316
317public:
318 /// Construct class to support accessing the machine memory operands
319 /// of instructions in the machine function \p MF.
320 SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST,
321 const Function &F);
322
323 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
324 std::optional<SIMemOpInfo>
326
327 /// \returns Store info if \p MI is a store operation, "std::nullopt"
328 /// otherwise.
329 std::optional<SIMemOpInfo>
330 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
331
332 /// \returns Atomic fence info if \p MI is an atomic fence operation,
333 /// "std::nullopt" otherwise.
334 std::optional<SIMemOpInfo>
335 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
336
337 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
338 /// rmw operation, "std::nullopt" otherwise.
339 std::optional<SIMemOpInfo>
340 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
341
342 /// \returns DMA to LDS info if \p MI is as a direct-to/from-LDS load/store,
343 /// along with an indication of whether this is a load or store. If it is not
344 /// a direct-to-LDS operation, returns std::nullopt.
345 std::optional<SIMemOpInfo>
346 getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
347};
348
349class SICacheControl {
350protected:
351
352 /// AMDGPU subtarget info.
353 const GCNSubtarget &ST;
354
355 /// Instruction info.
356 const SIInstrInfo *TII = nullptr;
357
358 IsaVersion IV;
359
360 /// Whether to insert cache invalidating instructions.
361 bool InsertCacheInv;
362
363 SICacheControl(const GCNSubtarget &ST);
364
365 /// Sets CPol \p Bits to "true" if present in instruction \p MI.
366 /// \returns Returns true if \p MI is modified, false otherwise.
367 bool enableCPolBits(const MachineBasicBlock::iterator MI,
368 unsigned Bits) const;
369
370 /// Check if any atomic operation on AS can affect memory accessible via the
371 /// global address space.
372 bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
373
374public:
375 using CPol = AMDGPU::CPol::CPol;
376
377 /// Create a cache control for the subtarget \p ST.
378 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
379
380 /// Update \p MI memory load instruction to bypass any caches up to
381 /// the \p Scope memory scope for address spaces \p
382 /// AddrSpace. Return true iff the instruction was modified.
383 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
384 SIAtomicScope Scope,
385 SIAtomicAddrSpace AddrSpace) const = 0;
386
387 /// Update \p MI memory store instruction to bypass any caches up to
388 /// the \p Scope memory scope for address spaces \p
389 /// AddrSpace. Return true iff the instruction was modified.
390 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
391 SIAtomicScope Scope,
392 SIAtomicAddrSpace AddrSpace) const = 0;
393
394 /// Update \p MI memory read-modify-write instruction to bypass any caches up
395 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
396 /// iff the instruction was modified.
397 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
398 SIAtomicScope Scope,
399 SIAtomicAddrSpace AddrSpace) const = 0;
400
401 /// Update \p MI memory instruction of kind \p Op associated with address
402 /// spaces \p AddrSpace to indicate it is volatile and/or
403 /// nontemporal/last-use. Return true iff the instruction was modified.
404 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
405 SIAtomicAddrSpace AddrSpace,
406 SIMemOp Op, bool IsVolatile,
407 bool IsNonTemporal,
408 bool IsLastUse = false) const = 0;
409
410 /// Add final touches to a `mayStore` instruction \p MI, which may be a
411 /// Store or RMW instruction.
412 /// FIXME: This takes a MI because iterators aren't handled properly. When
413 /// this is called, they often point to entirely different insts. Thus we back
414 /// up the inst early and pass it here instead.
415 virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
416 return false;
417 };
418
419 /// Add final touches to a `mayLoad` instruction \p MI.
420 virtual bool finalizeLoad(MachineBasicBlock::iterator &MI) const {
421 return false;
422 }
423
424 /// Handle cooperative load/store atomics.
425 virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
427 "cooperative atomics are not available on this architecture");
428 }
429
430 /// Inserts any necessary instructions at position \p Pos relative
431 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
432 /// \p Op associated with address spaces \p AddrSpace have completed. Used
433 /// between memory instructions to enforce the order they become visible as
434 /// observed by other memory instructions executing in memory scope \p Scope.
435 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
436 /// address spaces. If \p AtomicsOnly is true, only insert waits for counters
437 /// that are used by atomic instructions.
438 /// Returns true iff any instructions inserted.
439 virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
440 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
441 bool IsCrossAddrSpaceOrdering, Position Pos,
442 AtomicOrdering Order, bool AtomicsOnly) const = 0;
443
444 /// Inserts any necessary instructions at position \p Pos relative to
445 /// instruction \p MI to ensure any subsequent memory instructions of this
446 /// thread with address spaces \p AddrSpace will observe the previous memory
447 /// operations by any thread for memory scopes up to memory scope \p Scope .
448 /// Returns true iff any instructions inserted.
449 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
450 SIAtomicScope Scope,
451 SIAtomicAddrSpace AddrSpace,
452 Position Pos) const = 0;
453
454 /// Inserts any necessary writeback instructions at position \p Pos relative
455 /// to instruction \p MI to make previous memory operations by this thread
456 /// with address spaces \p AddrSpace available to other threads in memory
457 /// scope \p Scope. Does not insert waits; callers must call insertWait
458 /// separately. Returns true iff any instructions inserted.
459 virtual bool insertWriteback(MachineBasicBlock::iterator &MI,
460 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace,
461 Position Pos) const = 0;
462
463 /// Inserts writeback (unless \p IsAVNone) followed by an unconditional wait.
464 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
465 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
466 Position Pos, bool IsAVNone) const {
467 bool Changed = !IsAVNone && insertWriteback(MI, Scope, AddrSpace, Pos);
468 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
469 IsCrossAddrSpaceOrdering, Pos,
470 AtomicOrdering::Release, /*AtomicsOnly=*/false);
471 return Changed;
472 }
473
474 /// Handle operations that are considered non-volatile.
475 /// See \ref isNonVolatileMemoryAccess
476 virtual bool handleNonVolatile(MachineInstr &MI) const { return false; }
477
478 /// Virtual destructor to allow derivations to be deleted.
479 virtual ~SICacheControl() = default;
480};
481
482/// Generates code sequences for the memory model of all GFX targets below
483/// GFX10.
484class SIGfx6CacheControl final : public SICacheControl {
485public:
486
487 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
488
489 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
490 SIAtomicScope Scope,
491 SIAtomicAddrSpace AddrSpace) const override;
492
493 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
494 SIAtomicScope Scope,
495 SIAtomicAddrSpace AddrSpace) const override;
496
497 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
498 SIAtomicScope Scope,
499 SIAtomicAddrSpace AddrSpace) const override;
500
501 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
502 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
503 bool IsVolatile, bool IsNonTemporal,
504 bool IsLastUse) const override;
505
506 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
507 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
508 bool IsCrossAddrSpaceOrdering, Position Pos,
509 AtomicOrdering Order, bool AtomicsOnly) const override;
510
511 bool insertAcquire(MachineBasicBlock::iterator &MI,
512 SIAtomicScope Scope,
513 SIAtomicAddrSpace AddrSpace,
514 Position Pos) const override;
515
516 bool insertWriteback(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
517 SIAtomicAddrSpace AddrSpace,
518 Position Pos) const override;
519};
520
521/// Generates code sequences for the memory model of GFX10/11.
522class SIGfx10CacheControl final : public SICacheControl {
523public:
524 SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
525
526 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
527 SIAtomicScope Scope,
528 SIAtomicAddrSpace AddrSpace) const override;
529
530 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
531 SIAtomicScope Scope,
532 SIAtomicAddrSpace AddrSpace) const override {
533 return false;
534 }
535
536 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
537 SIAtomicScope Scope,
538 SIAtomicAddrSpace AddrSpace) const override {
539 return false;
540 }
541
542 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
543 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
544 bool IsVolatile, bool IsNonTemporal,
545 bool IsLastUse) const override;
546
547 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
548 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
549 bool IsCrossAddrSpaceOrdering, Position Pos,
550 AtomicOrdering Order, bool AtomicsOnly) const override;
551
552 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
553 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
554
555 bool insertWriteback(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
556 SIAtomicAddrSpace AddrSpace,
557 Position Pos) const override {
558 return false;
559 }
560};
561
562class SIGfx12CacheControl final : public SICacheControl {
563protected:
564 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
565 // \returns Returns true if \p MI is modified, false otherwise.
566 bool setTH(const MachineBasicBlock::iterator MI,
568
569 // Sets Scope policy to \p Value if CPol operand is present in instruction \p
570 // MI. \returns Returns true if \p MI is modified, false otherwise.
571 bool setScope(const MachineBasicBlock::iterator MI,
573
574 // Stores with system scope (SCOPE_SYS) need to wait for:
575 // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
576 // - non-returning-atomics - wait for STORECNT==0
577 // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
578 // since it does not distinguish atomics-with-return from regular stores.
579 // There is no need to wait if memory is cached (mtype != UC).
580 bool
581 insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
582
583 bool setAtomicScope(const MachineBasicBlock::iterator &MI,
584 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
585
586public:
587 SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {
588 // GFX120x and GFX125x memory models greatly overlap, and in some cases
589 // the behavior is the same if assuming GFX120x in CU mode.
590 assert(!ST.hasGFX1250Insts() || ST.hasGFX13Insts() || ST.isCuModeEnabled());
591 }
592
593 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
594 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
595 bool IsCrossAddrSpaceOrdering, Position Pos,
596 AtomicOrdering Order, bool AtomicsOnly) const override;
597
598 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
599 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
600
601 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
602 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
603 bool IsVolatile, bool IsNonTemporal,
604 bool IsLastUse) const override;
605
606 bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
607
608 bool finalizeLoad(MachineBasicBlock::iterator &MI) const override;
609
610 bool handleCooperativeAtomic(MachineInstr &MI) const override;
611
612 bool insertWriteback(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
613 SIAtomicAddrSpace AddrSpace,
614 Position Pos) const override;
615
616 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
617 SIAtomicScope Scope,
618 SIAtomicAddrSpace AddrSpace) const override {
619 return setAtomicScope(MI, Scope, AddrSpace);
620 }
621
622 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
623 SIAtomicScope Scope,
624 SIAtomicAddrSpace AddrSpace) const override {
625 return setAtomicScope(MI, Scope, AddrSpace);
626 }
627
628 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
629 SIAtomicScope Scope,
630 SIAtomicAddrSpace AddrSpace) const override {
631 return setAtomicScope(MI, Scope, AddrSpace);
632 }
633
634 bool handleNonVolatile(MachineInstr &MI) const override;
635};
636
637class SIMemoryLegalizer final {
638private:
639 const MachineModuleInfo &MMI;
640 /// Cache Control.
641 std::unique_ptr<SICacheControl> CC = nullptr;
642
643 /// List of atomic pseudo instructions.
644 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
645
646 /// Return true iff instruction \p MI is a atomic instruction that
647 /// returns a result.
648 bool isAtomicRet(const MachineInstr &MI) const {
650 }
651
652 /// Removes all processed atomic pseudo instructions from the current
653 /// function. Returns true if current function is modified, false otherwise.
654 bool removeAtomicPseudoMIs();
655
656 /// Expands load operation \p MI. Returns true if instructions are
657 /// added/deleted or \p MI is modified, false otherwise.
658 bool expandLoad(const SIMemOpInfo &MOI,
660 /// Expands store operation \p MI. Returns true if instructions are
661 /// added/deleted or \p MI is modified, false otherwise.
662 bool expandStore(const SIMemOpInfo &MOI,
664 /// Expands atomic fence operation \p MI. Returns true if
665 /// instructions are added/deleted or \p MI is modified, false otherwise.
666 bool expandAtomicFence(const SIMemOpInfo &MOI,
668 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
669 /// instructions are added/deleted or \p MI is modified, false otherwise.
670 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
672 /// Expands LDS DMA operation \p MI. Returns true if instructions are
673 /// added/deleted or \p MI is modified, false otherwise.
674 bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);
675
676public:
677 SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
678 bool run(MachineFunction &MF);
679};
680
681class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
682public:
683 static char ID;
684
685 SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {}
686
687 void getAnalysisUsage(AnalysisUsage &AU) const override {
688 AU.setPreservesCFG();
690 }
691
692 StringRef getPassName() const override {
693 return PASS_NAME;
694 }
695
696 bool runOnMachineFunction(MachineFunction &MF) override;
697};
698
699static const StringMap<SIAtomicAddrSpace> ASNames = {{
700 {"global", SIAtomicAddrSpace::GLOBAL},
701 {"local", SIAtomicAddrSpace::LDS},
702}};
703
704void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
705 const MachineFunction *MF = MI.getMF();
706 const Function &Fn = MF->getFunction();
708 raw_svector_ostream OS(Str);
709 OS << "unknown address space '" << AS << "'; expected one of ";
711 for (const auto &[Name, Val] : ASNames)
712 OS << LS << '\'' << Name << '\'';
713 Fn.getContext().diagnose(
714 DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
715}
716
717/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
718/// If this tag isn't present, or if it has no meaningful values, returns
719/// \p none, otherwise returns the address spaces specified by the MD.
720static std::optional<SIAtomicAddrSpace>
721getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
722 static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
723
724 auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
725 if (!MMRA)
726 return std::nullopt;
727
728 SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
729 for (const auto &[Prefix, Suffix] : MMRA) {
730 if (Prefix != FenceASPrefix)
731 continue;
732
733 if (auto It = ASNames.find(Suffix); It != ASNames.end())
734 Result |= It->second;
735 else
736 diagnoseUnknownMMRAASName(MI, Suffix);
737 }
738
739 if (Result == SIAtomicAddrSpace::NONE)
740 return std::nullopt;
741
742 return Result;
743}
744
745static void diagnoseUnknownAVMetadata(const MachineInstr &MI,
746 StringRef Suffix) {
747 const MachineFunction *MF = MI.getMF();
748 const Function &Fn = MF->getFunction();
750 Fn, Twine("unknown amdgcn-av metadata '") + Suffix + Twine('\''),
751 MI.getDebugLoc(), DS_Warning));
752}
753
754static bool hasAVNoneMMRA(const MachineInstr &MI) {
755 MMRAMetadata MMRA(MI.getMMRAMetadata());
756 if (!MMRA)
757 return false;
758 bool TagFound = false;
759 for (const auto &[Prefix, Suffix] : MMRA) {
760 if (Prefix != "amdgcn-av")
761 continue;
762 if (Suffix == "none")
763 TagFound = true;
764 else
765 diagnoseUnknownAVMetadata(MI, Suffix);
766 }
767 return TagFound;
768}
769
770} // end anonymous namespace
771
772void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
773 const char *Msg) const {
774 const Function &Func = MI->getMF()->getFunction();
775 Func.getContext().diagnose(
776 DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
777}
778
779std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
780SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
781 SIAtomicAddrSpace InstrAddrSpace) const {
782 if (SSID == SyncScope::System)
783 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
784 if (SSID == MMI->getAgentSSID())
785 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
786 if (SSID == MMI->getClusterSSID())
787 return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
788 if (SSID == MMI->getWorkgroupSSID())
789 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
790 true);
791 if (SSID == MMI->getWavefrontSSID())
792 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
793 true);
794 if (SSID == SyncScope::SingleThread)
795 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
796 true);
797 if (SSID == MMI->getSystemOneAddressSpaceSSID())
798 return std::tuple(SIAtomicScope::SYSTEM,
799 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
800 if (SSID == MMI->getAgentOneAddressSpaceSSID())
801 return std::tuple(SIAtomicScope::AGENT,
802 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
803 if (SSID == MMI->getClusterOneAddressSpaceSSID())
804 return std::tuple(SIAtomicScope::CLUSTER,
805 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
806 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
807 return std::tuple(SIAtomicScope::WORKGROUP,
808 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
809 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
810 return std::tuple(SIAtomicScope::WAVEFRONT,
811 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
812 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
813 return std::tuple(SIAtomicScope::SINGLETHREAD,
814 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
815 return std::nullopt;
816}
817
818SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
819 if (AS == AMDGPUAS::FLAT_ADDRESS)
820 return SIAtomicAddrSpace::FLAT;
821 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
822 return SIAtomicAddrSpace::GLOBAL;
823 if (AS == AMDGPUAS::LOCAL_ADDRESS)
824 return SIAtomicAddrSpace::LDS;
826 return SIAtomicAddrSpace::SCRATCH;
827 if (AS == AMDGPUAS::REGION_ADDRESS)
828 return SIAtomicAddrSpace::GDS;
831 return SIAtomicAddrSpace::GLOBAL;
832
833 return SIAtomicAddrSpace::OTHER;
834}
835
836// TODO: Consider moving single-wave workgroup->wavefront scope relaxation to an
837// IR pass (and extending it to other scoped operations), so middle-end
838// optimizations see wavefront scope earlier.
839SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
840 const GCNSubtarget &ST, const Function &F)
841 : MMI(&MMI_), ST(ST),
842 CanDemoteWorkgroupToWavefront(ST.isSingleWavefrontWorkgroup(F)) {}
843
844std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
845 const MachineBasicBlock::iterator &MI) const {
846 assert(MI->getNumMemOperands() > 0);
847
849 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
850 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
851 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
852 bool IsNonTemporal = true;
853 bool IsVolatile = false;
854 bool IsLastUse = false;
855 bool IsCooperative = false;
856
857 // Validator should check whether or not MMOs cover the entire set of
858 // locations accessed by the memory instruction.
859 for (const auto &MMO : MI->memoperands()) {
860 IsNonTemporal &= MMO->isNonTemporal();
861 IsVolatile |= MMO->isVolatile();
862 IsLastUse |= MMO->getFlags() & MOLastUse;
863 IsCooperative |= MMO->getFlags() & MOCooperative;
864 InstrAddrSpace |=
865 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
866 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
867 if (OpOrdering != AtomicOrdering::NotAtomic) {
868 const auto &IsSyncScopeInclusion =
869 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
870 if (!IsSyncScopeInclusion) {
871 reportUnsupported(MI,
872 "Unsupported non-inclusive atomic synchronization scope");
873 return std::nullopt;
874 }
875
876 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
877 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
878 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
879 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
880 FailureOrdering =
881 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
882 }
883 }
884
885 // FIXME: The MMO of buffer atomic instructions does not always have an atomic
886 // ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it
887 // here, but the lowering should really be cleaned up at some point.
888 if ((ST.getGeneration() >= GCNSubtarget::GFX12) && SIInstrInfo::isBUF(*MI) &&
889 SIInstrInfo::isAtomic(*MI) && Ordering == AtomicOrdering::NotAtomic)
890 Ordering = AtomicOrdering::Monotonic;
891
892 SIAtomicScope Scope = SIAtomicScope::NONE;
893 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
894 bool IsCrossAddressSpaceOrdering = false;
895 if (Ordering != AtomicOrdering::NotAtomic) {
896 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
897 if (!ScopeOrNone) {
898 reportUnsupported(MI, "Unsupported atomic synchronization scope");
899 return std::nullopt;
900 }
901 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
902 *ScopeOrNone;
903 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
904 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
905 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
906 reportUnsupported(MI, "Unsupported atomic address space");
907 return std::nullopt;
908 }
909 }
910 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
911 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
912 IsNonTemporal, IsLastUse, IsCooperative,
913 CanDemoteWorkgroupToWavefront, hasAVNoneMMRA(*MI));
914}
915
916std::optional<SIMemOpInfo>
917SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
918 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
919
920 if (!(MI->mayLoad() && !MI->mayStore()))
921 return std::nullopt;
922
923 // Be conservative if there are no memory operands.
924 if (MI->getNumMemOperands() == 0)
925 return SIMemOpInfo(ST);
926
927 return constructFromMIWithMMO(MI);
928}
929
930std::optional<SIMemOpInfo>
931SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
932 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
933
934 if (!(!MI->mayLoad() && MI->mayStore()))
935 return std::nullopt;
936
937 // Be conservative if there are no memory operands.
938 if (MI->getNumMemOperands() == 0)
939 return SIMemOpInfo(ST);
940
941 return constructFromMIWithMMO(MI);
942}
943
944std::optional<SIMemOpInfo>
945SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
946 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
947
948 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
949 return std::nullopt;
950
952 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
953
954 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
955 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
956 if (!ScopeOrNone) {
957 reportUnsupported(MI, "Unsupported atomic synchronization scope");
958 return std::nullopt;
959 }
960
961 SIAtomicScope Scope = SIAtomicScope::NONE;
962 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
963 bool IsCrossAddressSpaceOrdering = false;
964 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
965 *ScopeOrNone;
966
967 if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
968 // We currently expect refineOrderingAS to be the only place that
969 // can refine the AS ordered by the fence.
970 // If that changes, we need to review the semantics of that function
971 // in case it needs to preserve certain address spaces.
972 reportUnsupported(MI, "Unsupported atomic address space");
973 return std::nullopt;
974 }
975
976 auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI);
977 if (SynchronizeAS)
978 OrderingAddrSpace = *SynchronizeAS;
979
980 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
981 SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
982 AtomicOrdering::NotAtomic, false, false, false, false,
983 CanDemoteWorkgroupToWavefront, hasAVNoneMMRA(*MI));
984}
985
986std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
987 const MachineBasicBlock::iterator &MI) const {
988 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
989
990 if (!(MI->mayLoad() && MI->mayStore()))
991 return std::nullopt;
992
993 // Be conservative if there are no memory operands.
994 if (MI->getNumMemOperands() == 0)
995 return SIMemOpInfo(ST);
996
997 return constructFromMIWithMMO(MI);
998}
999
1000std::optional<SIMemOpInfo>
1001SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
1002 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
1003
1005 return std::nullopt;
1006
1007 return constructFromMIWithMMO(MI);
1008}
1009
1010/// \returns true if \p MI has one or more MMO, and all of them are fit for
1011/// being marked as non-volatile. This means that either they are accessing the
1012/// constant address space, are accessing a known invariant memory location, or
1013/// that they are marked with the non-volatile metadata/MMO flag.
1015 if (MI.getNumMemOperands() == 0)
1016 return false;
1017 return all_of(MI.memoperands(), [&](const MachineMemOperand *MMO) {
1018 return MMO->getFlags() & (MOThreadPrivate | MachineMemOperand::MOInvariant);
1019 });
1020}
1021
1022SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
1023 TII = ST.getInstrInfo();
1024 IV = getIsaVersion(ST.getCPU());
1025 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
1026}
1027
1028bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI,
1029 unsigned Bits) const {
1030 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
1031 if (!CPol)
1032 return false;
1033
1034 CPol->setImm(CPol->getImm() | Bits);
1035 return true;
1036}
1037
1038bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
1039 assert((!ST.hasGloballyAddressableScratch() ||
1040 (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE ||
1041 (AS & SIAtomicAddrSpace::SCRATCH) == SIAtomicAddrSpace::NONE) &&
1042 "scratch instructions should already be replaced by flat "
1043 "instructions if GloballyAddressableScratch is enabled");
1044 return (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;
1045}
1046
1047/* static */
1048std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
1049 GCNSubtarget::Generation Generation = ST.getGeneration();
1050 if (Generation < AMDGPUSubtarget::GFX10)
1051 return std::make_unique<SIGfx6CacheControl>(ST);
1052 if (Generation < AMDGPUSubtarget::GFX12)
1053 return std::make_unique<SIGfx10CacheControl>(ST);
1054 return std::make_unique<SIGfx12CacheControl>(ST);
1055}
1056
1057bool SIGfx6CacheControl::enableLoadCacheBypass(
1059 SIAtomicScope Scope,
1060 SIAtomicAddrSpace AddrSpace) const {
1061 assert(MI->mayLoad() && !MI->mayStore());
1062
1063 if (!canAffectGlobalAddrSpace(AddrSpace)) {
1064 /// The scratch address space does not need the global memory caches
1065 /// to be bypassed as all memory operations by the same thread are
1066 /// sequentially consistent, and no other thread can access scratch
1067 /// memory.
1068
1069 /// Other address spaces do not have a cache.
1070 return false;
1071 }
1072
1073 bool Changed = false;
1074 switch (Scope) {
1075 case SIAtomicScope::SYSTEM:
1076 if (ST.hasGFX940Insts()) {
1077 // Set SC bits to indicate system scope.
1078 Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
1079 break;
1080 }
1081 [[fallthrough]];
1082 case SIAtomicScope::AGENT:
1083 if (ST.hasGFX940Insts()) {
1084 // Set SC bits to indicate agent scope.
1085 Changed |= enableCPolBits(MI, CPol::SC1);
1086 } else {
1087 // Set L1 cache policy to MISS_EVICT.
1088 // Note: there is no L2 cache bypass policy at the ISA level.
1089 Changed |= enableCPolBits(MI, CPol::GLC);
1090 }
1091 break;
1092 case SIAtomicScope::WORKGROUP:
1093 if (ST.hasGFX940Insts()) {
1094 // In threadgroup split mode the waves of a work-group can be executing
1095 // on different CUs. Therefore need to bypass the L1 which is per CU.
1096 // Otherwise in non-threadgroup split mode all waves of a work-group are
1097 // on the same CU, and so the L1 does not need to be bypassed. Setting
1098 // SC bits to indicate work-group scope will do this automatically.
1099 Changed |= enableCPolBits(MI, CPol::SC0);
1100 } else if (ST.hasGFX90AInsts()) {
1101 // In threadgroup split mode the waves of a work-group can be executing
1102 // on different CUs. Therefore need to bypass the L1 which is per CU.
1103 // Otherwise in non-threadgroup split mode all waves of a work-group are
1104 // on the same CU, and so the L1 does not need to be bypassed.
1105 if (ST.isTgSplitEnabled())
1106 Changed |= enableCPolBits(MI, CPol::GLC);
1107 }
1108 break;
1109 case SIAtomicScope::WAVEFRONT:
1110 case SIAtomicScope::SINGLETHREAD:
1111 // No cache to bypass.
1112 break;
1113 default:
1114 llvm_unreachable("Unsupported synchronization scope");
1115 }
1116
1117 return Changed;
1118}
1119
1120bool SIGfx6CacheControl::enableStoreCacheBypass(
1122 SIAtomicScope Scope,
1123 SIAtomicAddrSpace AddrSpace) const {
1124 assert(!MI->mayLoad() && MI->mayStore());
1125 bool Changed = false;
1126
1127 /// For targets other than GFX940, the L1 cache is write through so does not
1128 /// need to be bypassed. There is no bypass control for the L2 cache at the
1129 /// isa level.
1130
1131 if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
1132 switch (Scope) {
1133 case SIAtomicScope::SYSTEM:
1134 // Set SC bits to indicate system scope.
1135 Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
1136 break;
1137 case SIAtomicScope::AGENT:
1138 // Set SC bits to indicate agent scope.
1139 Changed |= enableCPolBits(MI, CPol::SC1);
1140 break;
1141 case SIAtomicScope::WORKGROUP:
1142 // Set SC bits to indicate workgroup scope.
1143 Changed |= enableCPolBits(MI, CPol::SC0);
1144 break;
1145 case SIAtomicScope::WAVEFRONT:
1146 case SIAtomicScope::SINGLETHREAD:
1147 // Leave SC bits unset to indicate wavefront scope.
1148 break;
1149 default:
1150 llvm_unreachable("Unsupported synchronization scope");
1151 }
1152
1153 /// The scratch address space does not need the global memory caches
1154 /// to be bypassed as all memory operations by the same thread are
1155 /// sequentially consistent, and no other thread can access scratch
1156 /// memory.
1157
1158 /// Other address spaces do not have a cache.
1159 }
1160
1161 return Changed;
1162}
1163
1164bool SIGfx6CacheControl::enableRMWCacheBypass(
1166 SIAtomicScope Scope,
1167 SIAtomicAddrSpace AddrSpace) const {
1168 assert(MI->mayLoad() && MI->mayStore());
1169 bool Changed = false;
1170
1171 /// For targets other than GFX940, do not set GLC for RMW atomic operations as
1172 /// L0/L1 cache is automatically bypassed, and the GLC bit is instead used to
1173 /// indicate if they are return or no-return. Note: there is no L2 cache
1174 /// coherent bypass control at the ISA level.
1175 /// For GFX90A+, RMW atomics implicitly bypass the L1 cache.
1176
1177 if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
1178 switch (Scope) {
1179 case SIAtomicScope::SYSTEM:
1180 // Set SC1 bit to indicate system scope.
1181 Changed |= enableCPolBits(MI, CPol::SC1);
1182 break;
1183 case SIAtomicScope::AGENT:
1184 case SIAtomicScope::WORKGROUP:
1185 case SIAtomicScope::WAVEFRONT:
1186 case SIAtomicScope::SINGLETHREAD:
1187 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1188 // to indicate system or agent scope. The SC0 bit is used to indicate if
1189 // they are return or no-return. Leave SC1 bit unset to indicate agent
1190 // scope.
1191 break;
1192 default:
1193 llvm_unreachable("Unsupported synchronization scope");
1194 }
1195 }
1196
1197 return Changed;
1198}
1199
1200bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1201 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1202 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1203 // Only handle load and store, not atomic read-modify-write insructions. The
1204 // latter use glc to indicate if the atomic returns a result and so must not
1205 // be used for cache control.
1206 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
1207
1208 // Only update load and store, not LLVM IR atomic read-modify-write
1209 // instructions. The latter are always marked as volatile so cannot sensibly
1210 // handle it as do not want to pessimize all atomics. Also they do not support
1211 // the nontemporal attribute.
1212 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1213
1214 bool Changed = false;
1215
1216 if (IsVolatile) {
1217 if (ST.hasGFX940Insts()) {
1218 // Set SC bits to indicate system scope.
1219 Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
1220 } else if (Op == SIMemOp::LOAD) {
1221 // Set L1 cache policy to be MISS_EVICT for load instructions
1222 // and MISS_LRU for store instructions.
1223 // Note: there is no L2 cache bypass policy at the ISA level.
1224 Changed |= enableCPolBits(MI, CPol::GLC);
1225 }
1226
1227 // Ensure operation has completed at system scope to cause all volatile
1228 // operations to be visible outside the program in a global order. Do not
1229 // request cross address space as only the global address space can be
1230 // observable outside the program, so no need to cause a waitcnt for LDS
1231 // address space operations.
1232 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1233 Position::AFTER, AtomicOrdering::Unordered,
1234 /*AtomicsOnly=*/false);
1235
1236 return Changed;
1237 }
1238
1239 if (IsNonTemporal) {
1240 if (ST.hasGFX940Insts()) {
1241 Changed |= enableCPolBits(MI, CPol::NT);
1242 } else {
1243 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1244 // for both loads and stores, and the L2 cache policy to STREAM.
1245 Changed |= enableCPolBits(MI, CPol::SLC | CPol::GLC);
1246 }
1247 return Changed;
1248 }
1249
1250 return Changed;
1251}
1252
1253bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1254 SIAtomicScope Scope,
1255 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1256 bool IsCrossAddrSpaceOrdering, Position Pos,
1257 AtomicOrdering Order,
1258 bool AtomicsOnly) const {
1259 bool Changed = false;
1260
1261 MachineBasicBlock &MBB = *MI->getParent();
1262 const DebugLoc &DL = MI->getDebugLoc();
1263
1264 if (Pos == Position::AFTER)
1265 ++MI;
1266
1267 // GFX90A+
1268 if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) {
1269 // In threadgroup split mode the waves of a work-group can be executing on
1270 // different CUs. Therefore need to wait for global or GDS memory operations
1271 // to complete to ensure they are visible to waves in the other CUs.
1272 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1273 // the same CU, so no need to wait for global memory as all waves in the
1274 // work-group access the same the L1, nor wait for GDS as access are ordered
1275 // on a CU.
1276 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1277 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1278 (Scope == SIAtomicScope::WORKGROUP)) {
1279 // Same as <GFX90A at AGENT scope;
1280 Scope = SIAtomicScope::AGENT;
1281 }
1282 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1283 // LDS memory operations.
1284 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1285 }
1286
1287 bool VMCnt = false;
1288 bool LGKMCnt = false;
1289
1290 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1291 SIAtomicAddrSpace::NONE) {
1292 switch (Scope) {
1293 case SIAtomicScope::SYSTEM:
1294 case SIAtomicScope::AGENT:
1295 VMCnt |= true;
1296 break;
1297 case SIAtomicScope::WORKGROUP:
1298 case SIAtomicScope::WAVEFRONT:
1299 case SIAtomicScope::SINGLETHREAD:
1300 // The L1 cache keeps all memory operations in order for
1301 // wavefronts in the same work-group.
1302 break;
1303 default:
1304 llvm_unreachable("Unsupported synchronization scope");
1305 }
1306 }
1307
1308 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1309 switch (Scope) {
1310 case SIAtomicScope::SYSTEM:
1311 case SIAtomicScope::AGENT:
1312 case SIAtomicScope::WORKGROUP:
1313 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1314 // not needed as LDS operations for all waves are executed in a total
1315 // global ordering as observed by all waves. Required if also
1316 // synchronizing with global/GDS memory as LDS operations could be
1317 // reordered with respect to later global/GDS memory operations of the
1318 // same wave.
1319 LGKMCnt |= IsCrossAddrSpaceOrdering;
1320 break;
1321 case SIAtomicScope::WAVEFRONT:
1322 case SIAtomicScope::SINGLETHREAD:
1323 // The LDS keeps all memory operations in order for
1324 // the same wavefront.
1325 break;
1326 default:
1327 llvm_unreachable("Unsupported synchronization scope");
1328 }
1329 }
1330
1331 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1332 switch (Scope) {
1333 case SIAtomicScope::SYSTEM:
1334 case SIAtomicScope::AGENT:
1335 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1336 // is not needed as GDS operations for all waves are executed in a total
1337 // global ordering as observed by all waves. Required if also
1338 // synchronizing with global/LDS memory as GDS operations could be
1339 // reordered with respect to later global/LDS memory operations of the
1340 // same wave.
1341 LGKMCnt |= IsCrossAddrSpaceOrdering;
1342 break;
1343 case SIAtomicScope::WORKGROUP:
1344 case SIAtomicScope::WAVEFRONT:
1345 case SIAtomicScope::SINGLETHREAD:
1346 // The GDS keeps all memory operations in order for
1347 // the same work-group.
1348 break;
1349 default:
1350 llvm_unreachable("Unsupported synchronization scope");
1351 }
1352 }
1353
1354 if (VMCnt || LGKMCnt) {
1355 unsigned WaitCntImmediate =
1357 VMCnt ? 0 : getVmcntBitMask(IV),
1359 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1360 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1361 .addImm(WaitCntImmediate);
1362 Changed = true;
1363 }
1364
1365 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1366 // at workgroup-scoped release operations that specify the LDS address space.
1367 // SIInsertWaitcnts will later replace this with a vmcnt().
1368 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1369 Scope == SIAtomicScope::WORKGROUP &&
1370 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1371 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1372 Changed = true;
1373 }
1374
1375 if (Pos == Position::AFTER)
1376 --MI;
1377
1378 return Changed;
1379}
1380
1382 if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
1383 return false;
1384 return !ST.isAmdPalOS() && !ST.isMesa3DOS();
1385}
1386
1387bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1388 SIAtomicScope Scope,
1389 SIAtomicAddrSpace AddrSpace,
1390 Position Pos) const {
1391 if (!InsertCacheInv)
1392 return false;
1393
1394 bool Changed = false;
1395
1396 MachineBasicBlock &MBB = *MI->getParent();
1397 const DebugLoc &DL = MI->getDebugLoc();
1398
1399 if (Pos == Position::AFTER)
1400 ++MI;
1401
1402 const unsigned InvalidateL1 = canUseBUFFER_WBINVL1_VOL(ST)
1403 ? AMDGPU::BUFFER_WBINVL1_VOL
1404 : AMDGPU::BUFFER_WBINVL1;
1405
1406 if (canAffectGlobalAddrSpace(AddrSpace)) {
1407 switch (Scope) {
1408 case SIAtomicScope::SYSTEM:
1409 if (ST.hasGFX940Insts()) {
1410 // Ensures that following loads will not see stale remote VMEM data or
1411 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
1412 // and CC will never be stale due to the local memory probes.
1413 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1414 // Set SC bits to indicate system scope.
1416 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1417 // hardware does not reorder memory operations by the same wave with
1418 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1419 // remove any cache lines of earlier writes by the same wave and ensures
1420 // later reads by the same wave will refetch the cache lines.
1421 Changed = true;
1422 break;
1423 }
1424
1425 if (ST.hasGFX90AInsts()) {
1426 // Ensures that following loads will not see stale remote VMEM data or
1427 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
1428 // and CC will never be stale due to the local memory probes.
1429 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1430 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1431 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1432 // hardware does not reorder memory operations by the same wave with
1433 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed
1434 // to remove any cache lines of earlier writes by the same wave and
1435 // ensures later reads by the same wave will refetch the cache lines.
1436 Changed = true;
1437 break;
1438 }
1439 [[fallthrough]];
1440 case SIAtomicScope::AGENT:
1441 if (ST.hasGFX940Insts()) {
1442 // Ensures that following loads will not see stale remote date or local
1443 // MTYPE NC global data. Local MTYPE RW and CC memory will never be
1444 // stale due to the memory probes.
1445 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1446 // Set SC bits to indicate agent scope.
1448 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1449 // does not reorder memory operations with respect to preceeding buffer
1450 // invalidate. The invalidate is guaranteed to remove any cache lines of
1451 // earlier writes and ensures later writes will refetch the cache lines.
1452 } else
1453 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1454 Changed = true;
1455 break;
1456 case SIAtomicScope::WORKGROUP:
1457 if (ST.isTgSplitEnabled()) {
1458 if (ST.hasGFX940Insts()) {
1459 // In threadgroup split mode the waves of a work-group can be
1460 // executing on different CUs. Therefore need to invalidate the L1
1461 // which is per CU. Otherwise in non-threadgroup split mode all waves
1462 // of a work-group are on the same CU, and so the L1 does not need to
1463 // be invalidated.
1464
1465 // Ensures L1 is invalidated if in threadgroup split mode. In
1466 // non-threadgroup split mode it is a NOP, but no point generating it
1467 // in that case if know not in that mode.
1468 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1469 // Set SC bits to indicate work-group scope.
1471 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1472 // does not reorder memory operations with respect to preceeding
1473 // buffer invalidate. The invalidate is guaranteed to remove any cache
1474 // lines of earlier writes and ensures later writes will refetch the
1475 // cache lines.
1476 Changed = true;
1477 } else if (ST.hasGFX90AInsts()) {
1478 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1479 Changed = true;
1480 }
1481 }
1482 break;
1483 case SIAtomicScope::WAVEFRONT:
1484 case SIAtomicScope::SINGLETHREAD:
1485 // For GFX940, we could generate "BUFFER_INV" but it would do nothing as
1486 // there are no caches to invalidate. All other targets have no cache to
1487 // invalidate.
1488 break;
1489 default:
1490 llvm_unreachable("Unsupported synchronization scope");
1491 }
1492 }
1493
1494 /// The scratch address space does not need the global memory cache
1495 /// to be flushed as all memory operations by the same thread are
1496 /// sequentially consistent, and no other thread can access scratch
1497 /// memory.
1498
1499 /// Other address spaces do not have a cache.
1500
1501 if (Pos == Position::AFTER)
1502 --MI;
1503
1504 return Changed;
1505}
1506
1507bool SIGfx6CacheControl::insertWriteback(MachineBasicBlock::iterator &MI,
1508 SIAtomicScope Scope,
1509 SIAtomicAddrSpace AddrSpace,
1510 Position Pos) const {
1511 if (!ST.hasGFX90AInsts())
1512 return false;
1513
1514 bool Changed = false;
1515 MachineBasicBlock &MBB = *MI->getParent();
1516 const DebugLoc &DL = MI->getDebugLoc();
1517
1518 if (Pos == Position::AFTER)
1519 ++MI;
1520
1521 if (canAffectGlobalAddrSpace(AddrSpace)) {
1522 switch (Scope) {
1523 case SIAtomicScope::SYSTEM:
1524 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1525 // hardware does not reorder memory operations by the same wave with
1526 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1527 // to initiate writeback of any dirty cache lines of earlier writes by
1528 // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1529 // writeback has completed.
1530 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1531 // Set SC bits to indicate system scope.
1533 Changed = true;
1534 break;
1535 case SIAtomicScope::AGENT:
1536 if (ST.hasGFX940Insts()) {
1537 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1538 // Set SC bits to indicate agent scope.
1540 Changed = true;
1541 }
1542 break;
1543 case SIAtomicScope::WORKGROUP:
1544 case SIAtomicScope::WAVEFRONT:
1545 case SIAtomicScope::SINGLETHREAD:
1546 // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it
1547 // would writeback, and would require an otherwise unnecessary
1548 // "S_WAITCNT vmcnt(0)".
1549 break;
1550 default:
1551 llvm_unreachable("Unsupported synchronization scope");
1552 }
1553 }
1554
1555 if (Pos == Position::AFTER)
1556 --MI;
1557
1558 return Changed;
1559}
1560
1561bool SIGfx10CacheControl::enableLoadCacheBypass(
1562 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1563 SIAtomicAddrSpace AddrSpace) const {
1564 assert(MI->mayLoad() && !MI->mayStore());
1565 bool Changed = false;
1566
1567 if (canAffectGlobalAddrSpace(AddrSpace)) {
1568 switch (Scope) {
1569 case SIAtomicScope::SYSTEM:
1570 case SIAtomicScope::AGENT:
1571 // Set the L0 and L1 cache policies to MISS_EVICT.
1572 // Note: there is no L2 cache coherent bypass control at the ISA level.
1573 // For GFX10, set GLC+DLC, for GFX11, only set GLC.
1574 Changed |=
1575 enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0));
1576 break;
1577 case SIAtomicScope::WORKGROUP:
1578 // In WGP mode the waves of a work-group can be executing on either CU of
1579 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1580 // CU mode all waves of a work-group are on the same CU, and so the L0
1581 // does not need to be bypassed.
1582 if (!ST.isCuModeEnabled())
1583 Changed |= enableCPolBits(MI, CPol::GLC);
1584 break;
1585 case SIAtomicScope::WAVEFRONT:
1586 case SIAtomicScope::SINGLETHREAD:
1587 // No cache to bypass.
1588 break;
1589 default:
1590 llvm_unreachable("Unsupported synchronization scope");
1591 }
1592 }
1593
1594 /// The scratch address space does not need the global memory caches
1595 /// to be bypassed as all memory operations by the same thread are
1596 /// sequentially consistent, and no other thread can access scratch
1597 /// memory.
1598
1599 /// Other address spaces do not have a cache.
1600
1601 return Changed;
1602}
1603
1604bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1605 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1606 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1607
1608 // Only handle load and store, not atomic read-modify-write insructions. The
1609 // latter use glc to indicate if the atomic returns a result and so must not
1610 // be used for cache control.
1611 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
1612
1613 // Only update load and store, not LLVM IR atomic read-modify-write
1614 // instructions. The latter are always marked as volatile so cannot sensibly
1615 // handle it as do not want to pessimize all atomics. Also they do not support
1616 // the nontemporal attribute.
1617 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1618
1619 bool Changed = false;
1620
1621 if (IsVolatile) {
1622 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1623 // and MISS_LRU for store instructions.
1624 // Note: there is no L2 cache coherent bypass control at the ISA level.
1625 if (Op == SIMemOp::LOAD) {
1626 Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
1627 }
1628
1629 // GFX11: Set MALL NOALLOC for both load and store instructions.
1630 if (AMDGPU::isGFX11(ST))
1631 Changed |= enableCPolBits(MI, CPol::DLC);
1632
1633 // Ensure operation has completed at system scope to cause all volatile
1634 // operations to be visible outside the program in a global order. Do not
1635 // request cross address space as only the global address space can be
1636 // observable outside the program, so no need to cause a waitcnt for LDS
1637 // address space operations.
1638 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1639 Position::AFTER, AtomicOrdering::Unordered,
1640 /*AtomicsOnly=*/false);
1641 return Changed;
1642 }
1643
1644 if (IsNonTemporal) {
1645 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1646 // and L2 cache policy to STREAM.
1647 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1648 // to MISS_EVICT and the L2 cache policy to STREAM.
1649 if (Op == SIMemOp::STORE)
1650 Changed |= enableCPolBits(MI, CPol::GLC);
1651 Changed |= enableCPolBits(MI, CPol::SLC);
1652
1653 // GFX11: Set MALL NOALLOC for both load and store instructions.
1654 if (AMDGPU::isGFX11(ST))
1655 Changed |= enableCPolBits(MI, CPol::DLC);
1656
1657 return Changed;
1658 }
1659
1660 return Changed;
1661}
1662
// Inserts the wait-count instructions needed for the given scope, address
// spaces and memory operation kinds, either before MI or after it (per Pos).
// Returns true if at least one instruction was inserted.
//
// The AtomicsOnly parameter is not referenced by this implementation.
1663bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1664 SIAtomicScope Scope,
1665 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1666 bool IsCrossAddrSpaceOrdering,
1667 Position Pos, AtomicOrdering Order,
1668 bool AtomicsOnly) const {
1669 bool Changed = false;
1670
1671 MachineBasicBlock &MBB = *MI->getParent();
1672 const DebugLoc &DL = MI->getDebugLoc();
1673
// For Position::AFTER the iterator is advanced so that the BuildMI calls
// below, which insert before MI, place the new instructions after the
// original one. It is stepped back again at the end of the function.
1674 if (Pos == Position::AFTER)
1675 ++MI;
1676
// Which counters have to be waited on; decided by the switches below.
1677 bool VMCnt = false;
1678 bool VSCnt = false;
1679 bool LGKMCnt = false;
1680
1681 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1682 SIAtomicAddrSpace::NONE) {
1683 switch (Scope) {
1684 case SIAtomicScope::SYSTEM:
1685 case SIAtomicScope::AGENT:
1686 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1687 VMCnt |= true;
1688 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1689 VSCnt |= true;
1690 break;
1691 case SIAtomicScope::WORKGROUP:
1692 // In WGP mode the waves of a work-group can be executing on either CU of
1693 // the WGP. Therefore need to wait for operations to complete to ensure
1694 // they are visible to waves in the other CU as the L0 is per CU.
1695 // Otherwise in CU mode and all waves of a work-group are on the same CU
1696 // which shares the same L0. Note that we still need to wait when
1697 // performing a release in this mode to respect the transitivity of
1698 // happens-before, e.g. other waves of the workgroup must be able to
1699 // release the memory from another wave at a wider scope.
1700 if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) {
1701 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1702 VMCnt |= true;
1703 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1704 VSCnt |= true;
1705 }
1706 break;
1707 case SIAtomicScope::WAVEFRONT:
1708 case SIAtomicScope::SINGLETHREAD:
1709 // The L0 cache keeps all memory operations in order for
1710 // work-items in the same wavefront.
1711 break;
1712 default:
1713 llvm_unreachable("Unsupported synchronization scope");
1714 }
1715 }
1716
1717 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1718 switch (Scope) {
1719 case SIAtomicScope::SYSTEM:
1720 case SIAtomicScope::AGENT:
1721 case SIAtomicScope::WORKGROUP:
1722 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1723 // not needed as LDS operations for all waves are executed in a total
1724 // global ordering as observed by all waves. Required if also
1725 // synchronizing with global/GDS memory as LDS operations could be
1726 // reordered with respect to later global/GDS memory operations of the
1727 // same wave.
1728 LGKMCnt |= IsCrossAddrSpaceOrdering;
1729 break;
1730 case SIAtomicScope::WAVEFRONT:
1731 case SIAtomicScope::SINGLETHREAD:
1732 // The LDS keeps all memory operations in order for
1733 // the same wavefront.
1734 break;
1735 default:
1736 llvm_unreachable("Unsupported synchronization scope");
1737 }
1738 }
1739
1740 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1741 switch (Scope) {
1742 case SIAtomicScope::SYSTEM:
1743 case SIAtomicScope::AGENT:
1744 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1745 // is not needed as GDS operations for all waves are executed in a total
1746 // global ordering as observed by all waves. Required if also
1747 // synchronizing with global/LDS memory as GDS operations could be
1748 // reordered with respect to later global/LDS memory operations of the
1749 // same wave.
1750 LGKMCnt |= IsCrossAddrSpaceOrdering;
1751 break;
1752 case SIAtomicScope::WORKGROUP:
1753 case SIAtomicScope::WAVEFRONT:
1754 case SIAtomicScope::SINGLETHREAD:
1755 // The GDS keeps all memory operations in order for
1756 // the same work-group.
1757 break;
1758 default:
1759 llvm_unreachable("Unsupported synchronization scope");
1760 }
1761 }
1762
// VMCnt and LGKMCnt are folded into a single S_WAITCNT_soft immediate; a
// counter that must be waited on contributes 0, otherwise its bit mask.
1763 if (VMCnt || LGKMCnt) {
1764 unsigned WaitCntImmediate =
// NOTE(review): the embedded listing numbers skip 1765 and 1767, so parts of
// this initializer expression appear to be missing from this listing —
// confirm against the original file.
1766 VMCnt ? 0 : getVmcntBitMask(IV),
1768 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1769 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1770 .addImm(WaitCntImmediate);
1771 Changed = true;
1772 }
1773
1774 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1775 // at workgroup-scoped release operations that specify the LDS address space.
1776 // SIInsertWaitcnts will later replace this with a vmcnt().
1777 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1778 Scope == SIAtomicScope::WORKGROUP &&
1779 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1780 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1781 Changed = true;
1782 }
1783
// The store counter is waited on with a separate instruction.
1784 if (VSCnt) {
1785 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
1786 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1787 .addImm(0);
1788 Changed = true;
1789 }
1790
// Undo the advance performed at the top for Position::AFTER.
1791 if (Pos == Position::AFTER)
1792 --MI;
1793
1794 return Changed;
1795}
1796
1797bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1798 SIAtomicScope Scope,
1799 SIAtomicAddrSpace AddrSpace,
1800 Position Pos) const {
1801 if (!InsertCacheInv)
1802 return false;
1803
1804 bool Changed = false;
1805
1806 MachineBasicBlock &MBB = *MI->getParent();
1807 const DebugLoc &DL = MI->getDebugLoc();
1808
1809 if (Pos == Position::AFTER)
1810 ++MI;
1811
1812 if (canAffectGlobalAddrSpace(AddrSpace)) {
1813 switch (Scope) {
1814 case SIAtomicScope::SYSTEM:
1815 case SIAtomicScope::AGENT:
1816 // The order of invalidates matter here. We must invalidate "outer in"
1817 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
1818 // invalidated.
1819 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1820 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1821 Changed = true;
1822 break;
1823 case SIAtomicScope::WORKGROUP:
1824 // In WGP mode the waves of a work-group can be executing on either CU of
1825 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1826 // in CU mode and all waves of a work-group are on the same CU, and so the
1827 // L0 does not need to be invalidated.
1828 if (!ST.isCuModeEnabled()) {
1829 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1830 Changed = true;
1831 }
1832 break;
1833 case SIAtomicScope::WAVEFRONT:
1834 case SIAtomicScope::SINGLETHREAD:
1835 // No cache to invalidate.
1836 break;
1837 default:
1838 llvm_unreachable("Unsupported synchronization scope");
1839 }
1840 }
1841
1842 /// The scratch address space does not need the global memory cache
1843 /// to be flushed as all memory operations by the same thread are
1844 /// sequentially consistent, and no other thread can access scratch
1845 /// memory.
1846
1847 /// Other address spaces do not have a cache.
1848
1849 if (Pos == Position::AFTER)
1850 --MI;
1851
1852 return Changed;
1853}
1854
1855bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
1856 AMDGPU::CPol::CPol Value) const {
1857 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
1858 if (!CPol)
1859 return false;
1860
1861 uint64_t NewTH = Value & AMDGPU::CPol::TH;
1862 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
1863 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
1864 return true;
1865 }
1866
1867 return false;
1868}
1869
1870bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
1871 AMDGPU::CPol::CPol Value) const {
1872 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
1873 if (!CPol)
1874 return false;
1875
1876 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
1877 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
1878 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
1879 return true;
1880 }
1881
1882 return false;
1883}
1884
1885bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
1886 const MachineBasicBlock::iterator MI) const {
1887 // TODO: implement flag for frontend to give us a hint not to insert waits.
1888
1889 MachineBasicBlock &MBB = *MI->getParent();
1890 const DebugLoc &DL = MI->getDebugLoc();
1891
1892 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
1893 if (ST.hasImageInsts()) {
1894 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
1895 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
1896 }
1897 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
1898 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
1899
1900 return true;
1901}
1902
// Inserts the soft wait instructions (load/sample/BVH, store and DS counters)
// needed for the given scope, address spaces and memory operation kinds,
// either before MI or after it (per Pos). Returns true if at least one
// instruction was inserted.
//
// When AtomicsOnly is true, the BVH and sample counter waits that normally
// accompany the load counter wait are omitted.
1903bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1904 SIAtomicScope Scope,
1905 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1906 bool IsCrossAddrSpaceOrdering,
1907 Position Pos, AtomicOrdering Order,
1908 bool AtomicsOnly) const {
1909 bool Changed = false;
1910
1911 MachineBasicBlock &MBB = *MI->getParent();
1912 const DebugLoc &DL = MI->getDebugLoc();
1913
// Which counters have to be waited on; decided by the switches below.
1914 bool LOADCnt = false;
1915 bool DSCnt = false;
1916 bool STORECnt = false;
1917
// For Position::AFTER the iterator is advanced so that the BuildMI calls
// below, which insert before MI, place the new instructions after the
// original one. It is stepped back again at the end of the function.
1918 if (Pos == Position::AFTER)
1919 ++MI;
1920
1921 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1922 SIAtomicAddrSpace::NONE) {
1923 switch (Scope) {
1924 case SIAtomicScope::SYSTEM:
1925 case SIAtomicScope::AGENT:
1926 case SIAtomicScope::CLUSTER:
1927 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1928 LOADCnt |= true;
1929 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1930 STORECnt |= true;
1931 break;
1932 case SIAtomicScope::WORKGROUP:
1933 // GFX12.0:
1934 // In WGP mode the waves of a work-group can be executing on either CU
1935 // of the WGP. Therefore need to wait for operations to complete to
1936 // ensure they are visible to waves in the other CU as the L0 is per CU.
1937 //
1938 // Otherwise in CU mode and all waves of a work-group are on the same CU
1939 // which shares the same L0. Note that we still need to wait when
1940 // performing a release in this mode to respect the transitivity of
1941 // happens-before, e.g. other waves of the workgroup must be able to
1942 // release the memory from another wave at a wider scope.
1943 //
1944 // GFX12.5:
1945 // CU$ has two ports. To ensure operations are visible at the workgroup
1946 // level, we need to ensure all operations in this port have completed
1947 // so the other SIMDs in the WG can see them. There is no ordering
1948 // guarantee between the ports.
1949 if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() ||
1950 isReleaseOrStronger(Order)) {
1951 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1952 LOADCnt |= true;
1953 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1954 STORECnt |= true;
1955 }
1956 break;
1957 case SIAtomicScope::WAVEFRONT:
1958 case SIAtomicScope::SINGLETHREAD:
1959 // The L0 cache keeps all memory operations in order for
1960 // work-items in the same wavefront.
1961 break;
1962 default:
1963 llvm_unreachable("Unsupported synchronization scope");
1964 }
1965 }
1966
1967 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1968 switch (Scope) {
1969 case SIAtomicScope::SYSTEM:
1970 case SIAtomicScope::AGENT:
1971 case SIAtomicScope::CLUSTER:
1972 case SIAtomicScope::WORKGROUP:
1973 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1974 // not needed as LDS operations for all waves are executed in a total
1975 // global ordering as observed by all waves. Required if also
1976 // synchronizing with global/GDS memory as LDS operations could be
1977 // reordered with respect to later global/GDS memory operations of the
1978 // same wave.
1979 DSCnt |= IsCrossAddrSpaceOrdering;
1980 break;
1981 case SIAtomicScope::WAVEFRONT:
1982 case SIAtomicScope::SINGLETHREAD:
1983 // The LDS keeps all memory operations in order for
1984 // the same wavefront.
1985 break;
1986 default:
1987 llvm_unreachable("Unsupported synchronization scope");
1988 }
1989 }
1990
1991 if (LOADCnt) {
1992 // Acquire sequences only need to wait on the previous atomic operation.
1993 // e.g. a typical sequence looks like
1994 // atomic load
1995 // (wait)
1996 // global_inv
1997 //
1998 // We do not have BVH or SAMPLE atomics, so the atomic load is always going
1999 // to be tracked using loadcnt.
2000 //
2001 // This also applies to fences. Fences cannot pair with an instruction
2002 // tracked with bvh/samplecnt as we don't have any atomics that do that.
2003 if (!AtomicsOnly && ST.hasImageInsts()) {
2004 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2005 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2006 }
2007 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2008 Changed = true;
2009 }
2010
2011 if (STORECnt) {
2012 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2013 Changed = true;
2014 }
2015
2016 if (DSCnt) {
2017 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2018 Changed = true;
2019 }
2020
// Undo the advance performed above for Position::AFTER.
2021 if (Pos == Position::AFTER)
2022 --MI;
2023
2024 return Changed;
2025}
2026
// Inserts a GLOBAL_INV with a scope immediate derived from Scope, before or
// after MI (per Pos). Returns false without inserting anything when cache
// invalidation is disabled, when AddrSpace cannot affect the global address
// space, or when the scope needs no invalidate; returns true otherwise.
2027bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2028 SIAtomicScope Scope,
2029 SIAtomicAddrSpace AddrSpace,
2030 Position Pos) const {
2031 if (!InsertCacheInv)
2032 return false;
2033
2034 MachineBasicBlock &MBB = *MI->getParent();
2035 const DebugLoc &DL = MI->getDebugLoc();
2036
2037 /// The scratch address space does not need the global memory cache
2038 /// to be flushed as all memory operations by the same thread are
2039 /// sequentially consistent, and no other thread can access scratch
2040 /// memory.
2041
2042 /// Other address spaces do not have a cache.
2043 if (!canAffectGlobalAddrSpace(AddrSpace))
2044 return false;
2045
// NOTE(review): ScopeImm is assigned below but its declaration is not visible
// here; the embedded listing numbers skip 2046, so that line appears to be
// missing from this listing — confirm against the original file.
2047 switch (Scope) {
2048 case SIAtomicScope::SYSTEM:
2049 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2050 break;
2051 case SIAtomicScope::AGENT:
2052 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2053 break;
2054 case SIAtomicScope::CLUSTER:
2055 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2056 break;
2057 case SIAtomicScope::WORKGROUP:
2058 // GFX12.0:
2059 // In WGP mode the waves of a work-group can be executing on either CU of
2060 // the WGP. Therefore we need to invalidate the L0 which is per CU.
2061 // Otherwise in CU mode all waves of a work-group are on the same CU, and
2062 // so the L0 does not need to be invalidated.
2063 //
2064 // GFX12.5 has a shared WGP$, so no invalidates are required.
2065 if (ST.isCuModeEnabled())
2066 return false;
2067
2068 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2069 break;
2070 case SIAtomicScope::WAVEFRONT:
2071 case SIAtomicScope::SINGLETHREAD:
2072 // No cache to invalidate.
2073 return false;
2074 default:
2075 llvm_unreachable("Unsupported synchronization scope");
2076 }
2077
// BuildMI inserts before the iterator, so step past MI first when the
// invalidate has to follow it, then step back afterwards.
2078 if (Pos == Position::AFTER)
2079 ++MI;
2080
2081 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2082
2083 if (Pos == Position::AFTER)
2084 --MI;
2085
2086 // Target requires a waitcnt to ensure that the preceding INV has completed
2087 // as it may get reordered with following load instructions.
2088 if (ST.hasINVWBL2WaitCntRequirement() && Scope > SIAtomicScope::CLUSTER) {
2089 insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD,
2090 /*IsCrossAddrSpaceOrdering=*/false, Pos, AtomicOrdering::Acquire,
2091 /*AtomicsOnly=*/false);
2092
// The iterator is stepped back once more after the wait has been inserted.
2093 if (Pos == Position::AFTER)
2094 --MI;
2095 }
2096
2097 return true;
2098}
2099
2100bool SIGfx12CacheControl::insertWriteback(MachineBasicBlock::iterator &MI,
2101 SIAtomicScope Scope,
2102 SIAtomicAddrSpace AddrSpace,
2103 Position Pos) const {
2104 // The scratch address space does not need the global memory cache
2105 // writeback as all memory operations by the same thread are
2106 // sequentially consistent, and no other thread can access scratch
2107 // memory.
2108 if (!canAffectGlobalAddrSpace(AddrSpace))
2109 return false;
2110
2111 bool Changed = false;
2112 MachineBasicBlock &MBB = *MI->getParent();
2113 const DebugLoc &DL = MI->getDebugLoc();
2114
2115 if (Pos == Position::AFTER)
2116 ++MI;
2117
2118 // global_wb is only necessary at system scope for GFX12.0,
2119 // they're also necessary at device scope for GFX12.5 as stores
2120 // cannot report completion earlier than L2.
2121 //
2122 // Emitting it for lower scopes is a slow no-op, so we omit it
2123 // for performance.
2124 std::optional<AMDGPU::CPol::CPol> NeedsWB;
2125 switch (Scope) {
2126 case SIAtomicScope::SYSTEM:
2127 NeedsWB = AMDGPU::CPol::SCOPE_SYS;
2128 break;
2129 case SIAtomicScope::AGENT:
2130 // GFX12.5 may have >1 L2 per device so we must emit a device scope WB.
2131 if (ST.hasGFX1250Insts())
2132 NeedsWB = AMDGPU::CPol::SCOPE_DEV;
2133 break;
2134 case SIAtomicScope::CLUSTER:
2135 case SIAtomicScope::WORKGROUP:
2136 case SIAtomicScope::WAVEFRONT:
2137 case SIAtomicScope::SINGLETHREAD:
2138 break;
2139 case SIAtomicScope::NONE:
2140 llvm_unreachable("Unsupported synchronization scope");
2141 break;
2142 }
2143
2144 if (NeedsWB) {
2145 // Target requires a waitcnt to ensure that the proceeding store
2146 // proceeding store/rmw operations have completed in L2 so their data will
2147 // be written back by the WB instruction.
2148 if (ST.hasINVWBL2WaitCntRequirement()) {
2149 insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2150 /*IsCrossAddrSpaceOrdering=*/false, Pos,
2151 AtomicOrdering::Release,
2152 /*AtomicsOnly=*/false);
2153 }
2154
2155 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(*NeedsWB);
2156 Changed = true;
2157 }
2158
2159 if (Pos == Position::AFTER)
2160 --MI;
2161
2162 return Changed;
2163}
2164
2165bool SIGfx12CacheControl::handleNonVolatile(MachineInstr &MI) const {
2166 // On GFX12.5, set the NV CPol bit.
2167 if (!ST.hasGFX1250Insts())
2168 return false;
2169 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2170 if (!CPol)
2171 return false;
2172 CPol->setImm(CPol->getImm() | AMDGPU::CPol::NV);
2173 return true;
2174}
2175
// Applies the cache-policy changes for last-use, nontemporal and volatile
// loads/stores to MI: a temporal hint (TH_LU takes priority over TH_NT), and
// for volatile accesses system scope plus a wait inserted after MI.
// Returns true if MI was modified or an instruction was inserted.
2176bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2177 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2178 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2179
2180 // Only handle load and store, not atomic read-modify-write instructions.
2181 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
2182
2183 // Only update load and store, not LLVM IR atomic read-modify-write
2184 // instructions. The latter are always marked as volatile so cannot sensibly
2185 // handle it as do not want to pessimize all atomics. Also they do not support
2186 // the nontemporal attribute.
2187 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2188
2189 bool Changed = false;
2190
2191 if (IsLastUse) {
2192 // Set last-use hint.
2193 Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2194 } else if (IsNonTemporal) {
2195 // Set non-temporal hint for all cache levels.
2196 Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2197 }
2198
2199 if (IsVolatile) {
2200 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2201
// NOTE(review): the embedded listing numbers skip 2203, so the second half of
// this condition appears to be missing from this listing — confirm against
// the original file. When the condition holds, an S_WAIT_XCNT_soft 0 is
// inserted before MI.
2202 if (ST.requiresWaitXCntForSingleAccessInstructions() &&
2204 MachineBasicBlock &MBB = *MI->getParent();
2205 BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2206 Changed = true;
2207 }
2208
2209 // Ensure operation has completed at system scope to cause all volatile
2210 // operations to be visible outside the program in a global order. Do not
2211 // request cross address space as only the global address space can be
2212 // observable outside the program, so no need to cause a waitcnt for LDS
2213 // address space operations.
2214 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2215 Position::AFTER, AtomicOrdering::Unordered,
2216 /*AtomicsOnly=*/false);
2217 }
2218
2219 return Changed;
2220}
2221
// Final fix-ups for a store or read-modify-write instruction MI. Atomic tells
// whether the caller is handling MI as an atomic operation. Returns true if
// any instruction was inserted.
2222bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2223 assert(MI.mayStore() && "Not a Store inst");
// An instruction that both loads and stores is treated as a read-modify-write.
2224 const bool IsRMW = (MI.mayLoad() && MI.mayStore());
2225 bool Changed = false;
2226
// NOTE(review): the embedded listing numbers skip 2228, so the rest of this
// condition appears to be missing from this listing — confirm against the
// original file. When the condition holds, an S_WAIT_XCNT_soft 0 is inserted
// before MI.
2227 if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() &&
2229 MachineBasicBlock &MBB = *MI.getParent();
2230 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2231 Changed = true;
2232 }
2233
2234 // Remaining fixes do not apply to RMWs.
2235 if (IsRMW)
2236 return Changed;
2237
2238 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2239 if (!CPol) // Some vmem operations do not have a scope and are not concerned.
2240 return Changed;
2241 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2242
2243 // GFX12.0 only: Extra waits needed before system scope stores.
2244 if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic &&
2245 Scope == CPol::SCOPE_SYS)
2246 Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
2247
2248 return Changed;
2249}
2250
// Final fix-up for load instructions: only load_monitor opcodes are touched;
// every other instruction returns false immediately.
2251bool SIGfx12CacheControl::finalizeLoad(MachineBasicBlock::iterator &MI) const {
2252 if (!SIInstrInfo::isLoadMonitor(MI->getOpcode()))
2253 return false;
2254
2255 // load_monitor instructions need at least SCOPE_SE to ensure L2 is hit.
2256 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
2257 assert(CPol && "load_monitor must have a cpol operand");
// NOTE(review): the embedded listing numbers skip 2258, so the condition that
// guards the following return appears to be missing from this listing —
// confirm against the original file.
2259 return setScope(MI, AMDGPU::CPol::SCOPE_SE);
2260 return false;
2261}
2262
2263bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
2264 if (!ST.hasGFX1250Insts())
2265 return false;
2266
2267 // Cooperative atomics need to be SCOPE_DEV or higher.
2268 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2269 assert(CPol && "No CPol operand?");
2270 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2271 if (Scope < CPol::SCOPE_DEV)
2272 return setScope(MI, CPol::SCOPE_DEV);
2273 return false;
2274}
2275
2276bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2277 SIAtomicScope Scope,
2278 SIAtomicAddrSpace AddrSpace) const {
2279 bool Changed = false;
2280
2281 if (canAffectGlobalAddrSpace(AddrSpace)) {
2282 switch (Scope) {
2283 case SIAtomicScope::SYSTEM:
2284 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2285 break;
2286 case SIAtomicScope::AGENT:
2287 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2288 break;
2289 case SIAtomicScope::CLUSTER:
2290 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2291 break;
2292 case SIAtomicScope::WORKGROUP:
2293 // In workgroup mode, SCOPE_SE is needed as waves can executes on
2294 // different CUs that access different L0s.
2295 if (!ST.isCuModeEnabled())
2296 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2297 break;
2298 case SIAtomicScope::WAVEFRONT:
2299 case SIAtomicScope::SINGLETHREAD:
2300 // No cache to bypass.
2301 break;
2302 default:
2303 llvm_unreachable("Unsupported synchronization scope");
2304 }
2305 }
2306
2307 // The scratch address space does not need the global memory caches
2308 // to be bypassed as all memory operations by the same thread are
2309 // sequentially consistent, and no other thread can access scratch
2310 // memory.
2311
2312 // Other address spaces do not have a cache.
2313
2314 return Changed;
2315}
2316
2317bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2318 if (AtomicPseudoMIs.empty())
2319 return false;
2320
2321 for (auto &MI : AtomicPseudoMIs)
2322 MI->eraseFromParent();
2323
2324 AtomicPseudoMIs.clear();
2325 return true;
2326}
2327
// Applies the memory-model requirements described by MOI to the load MI.
// Atomic loads get cache bypass, waits and acquire handling depending on their
// ordering; non-atomic loads get volatile/nontemporal/last-use handling.
// Returns true if anything was changed.
//
// NOTE(review): the embedded listing numbers skip 2329, so the remainder of
// the parameter list (which declares MI) appears to be missing from this
// listing — confirm against the original file.
2328bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2330 assert(MI->mayLoad() && !MI->mayStore());
2331
2332 LLVM_DEBUG(dbgs() << "Expanding load: " << *MI);
2333
2334 bool Changed = false;
2335
2336 if (MOI.isAtomic()) {
2337 LLVM_DEBUG(dbgs() << " Atomic: ordering=" << toIRString(MOI.getOrdering())
2338 << ", scope=" << toString(MOI.getScope())
2339 << ", ordering-AS=" << MOI.getOrderingAddrSpace()
2340 << ", instr-AS=" << MOI.getInstrAddrSpace() << "\n");
2341 const AtomicOrdering Order = MOI.getOrdering();
2342 if (Order == AtomicOrdering::Monotonic ||
2343 Order == AtomicOrdering::Acquire ||
2344 Order == AtomicOrdering::SequentiallyConsistent) {
2345 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2346 MOI.getOrderingAddrSpace());
2347 }
2348
2349 // Handle cooperative atomics after cache bypass step, as it may override
2350 // the scope of the instruction to a greater scope.
2351 if (MOI.isCooperative())
2352 Changed |= CC->handleCooperativeAtomic(*MI);
2353
// A sequentially consistent load additionally gets a wait on loads and
// stores inserted before it.
2354 if (Order == AtomicOrdering::SequentiallyConsistent)
2355 Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2356 SIMemOp::LOAD | SIMemOp::STORE,
2357 MOI.getIsCrossAddressSpaceOrdering(),
2358 Position::BEFORE, Order, /*AtomicsOnly=*/false);
2359
2360 if (Order == AtomicOrdering::Acquire ||
2361 Order == AtomicOrdering::SequentiallyConsistent) {
2362 // The wait below only needs to wait on the prior atomic.
2363 Changed |=
2364 CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2365 SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(),
2366 Position::AFTER, Order, /*AtomicsOnly=*/true);
// The acquire (cache invalidate) is skipped when MOI.isAVNone() is true.
2367 if (!MOI.isAVNone()) {
2368 Changed |= CC->insertAcquire(
2369 MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER);
2370 }
2371 }
2372
2373 Changed |= CC->finalizeLoad(MI);
2374 return Changed;
2375 }
2376
2377 // Atomic instructions already bypass caches to the scope specified by the
2378 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2379 // instructions need additional treatment.
2380 Changed |= CC->enableVolatileAndOrNonTemporal(
2381 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2382 MOI.isNonTemporal(), MOI.isLastUse());
2383
2384 Changed |= CC->finalizeLoad(MI);
2385 return Changed;
2386}
2387
// Applies the memory-model requirements described by MOI to the store MI.
// Atomic stores get cache bypass and release handling depending on their
// ordering; non-atomic stores get volatile/nontemporal handling. Both paths
// end with CC->finalizeStore. Returns true if anything was changed.
//
// NOTE(review): the embedded listing numbers skip 2389, so the remainder of
// the parameter list (which declares MI) appears to be missing from this
// listing — confirm against the original file.
2388bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2390 assert(!MI->mayLoad() && MI->mayStore());
2391
2392 LLVM_DEBUG(dbgs() << "Expanding store: " << *MI);
2393
2394 bool Changed = false;
2395 // FIXME: Necessary hack because iterator can lose track of the store.
2396 MachineInstr &StoreMI = *MI;
2397
2398 if (MOI.isAtomic()) {
2399 LLVM_DEBUG(dbgs() << " Atomic: ordering=" << toIRString(MOI.getOrdering())
2400 << ", scope=" << toString(MOI.getScope())
2401 << ", ordering-AS=" << MOI.getOrderingAddrSpace()
2402 << ", instr-AS=" << MOI.getInstrAddrSpace() << "\n");
2403 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2404 MOI.getOrdering() == AtomicOrdering::Release ||
2405 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2406 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2407 MOI.getOrderingAddrSpace());
2408 }
2409
2410 // Handle cooperative atomics after cache bypass step, as it may override
2411 // the scope of the instruction to a greater scope.
2412 if (MOI.isCooperative())
2413 Changed |= CC->handleCooperativeAtomic(*MI);
2414
// Release and sequentially consistent stores get a release inserted before
// the store.
2415 if (MOI.getOrdering() == AtomicOrdering::Release ||
2416 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2417 Changed |=
2418 CC->insertRelease(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2419 MOI.getIsCrossAddressSpaceOrdering(),
2420 Position::BEFORE, MOI.isAVNone());
2421 }
2422
// finalizeStore is given the reference captured at the top rather than the
// iterator.
2423 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
2424 return Changed;
2425 }
2426
2427 // Atomic instructions already bypass caches to the scope specified by the
2428 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2429 // need additional treatment.
2430 Changed |= CC->enableVolatileAndOrNonTemporal(
2431 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2432 MOI.isNonTemporal());
2433
2434 // GFX12 specific, scope(desired coherence domain in cache hierarchy) is
2435 // instruction field, do not confuse it with atomic scope.
2436 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
2437 return Changed;
2438}
2439
// Expands an ATOMIC_FENCE pseudo instruction: records MI in AtomicPseudoMIs
// (which removeAtomicPseudoMIs later erases) and inserts the waits, releases
// and acquires required by the fence ordering before MI. Returns true if any
// instruction was inserted.
//
// NOTE(review): the embedded listing numbers skip 2441, so the remainder of
// the parameter list (which declares MI) appears to be missing from this
// listing — confirm against the original file.
2440bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2442 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2443
2444 LLVM_DEBUG(dbgs() << "Expanding atomic fence: " << *MI);
2445
2446 AtomicPseudoMIs.push_back(MI);
2447 bool Changed = false;
2448
2449 const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
2450
2451 if (MOI.isAtomic()) {
2452 LLVM_DEBUG(dbgs() << " Atomic: ordering=" << toIRString(MOI.getOrdering())
2453 << ", scope=" << toString(MOI.getScope())
2454 << ", ordering-AS=" << OrderingAddrSpace << "\n");
2455 const AtomicOrdering Order = MOI.getOrdering();
2456 if (Order == AtomicOrdering::Acquire) {
2457 // Acquire fences only need to wait on the previous atomic they pair with.
2458 Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
2459 SIMemOp::LOAD | SIMemOp::STORE,
2460 MOI.getIsCrossAddressSpaceOrdering(),
2461 Position::BEFORE, Order, /*AtomicsOnly=*/true);
2462 }
2463
2464 if (Order == AtomicOrdering::Release ||
2465 Order == AtomicOrdering::AcquireRelease ||
2466 Order == AtomicOrdering::SequentiallyConsistent) {
2467 /// TODO: This relies on a barrier always generating a waitcnt
2468 /// for LDS to ensure it is not reordered with the completion of
2469 /// the preceding LDS operations. If barrier had a memory
2470 /// ordering and memory scope, then library does not need to
2471 /// generate a fence. Could add support in this file for
2472 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2473 /// adding S_WAITCNT before a S_BARRIER.
2474 Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2475 MOI.getIsCrossAddressSpaceOrdering(),
2476 Position::BEFORE, MOI.isAVNone());
2477 }
2478
2479 // TODO: If both release and invalidate are happening they could be combined
2480 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2481 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2482 // track cache invalidate and write back instructions.
2483
// The acquire (cache invalidate) is skipped when MOI.isAVNone() is true.
2484 if ((Order == AtomicOrdering::Acquire ||
2485 Order == AtomicOrdering::AcquireRelease ||
2486 Order == AtomicOrdering::SequentiallyConsistent) &&
2487 !MOI.isAVNone()) {
2488 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2489 Position::BEFORE);
2490 }
2491
2492 return Changed;
2493 }
2494
2495 return Changed;
2496}
2497
// Applies the memory-model requirements described by MOI to the atomic
// cmpxchg / read-modify-write instruction MI: cache bypass, a release before
// MI and a wait plus acquire after MI, depending on the success and failure
// orderings. Non-atomic MOI leaves MI untouched. Returns true if anything was
// changed.
//
// NOTE(review): the embedded listing numbers skip 2499, so the remainder of
// the parameter list (which declares MI) appears to be missing from this
// listing — confirm against the original file.
2498bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2500 assert(MI->mayLoad() && MI->mayStore());
2501
2502 LLVM_DEBUG(dbgs() << "Expanding atomic cmpxchg/rmw: " << *MI);
2503
2504 bool Changed = false;
// Reference captured up front; it is what finalizeStore receives below.
2505 MachineInstr &RMWMI = *MI;
2506
2507 if (MOI.isAtomic()) {
2508 LLVM_DEBUG(dbgs() << " Atomic: ordering=" << toIRString(MOI.getOrdering())
2509 << ", failure-ordering="
2510 << toIRString(MOI.getFailureOrdering())
2511 << ", scope=" << toString(MOI.getScope())
2512 << ", ordering-AS=" << MOI.getOrderingAddrSpace()
2513 << ", instr-AS=" << MOI.getInstrAddrSpace() << "\n");
2514 const AtomicOrdering Order = MOI.getOrdering();
2515 if (Order == AtomicOrdering::Monotonic ||
2516 Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
2517 Order == AtomicOrdering::AcquireRelease ||
2518 Order == AtomicOrdering::SequentiallyConsistent) {
2519 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2520 MOI.getInstrAddrSpace());
2521 }
2522
2523 if (Order == AtomicOrdering::Release ||
2524 Order == AtomicOrdering::AcquireRelease ||
2525 Order == AtomicOrdering::SequentiallyConsistent ||
2526 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2527 Changed |=
2528 CC->insertRelease(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2529 MOI.getIsCrossAddressSpaceOrdering(),
2530 Position::BEFORE, MOI.isAVNone());
2531 }
2532
2533 if (Order == AtomicOrdering::Acquire ||
2534 Order == AtomicOrdering::AcquireRelease ||
2535 Order == AtomicOrdering::SequentiallyConsistent ||
2536 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2537 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2538 // Only wait on the previous atomic.
// The wait is requested for loads when isAtomicRet(*MI) is true and for
// stores otherwise.
2539 Changed |=
2540 CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2541 isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2542 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER,
2543 Order, /*AtomicsOnly=*/true);
2544 if (!MOI.isAVNone()) {
2545 Changed |= CC->insertAcquire(
2546 MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER);
2547 }
2548 }
2549
2550 Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
2551 return Changed;
2552 }
2553
2554 return Changed;
2555}
2556
// Applies volatile / nontemporal / last-use handling to the LDS DMA
// instruction MI and returns whether anything was changed.
//
// NOTE(review): the embedded listing numbers skip 2558, so the remainder of
// the parameter list (which declares MI) appears to be missing from this
// listing — confirm against the original file.
2557bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
2559 assert(MI->mayLoad() && MI->mayStore());
2560
2561 LLVM_DEBUG(dbgs() << "Expanding LDS DMA: " << *MI);
2562
2563 // The volatility or nontemporal-ness of the operation is a
2564 // function of the global memory, not the LDS.
// The operation kind is LOAD when the instruction may write LDS through DMA,
// and STORE otherwise.
2565 SIMemOp OpKind =
2566 SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE;
2567
2568 // Handle volatile and/or nontemporal markers on direct-to-LDS loads and
2569 // stores. The operation is treated as a volatile/nontemporal store
2570 // to its second argument.
2571 return CC->enableVolatileAndOrNonTemporal(
2572 MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(),
2573 MOI.isNonTemporal(), MOI.isLastUse());
2574}
2575
2576bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2577 const MachineModuleInfo &MMI =
2578 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2579 return SIMemoryLegalizer(MMI).run(MF);
2580}
2581
// New pass manager entry point: obtains the cached MachineModuleAnalysis
// result for the module containing MF, runs SIMemoryLegalizer on MF, and
// reports all analyses preserved when the legalizer made no change.
//
// NOTE(review): the embedded listing numbers skip 2583-2585 and 2591, so the
// function name, its parameter list, the declaration of MMI and the return
// taken when the function was changed appear to be missing from this
// listing — confirm against the original file.
2582PreservedAnalyses
2586 .getCachedResult<MachineModuleAnalysis>(
2587 *MF.getFunction().getParent());
2588 assert(MMI && "MachineModuleAnalysis must be available");
2589 if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
2590 return PreservedAnalyses::all();
2592}
2593
// Runs the memory legalizer over every instruction of MF: creates the
// subtarget-specific cache control object, unbundles memory bundles, expands
// each instruction flagged maybeAtomic through the first matching expand*
// routine, and finally erases the collected atomic pseudo instructions.
// Returns true if MF was changed.
2594bool SIMemoryLegalizer::run(MachineFunction &MF) {
2595 bool Changed = false;
2596
2597 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2598 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST,
2599 MF.getFunction());
2600 CC = SICacheControl::create(ST);
2601
2602 for (auto &MBB : MF) {
2603 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2604
2605 // Unbundle instructions after the post-RA scheduler.
2606 if (MI->isBundle() && MI->mayLoadOrStore()) {
// Walk the instructions bundled after the header, detach each from its
// predecessor and clear the internal-read flag on its register operands.
2607 MachineBasicBlock::instr_iterator II(MI->getIterator());
2608 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2609 I != E && I->isBundledWithPred(); ++I) {
2610 I->unbundleFromPred();
2611 for (MachineOperand &MO : I->operands())
2612 if (MO.isReg())
2613 MO.setIsInternalRead(false);
2614 }
2615
// The bundle header itself is erased; MI continues from the iterator that
// eraseFromParent() returns.
2616 MI = MI->eraseFromParent();
2617 }
2618
// The queries are tried in order; only the first one that yields info
// expands the instruction.
2619 if (MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic) {
2620 if (const auto &MOI = MOA.getLoadInfo(MI))
2621 Changed |= expandLoad(*MOI, MI);
2622 else if (const auto &MOI = MOA.getStoreInfo(MI))
2623 Changed |= expandStore(*MOI, MI);
2624 else if (const auto &MOI = MOA.getLDSDMAInfo(MI))
2625 Changed |= expandLDSDMA(*MOI, MI);
2626 else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2627 Changed |= expandAtomicFence(*MOI, MI);
2628 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2629 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2630 }
2631
// NOTE(review): the embedded listing numbers skip 2632, so the condition that
// guards the following statement appears to be missing from this listing —
// confirm against the original file.
2633 Changed |= CC->handleNonVolatile(*MI);
2634 }
2635 }
2636
2637 Changed |= removeAtomicPseudoMIs();
2638 return Changed;
2639}
2640
// Registers the legacy pass under DEBUG_TYPE / PASS_NAME via INITIALIZE_PASS.
2641INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)
2642
// Definition of the legacy pass ID, and the llvm::SIMemoryLegalizerID
// reference bound to it.
2643char SIMemoryLegalizerLegacy::ID = 0;
2644char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
2645
2647 return new SIMemoryLegalizerLegacy();
2648}
static std::optional< LoadInfo > getLoadInfo(const MachineInstr &MI)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU address space definition.
Provides AMDGPU specific target descriptions.
AMDGPU Machine Module Info.
AMDGPU promote alloca to vector or LDS
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Atomic ordering constants.
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
This header defines various interfaces for pass management in LLVM.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file provides utility for Memory Model Relaxation Annotations (MMRAs).
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
static cl::opt< bool > AmdgcnSkipCacheInvalidations("amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, cl::desc("Use this to skip inserting cache invalidating instructions."))
static bool isNonVolatileMemoryAccess(const MachineInstr &MI)
#define PASS_NAME
static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST)
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:119
#define PASS_NAME
static const uint32_t IV[8]
Definition blake3_impl.h:83
SyncScope::ID getClusterOneAddressSpaceSSID() const
std::optional< bool > isSyncScopeInclusion(SyncScope::ID A, SyncScope::ID B) const
In AMDGPU target synchronization scopes are inclusive, meaning a larger synchronization scope is incl...
SyncScope::ID getAgentOneAddressSpaceSSID() const
SyncScope::ID getSingleThreadOneAddressSpaceSSID() const
SyncScope::ID getWavefrontOneAddressSpaceSSID() const
SyncScope::ID getSystemOneAddressSpaceSSID() const
SyncScope::ID getWorkgroupOneAddressSpaceSSID() const
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:275
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Diagnostic information for unsupported feature in backend.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:354
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
A helper class to return the specified delimiter string after the first invocation of operator String...
Helper class to manipulate !mmra metadata nodes.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
A description of a memory reference used in the backend.
Ty & getObjFileInfo()
Keep track of various per-module pieces of information for backends that would like to do so.
MachineOperand class - Representation of each machine instruction operand.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
static bool isVMEM(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isBUF(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isAtomic(const MachineInstr &MI)
static bool isLoadMonitor(unsigned Opc)
static bool isLDSDMA(const MachineInstr &MI)
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:882
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:128
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an SmallVector or SmallString.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
bool isGFX10(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded)
unsigned getVmcntBitMask(const IsaVersion &Version)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
char & SIMemoryLegalizerID
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
bool isReleaseOrStronger(AtomicOrdering AO)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
const char * toIRString(AtomicOrdering ao)
String used by LLVM IR to represent atomic ordering.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
AtomicOrdering getMergedAtomicOrdering(AtomicOrdering AO, AtomicOrdering Other)
Return a single atomic ordering that is at least as strong as both the AO and Other orderings for an ...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
AtomicOrdering
Atomic ordering for LLVM's memory model.
DWARFExpression::Operation Op
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
FunctionPass * createSIMemoryLegalizerPass()
bool isStrongerThan(AtomicOrdering AO, AtomicOrdering Other)
Returns true if ao is stronger than other as defined by the AtomicOrdering lattice,...