//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//
15
#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Debug.h"

33using namespace llvm;
34using namespace llvm::AMDGPU;
35
36#define DEBUG_TYPE "si-memory-legalizer"
37#define PASS_NAME "SI Memory Legalizer"
38
40 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
41 cl::desc("Use this to skip inserting cache invalidating instructions."));
42
43namespace {
44
46
47/// Memory operation flags. Can be ORed together.
48enum class SIMemOp {
49 NONE = 0u,
50 LOAD = 1u << 0,
51 STORE = 1u << 1,
52 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
53};
54
/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
/// Ordered from narrowest to widest; code below relies on this ordering
/// (e.g. std::min is used to clamp a scope).
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
  AGENT,
  SYSTEM
};

73/// The distinct address spaces supported by the AMDGPU target for
74/// atomic memory operation. Can be ORed together.
75enum class SIAtomicAddrSpace {
76 NONE = 0u,
77 GLOBAL = 1u << 0,
78 LDS = 1u << 1,
79 SCRATCH = 1u << 2,
80 GDS = 1u << 3,
81 OTHER = 1u << 4,
82
83 /// The address spaces that can be accessed by a FLAT instruction.
84 FLAT = GLOBAL | LDS | SCRATCH,
85
86 /// The address spaces that support atomic instructions.
87 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
88
89 /// All address spaces.
90 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
91
92 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
93};
94
95#ifndef NDEBUG
96static StringRef toString(SIAtomicScope S) {
97 switch (S) {
98 case SIAtomicScope::NONE:
99 return "none";
100 case SIAtomicScope::SINGLETHREAD:
101 return "singlethread";
102 case SIAtomicScope::WAVEFRONT:
103 return "wavefront";
104 case SIAtomicScope::WORKGROUP:
105 return "workgroup";
106 case SIAtomicScope::CLUSTER:
107 return "cluster";
108 case SIAtomicScope::AGENT:
109 return "agent";
110 case SIAtomicScope::SYSTEM:
111 return "system";
112 }
113 llvm_unreachable("unknown atomic scope");
114}
115
116static raw_ostream &operator<<(raw_ostream &OS, SIAtomicAddrSpace AS) {
117 if (AS == SIAtomicAddrSpace::NONE) {
118 OS << "none";
119 return OS;
120 }
121 ListSeparator LS("|");
122 if ((AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE)
123 OS << LS << "global";
124 if ((AS & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE)
125 OS << LS << "lds";
126 if ((AS & SIAtomicAddrSpace::SCRATCH) != SIAtomicAddrSpace::NONE)
127 OS << LS << "scratch";
128 if ((AS & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE)
129 OS << LS << "gds";
130 if ((AS & SIAtomicAddrSpace::OTHER) != SIAtomicAddrSpace::NONE)
131 OS << LS << "other";
132 return OS;
133}
134#endif
135
136class SIMemOpInfo final {
137private:
138
139 friend class SIMemOpAccess;
140
141 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
142 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
143 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
144 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
145 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
146 bool IsCrossAddressSpaceOrdering = false;
147 bool IsVolatile = false;
148 bool IsNonTemporal = false;
149 bool IsLastUse = false;
150 bool IsCooperative = false;
151
152 // TODO: Should we assume Cooperative=true if no MMO is present?
153 SIMemOpInfo(
154 const GCNSubtarget &ST,
155 AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
156 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
157 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
158 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
159 bool IsCrossAddressSpaceOrdering = true,
160 AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
161 bool IsVolatile = false, bool IsNonTemporal = false,
162 bool IsLastUse = false, bool IsCooperative = false,
163 bool CanDemoteWorkgroupToWavefront = false)
164 : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
165 OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
166 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
167 IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
168 IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
169
170 if (Ordering == AtomicOrdering::NotAtomic) {
171 assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
172 assert(Scope == SIAtomicScope::NONE &&
173 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
174 !IsCrossAddressSpaceOrdering &&
175 FailureOrdering == AtomicOrdering::NotAtomic);
176 return;
177 }
178
179 assert(Scope != SIAtomicScope::NONE &&
180 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
181 SIAtomicAddrSpace::NONE &&
182 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
183 SIAtomicAddrSpace::NONE);
184
185 // There is also no cross address space ordering if the ordering
186 // address space is the same as the instruction address space and
187 // only contains a single address space.
188 if ((OrderingAddrSpace == InstrAddrSpace) &&
189 isPowerOf2_32(uint32_t(InstrAddrSpace)))
190 this->IsCrossAddressSpaceOrdering = false;
191
192 // Limit the scope to the maximum supported by the instruction's address
193 // spaces.
194 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
195 SIAtomicAddrSpace::NONE) {
196 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
197 } else if ((InstrAddrSpace &
198 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
199 SIAtomicAddrSpace::NONE) {
200 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
201 } else if ((InstrAddrSpace &
202 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
203 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
204 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
205 }
206
207 // On targets that have no concept of a workgroup cluster, use
208 // AGENT scope as a conservatively correct alternative.
209 if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
210 this->Scope = SIAtomicScope::AGENT;
211
212 // When max flat work-group size is at most the wavefront size, the
213 // work-group fits in a single wave, so LLVM workgroup scope matches
214 // wavefront scope. Demote workgroup → wavefront here for fences and for
215 // atomics with ordering stronger than monotonic.
216 if (CanDemoteWorkgroupToWavefront &&
217 this->Scope == SIAtomicScope::WORKGROUP &&
218 (llvm::isStrongerThan(this->Ordering, AtomicOrdering::Monotonic) ||
219 llvm::isStrongerThan(this->FailureOrdering,
220 AtomicOrdering::Monotonic)))
221 this->Scope = SIAtomicScope::WAVEFRONT;
222 }
223
224public:
225 /// \returns Atomic synchronization scope of the machine instruction used to
226 /// create this SIMemOpInfo.
227 SIAtomicScope getScope() const {
228 return Scope;
229 }
230
231 /// \returns Ordering constraint of the machine instruction used to
232 /// create this SIMemOpInfo.
233 AtomicOrdering getOrdering() const {
234 return Ordering;
235 }
236
237 /// \returns Failure ordering constraint of the machine instruction used to
238 /// create this SIMemOpInfo.
239 AtomicOrdering getFailureOrdering() const {
240 return FailureOrdering;
241 }
242
243 /// \returns The address spaces be accessed by the machine
244 /// instruction used to create this SIMemOpInfo.
245 SIAtomicAddrSpace getInstrAddrSpace() const {
246 return InstrAddrSpace;
247 }
248
249 /// \returns The address spaces that must be ordered by the machine
250 /// instruction used to create this SIMemOpInfo.
251 SIAtomicAddrSpace getOrderingAddrSpace() const {
252 return OrderingAddrSpace;
253 }
254
255 /// \returns Return true iff memory ordering of operations on
256 /// different address spaces is required.
257 bool getIsCrossAddressSpaceOrdering() const {
258 return IsCrossAddressSpaceOrdering;
259 }
260
261 /// \returns True if memory access of the machine instruction used to
262 /// create this SIMemOpInfo is volatile, false otherwise.
263 bool isVolatile() const {
264 return IsVolatile;
265 }
266
267 /// \returns True if memory access of the machine instruction used to
268 /// create this SIMemOpInfo is nontemporal, false otherwise.
269 bool isNonTemporal() const {
270 return IsNonTemporal;
271 }
272
273 /// \returns True if memory access of the machine instruction used to
274 /// create this SIMemOpInfo is last use, false otherwise.
275 bool isLastUse() const { return IsLastUse; }
276
277 /// \returns True if this is a cooperative load or store atomic.
278 bool isCooperative() const { return IsCooperative; }
279
280 /// \returns True if ordering constraint of the machine instruction used to
281 /// create this SIMemOpInfo is unordered or higher, false otherwise.
282 bool isAtomic() const {
283 return Ordering != AtomicOrdering::NotAtomic;
284 }
285
286};
287
288class SIMemOpAccess final {
289private:
290 const AMDGPUMachineModuleInfo *MMI = nullptr;
291 const GCNSubtarget &ST;
292 const bool CanDemoteWorkgroupToWavefront;
293
294 /// Reports unsupported message \p Msg for \p MI to LLVM context.
295 void reportUnsupported(const MachineBasicBlock::iterator &MI,
296 const char *Msg) const;
297
298 /// Inspects the target synchronization scope \p SSID and determines
299 /// the SI atomic scope it corresponds to, the address spaces it
300 /// covers, and whether the memory ordering applies between address
301 /// spaces.
302 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
303 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
304
305 /// \return Return a bit set of the address spaces accessed by \p AS.
306 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
307
308 /// \returns Info constructed from \p MI, which has at least machine memory
309 /// operand.
310 std::optional<SIMemOpInfo>
311 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
312
313public:
314 /// Construct class to support accessing the machine memory operands
315 /// of instructions in the machine function \p MF.
316 SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST,
317 const Function &F);
318
319 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
320 std::optional<SIMemOpInfo>
322
323 /// \returns Store info if \p MI is a store operation, "std::nullopt"
324 /// otherwise.
325 std::optional<SIMemOpInfo>
326 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
327
328 /// \returns Atomic fence info if \p MI is an atomic fence operation,
329 /// "std::nullopt" otherwise.
330 std::optional<SIMemOpInfo>
331 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
332
333 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
334 /// rmw operation, "std::nullopt" otherwise.
335 std::optional<SIMemOpInfo>
336 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
337
338 /// \returns DMA to LDS info if \p MI is as a direct-to/from-LDS load/store,
339 /// along with an indication of whether this is a load or store. If it is not
340 /// a direct-to-LDS operation, returns std::nullopt.
341 std::optional<SIMemOpInfo>
342 getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
343};
344
345class SICacheControl {
346protected:
347
348 /// AMDGPU subtarget info.
349 const GCNSubtarget &ST;
350
351 /// Instruction info.
352 const SIInstrInfo *TII = nullptr;
353
354 IsaVersion IV;
355
356 /// Whether to insert cache invalidating instructions.
357 bool InsertCacheInv;
358
359 SICacheControl(const GCNSubtarget &ST);
360
361 /// Sets CPol \p Bits to "true" if present in instruction \p MI.
362 /// \returns Returns true if \p MI is modified, false otherwise.
363 bool enableCPolBits(const MachineBasicBlock::iterator MI,
364 unsigned Bits) const;
365
366 /// Check if any atomic operation on AS can affect memory accessible via the
367 /// global address space.
368 bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
369
370public:
371 using CPol = AMDGPU::CPol::CPol;
372
373 /// Create a cache control for the subtarget \p ST.
374 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
375
376 /// Update \p MI memory load instruction to bypass any caches up to
377 /// the \p Scope memory scope for address spaces \p
378 /// AddrSpace. Return true iff the instruction was modified.
379 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
380 SIAtomicScope Scope,
381 SIAtomicAddrSpace AddrSpace) const = 0;
382
383 /// Update \p MI memory store instruction to bypass any caches up to
384 /// the \p Scope memory scope for address spaces \p
385 /// AddrSpace. Return true iff the instruction was modified.
386 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
387 SIAtomicScope Scope,
388 SIAtomicAddrSpace AddrSpace) const = 0;
389
390 /// Update \p MI memory read-modify-write instruction to bypass any caches up
391 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
392 /// iff the instruction was modified.
393 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
394 SIAtomicScope Scope,
395 SIAtomicAddrSpace AddrSpace) const = 0;
396
397 /// Update \p MI memory instruction of kind \p Op associated with address
398 /// spaces \p AddrSpace to indicate it is volatile and/or
399 /// nontemporal/last-use. Return true iff the instruction was modified.
400 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
401 SIAtomicAddrSpace AddrSpace,
402 SIMemOp Op, bool IsVolatile,
403 bool IsNonTemporal,
404 bool IsLastUse = false) const = 0;
405
406 /// Add final touches to a `mayStore` instruction \p MI, which may be a
407 /// Store or RMW instruction.
408 /// FIXME: This takes a MI because iterators aren't handled properly. When
409 /// this is called, they often point to entirely different insts. Thus we back
410 /// up the inst early and pass it here instead.
411 virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
412 return false;
413 };
414
415 /// Handle cooperative load/store atomics.
416 virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
418 "cooperative atomics are not available on this architecture");
419 }
420
421 /// Inserts any necessary instructions at position \p Pos relative
422 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
423 /// \p Op associated with address spaces \p AddrSpace have completed. Used
424 /// between memory instructions to enforce the order they become visible as
425 /// observed by other memory instructions executing in memory scope \p Scope.
426 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
427 /// address spaces. If \p AtomicsOnly is true, only insert waits for counters
428 /// that are used by atomic instructions.
429 /// Returns true iff any instructions inserted.
430 virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
431 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
432 bool IsCrossAddrSpaceOrdering, Position Pos,
433 AtomicOrdering Order, bool AtomicsOnly) const = 0;
434
435 /// Inserts any necessary instructions at position \p Pos relative to
436 /// instruction \p MI to ensure any subsequent memory instructions of this
437 /// thread with address spaces \p AddrSpace will observe the previous memory
438 /// operations by any thread for memory scopes up to memory scope \p Scope .
439 /// Returns true iff any instructions inserted.
440 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
441 SIAtomicScope Scope,
442 SIAtomicAddrSpace AddrSpace,
443 Position Pos) const = 0;
444
445 /// Inserts any necessary instructions at position \p Pos relative to
446 /// instruction \p MI to ensure previous memory instructions by this thread
447 /// with address spaces \p AddrSpace have completed and can be observed by
448 /// subsequent memory instructions by any thread executing in memory scope \p
449 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
450 /// between address spaces. Returns true iff any instructions inserted.
451 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
452 SIAtomicScope Scope,
453 SIAtomicAddrSpace AddrSpace,
454 bool IsCrossAddrSpaceOrdering,
455 Position Pos) const = 0;
456
457 /// Handle operations that are considered non-volatile.
458 /// See \ref isNonVolatileMemoryAccess
459 virtual bool handleNonVolatile(MachineInstr &MI) const { return false; }
460
461 /// Virtual destructor to allow derivations to be deleted.
462 virtual ~SICacheControl() = default;
463};
464
465/// Generates code sequences for the memory model of all GFX targets below
466/// GFX10.
467class SIGfx6CacheControl final : public SICacheControl {
468public:
469
470 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
471
472 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
473 SIAtomicScope Scope,
474 SIAtomicAddrSpace AddrSpace) const override;
475
476 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
477 SIAtomicScope Scope,
478 SIAtomicAddrSpace AddrSpace) const override;
479
480 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
481 SIAtomicScope Scope,
482 SIAtomicAddrSpace AddrSpace) const override;
483
484 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
485 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
486 bool IsVolatile, bool IsNonTemporal,
487 bool IsLastUse) const override;
488
489 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
490 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
491 bool IsCrossAddrSpaceOrdering, Position Pos,
492 AtomicOrdering Order, bool AtomicsOnly) const override;
493
494 bool insertAcquire(MachineBasicBlock::iterator &MI,
495 SIAtomicScope Scope,
496 SIAtomicAddrSpace AddrSpace,
497 Position Pos) const override;
498
499 bool insertRelease(MachineBasicBlock::iterator &MI,
500 SIAtomicScope Scope,
501 SIAtomicAddrSpace AddrSpace,
502 bool IsCrossAddrSpaceOrdering,
503 Position Pos) const override;
504};
505
506/// Generates code sequences for the memory model of GFX10/11.
507class SIGfx10CacheControl final : public SICacheControl {
508public:
509 SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
510
511 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
512 SIAtomicScope Scope,
513 SIAtomicAddrSpace AddrSpace) const override;
514
515 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
516 SIAtomicScope Scope,
517 SIAtomicAddrSpace AddrSpace) const override {
518 return false;
519 }
520
521 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
522 SIAtomicScope Scope,
523 SIAtomicAddrSpace AddrSpace) const override {
524 return false;
525 }
526
527 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
528 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
529 bool IsVolatile, bool IsNonTemporal,
530 bool IsLastUse) const override;
531
532 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
533 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
534 bool IsCrossAddrSpaceOrdering, Position Pos,
535 AtomicOrdering Order, bool AtomicsOnly) const override;
536
537 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
538 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
539
540 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
541 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
542 Position Pos) const override {
543 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
544 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
545 /*AtomicsOnly=*/false);
546 }
547};
548
549class SIGfx12CacheControl final : public SICacheControl {
550protected:
551 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
552 // \returns Returns true if \p MI is modified, false otherwise.
553 bool setTH(const MachineBasicBlock::iterator MI,
555
556 // Sets Scope policy to \p Value if CPol operand is present in instruction \p
557 // MI. \returns Returns true if \p MI is modified, false otherwise.
558 bool setScope(const MachineBasicBlock::iterator MI,
560
561 // Stores with system scope (SCOPE_SYS) need to wait for:
562 // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
563 // - non-returning-atomics - wait for STORECNT==0
564 // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
565 // since it does not distinguish atomics-with-return from regular stores.
566 // There is no need to wait if memory is cached (mtype != UC).
567 bool
568 insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
569
570 bool setAtomicScope(const MachineBasicBlock::iterator &MI,
571 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
572
573public:
574 SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {
575 // GFX120x and GFX125x memory models greatly overlap, and in some cases
576 // the behavior is the same if assuming GFX120x in CU mode.
577 assert(!ST.hasGFX1250Insts() || ST.hasGFX13Insts() || ST.isCuModeEnabled());
578 }
579
580 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
581 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
582 bool IsCrossAddrSpaceOrdering, Position Pos,
583 AtomicOrdering Order, bool AtomicsOnly) const override;
584
585 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
586 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
587
588 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
589 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
590 bool IsVolatile, bool IsNonTemporal,
591 bool IsLastUse) const override;
592
593 bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
594
595 bool handleCooperativeAtomic(MachineInstr &MI) const override;
596
597 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
598 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
599 Position Pos) const override;
600
601 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
602 SIAtomicScope Scope,
603 SIAtomicAddrSpace AddrSpace) const override {
604 return setAtomicScope(MI, Scope, AddrSpace);
605 }
606
607 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
608 SIAtomicScope Scope,
609 SIAtomicAddrSpace AddrSpace) const override {
610 return setAtomicScope(MI, Scope, AddrSpace);
611 }
612
613 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
614 SIAtomicScope Scope,
615 SIAtomicAddrSpace AddrSpace) const override {
616 return setAtomicScope(MI, Scope, AddrSpace);
617 }
618
619 bool handleNonVolatile(MachineInstr &MI) const override;
620};
621
622class SIMemoryLegalizer final {
623private:
624 const MachineModuleInfo &MMI;
625 /// Cache Control.
626 std::unique_ptr<SICacheControl> CC = nullptr;
627
628 /// List of atomic pseudo instructions.
629 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
630
631 /// Return true iff instruction \p MI is a atomic instruction that
632 /// returns a result.
633 bool isAtomicRet(const MachineInstr &MI) const {
635 }
636
637 /// Removes all processed atomic pseudo instructions from the current
638 /// function. Returns true if current function is modified, false otherwise.
639 bool removeAtomicPseudoMIs();
640
641 /// Expands load operation \p MI. Returns true if instructions are
642 /// added/deleted or \p MI is modified, false otherwise.
643 bool expandLoad(const SIMemOpInfo &MOI,
645 /// Expands store operation \p MI. Returns true if instructions are
646 /// added/deleted or \p MI is modified, false otherwise.
647 bool expandStore(const SIMemOpInfo &MOI,
649 /// Expands atomic fence operation \p MI. Returns true if
650 /// instructions are added/deleted or \p MI is modified, false otherwise.
651 bool expandAtomicFence(const SIMemOpInfo &MOI,
653 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
654 /// instructions are added/deleted or \p MI is modified, false otherwise.
655 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
657 /// Expands LDS DMA operation \p MI. Returns true if instructions are
658 /// added/deleted or \p MI is modified, false otherwise.
659 bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);
660
661public:
662 SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
663 bool run(MachineFunction &MF);
664};
665
666class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
667public:
668 static char ID;
669
670 SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {}
671
672 void getAnalysisUsage(AnalysisUsage &AU) const override {
673 AU.setPreservesCFG();
675 }
676
677 StringRef getPassName() const override {
678 return PASS_NAME;
679 }
680
681 bool runOnMachineFunction(MachineFunction &MF) override;
682};
683
684static const StringMap<SIAtomicAddrSpace> ASNames = {{
685 {"global", SIAtomicAddrSpace::GLOBAL},
686 {"local", SIAtomicAddrSpace::LDS},
687}};
688
689void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
690 const MachineFunction *MF = MI.getMF();
691 const Function &Fn = MF->getFunction();
693 raw_svector_ostream OS(Str);
694 OS << "unknown address space '" << AS << "'; expected one of ";
696 for (const auto &[Name, Val] : ASNames)
697 OS << LS << '\'' << Name << '\'';
698 Fn.getContext().diagnose(
699 DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
700}
701
702/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
703/// If this tag isn't present, or if it has no meaningful values, returns
704/// \p none, otherwise returns the address spaces specified by the MD.
705static std::optional<SIAtomicAddrSpace>
706getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
707 static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
708
709 auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
710 if (!MMRA)
711 return std::nullopt;
712
713 SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
714 for (const auto &[Prefix, Suffix] : MMRA) {
715 if (Prefix != FenceASPrefix)
716 continue;
717
718 if (auto It = ASNames.find(Suffix); It != ASNames.end())
719 Result |= It->second;
720 else
721 diagnoseUnknownMMRAASName(MI, Suffix);
722 }
723
724 if (Result == SIAtomicAddrSpace::NONE)
725 return std::nullopt;
726
727 return Result;
728}
729
} // end anonymous namespace

732void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
733 const char *Msg) const {
734 const Function &Func = MI->getMF()->getFunction();
735 Func.getContext().diagnose(
736 DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
737}
738
739std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
740SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
741 SIAtomicAddrSpace InstrAddrSpace) const {
742 if (SSID == SyncScope::System)
743 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
744 if (SSID == MMI->getAgentSSID())
745 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
746 if (SSID == MMI->getClusterSSID())
747 return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
748 if (SSID == MMI->getWorkgroupSSID())
749 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
750 true);
751 if (SSID == MMI->getWavefrontSSID())
752 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
753 true);
754 if (SSID == SyncScope::SingleThread)
755 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
756 true);
757 if (SSID == MMI->getSystemOneAddressSpaceSSID())
758 return std::tuple(SIAtomicScope::SYSTEM,
759 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
760 if (SSID == MMI->getAgentOneAddressSpaceSSID())
761 return std::tuple(SIAtomicScope::AGENT,
762 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
763 if (SSID == MMI->getClusterOneAddressSpaceSSID())
764 return std::tuple(SIAtomicScope::CLUSTER,
765 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
766 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
767 return std::tuple(SIAtomicScope::WORKGROUP,
768 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
769 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
770 return std::tuple(SIAtomicScope::WAVEFRONT,
771 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
772 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
773 return std::tuple(SIAtomicScope::SINGLETHREAD,
774 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
775 return std::nullopt;
776}
777
778SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
779 if (AS == AMDGPUAS::FLAT_ADDRESS)
780 return SIAtomicAddrSpace::FLAT;
781 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
782 return SIAtomicAddrSpace::GLOBAL;
783 if (AS == AMDGPUAS::LOCAL_ADDRESS)
784 return SIAtomicAddrSpace::LDS;
786 return SIAtomicAddrSpace::SCRATCH;
787 if (AS == AMDGPUAS::REGION_ADDRESS)
788 return SIAtomicAddrSpace::GDS;
791 return SIAtomicAddrSpace::GLOBAL;
792
793 return SIAtomicAddrSpace::OTHER;
794}
795
796// TODO: Consider moving single-wave workgroup->wavefront scope relaxation to an
797// IR pass (and extending it to other scoped operations), so middle-end
798// optimizations see wavefront scope earlier.
799SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
800 const GCNSubtarget &ST, const Function &F)
801 : MMI(&MMI_), ST(ST),
802 CanDemoteWorkgroupToWavefront(ST.isSingleWavefrontWorkgroup(F)) {}
803
804std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
805 const MachineBasicBlock::iterator &MI) const {
806 assert(MI->getNumMemOperands() > 0);
807
809 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
810 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
811 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
812 bool IsNonTemporal = true;
813 bool IsVolatile = false;
814 bool IsLastUse = false;
815 bool IsCooperative = false;
816
817 // Validator should check whether or not MMOs cover the entire set of
818 // locations accessed by the memory instruction.
819 for (const auto &MMO : MI->memoperands()) {
820 IsNonTemporal &= MMO->isNonTemporal();
821 IsVolatile |= MMO->isVolatile();
822 IsLastUse |= MMO->getFlags() & MOLastUse;
823 IsCooperative |= MMO->getFlags() & MOCooperative;
824 InstrAddrSpace |=
825 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
826 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
827 if (OpOrdering != AtomicOrdering::NotAtomic) {
828 const auto &IsSyncScopeInclusion =
829 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
830 if (!IsSyncScopeInclusion) {
831 reportUnsupported(MI,
832 "Unsupported non-inclusive atomic synchronization scope");
833 return std::nullopt;
834 }
835
836 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
837 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
838 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
839 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
840 FailureOrdering =
841 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
842 }
843 }
844
845 // FIXME: The MMO of buffer atomic instructions does not always have an atomic
846 // ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it
847 // here, but the lowering should really be cleaned up at some point.
848 if ((ST.getGeneration() >= GCNSubtarget::GFX12) && SIInstrInfo::isBUF(*MI) &&
849 SIInstrInfo::isAtomic(*MI) && Ordering == AtomicOrdering::NotAtomic)
850 Ordering = AtomicOrdering::Monotonic;
851
852 SIAtomicScope Scope = SIAtomicScope::NONE;
853 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
854 bool IsCrossAddressSpaceOrdering = false;
855 if (Ordering != AtomicOrdering::NotAtomic) {
856 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
857 if (!ScopeOrNone) {
858 reportUnsupported(MI, "Unsupported atomic synchronization scope");
859 return std::nullopt;
860 }
861 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
862 *ScopeOrNone;
863 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
864 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
865 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
866 reportUnsupported(MI, "Unsupported atomic address space");
867 return std::nullopt;
868 }
869 }
870 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
871 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
872 IsNonTemporal, IsLastUse, IsCooperative,
873 CanDemoteWorkgroupToWavefront);
874}
875
876std::optional<SIMemOpInfo>
877SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
878 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
879
880 if (!(MI->mayLoad() && !MI->mayStore()))
881 return std::nullopt;
882
883 // Be conservative if there are no memory operands.
884 if (MI->getNumMemOperands() == 0)
885 return SIMemOpInfo(ST);
886
887 return constructFromMIWithMMO(MI);
888}
889
890std::optional<SIMemOpInfo>
891SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
892 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
893
894 if (!(!MI->mayLoad() && MI->mayStore()))
895 return std::nullopt;
896
897 // Be conservative if there are no memory operands.
898 if (MI->getNumMemOperands() == 0)
899 return SIMemOpInfo(ST);
900
901 return constructFromMIWithMMO(MI);
902}
903
904std::optional<SIMemOpInfo>
905SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
906 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
907
908 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
909 return std::nullopt;
910
912 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
913
914 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
915 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
916 if (!ScopeOrNone) {
917 reportUnsupported(MI, "Unsupported atomic synchronization scope");
918 return std::nullopt;
919 }
920
921 SIAtomicScope Scope = SIAtomicScope::NONE;
922 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
923 bool IsCrossAddressSpaceOrdering = false;
924 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
925 *ScopeOrNone;
926
927 if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
928 // We currently expect refineOrderingAS to be the only place that
929 // can refine the AS ordered by the fence.
930 // If that changes, we need to review the semantics of that function
931 // in case it needs to preserve certain address spaces.
932 reportUnsupported(MI, "Unsupported atomic address space");
933 return std::nullopt;
934 }
935
936 auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI);
937 if (SynchronizeAS)
938 OrderingAddrSpace = *SynchronizeAS;
939
940 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
941 SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
942 AtomicOrdering::NotAtomic, false, false, false, false,
943 CanDemoteWorkgroupToWavefront);
944}
945
946std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
947 const MachineBasicBlock::iterator &MI) const {
948 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
949
950 if (!(MI->mayLoad() && MI->mayStore()))
951 return std::nullopt;
952
953 // Be conservative if there are no memory operands.
954 if (MI->getNumMemOperands() == 0)
955 return SIMemOpInfo(ST);
956
957 return constructFromMIWithMMO(MI);
958}
959
960std::optional<SIMemOpInfo>
961SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
962 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
963
965 return std::nullopt;
966
967 return constructFromMIWithMMO(MI);
968}
969
/// \returns true if \p MI has one or more MMO, and all of them are fit for
/// being marked as non-volatile. This means that either they are accessing the
/// constant address space, are accessing a known invariant memory location, or
/// that they are marked with the non-volatile metadata/MMO flag.
  // An instruction with no MMOs tells us nothing about what it accesses, so
  // be conservative and refuse.
  if (MI.getNumMemOperands() == 0)
    return false;
  // Every MMO must be thread-private or invariant memory.
  return all_of(MI.memoperands(), [&](const MachineMemOperand *MMO) {
    return MMO->getFlags() & (MOThreadPrivate | MachineMemOperand::MOInvariant);
  });
}
981
// Cache commonly used subtarget state. Insertion of cache-invalidating
// instructions can be suppressed globally with the
// -amdgcn-skip-cache-invalidations debug option.
SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}
987
988bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI,
989 unsigned Bits) const {
990 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
991 if (!CPol)
992 return false;
993
994 CPol->setImm(CPol->getImm() | Bits);
995 return true;
996}
997
998bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
999 assert((!ST.hasGloballyAddressableScratch() ||
1000 (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE ||
1001 (AS & SIAtomicAddrSpace::SCRATCH) == SIAtomicAddrSpace::NONE) &&
1002 "scratch instructions should already be replaced by flat "
1003 "instructions if GloballyAddressableScratch is enabled");
1004 return (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;
1005}
1006
1007/* static */
1008std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
1009 GCNSubtarget::Generation Generation = ST.getGeneration();
1010 if (Generation < AMDGPUSubtarget::GFX10)
1011 return std::make_unique<SIGfx6CacheControl>(ST);
1012 if (Generation < AMDGPUSubtarget::GFX12)
1013 return std::make_unique<SIGfx10CacheControl>(ST);
1014 return std::make_unique<SIGfx12CacheControl>(ST);
1015}
1016
1017bool SIGfx6CacheControl::enableLoadCacheBypass(
1019 SIAtomicScope Scope,
1020 SIAtomicAddrSpace AddrSpace) const {
1021 assert(MI->mayLoad() && !MI->mayStore());
1022
1023 if (!canAffectGlobalAddrSpace(AddrSpace)) {
1024 /// The scratch address space does not need the global memory caches
1025 /// to be bypassed as all memory operations by the same thread are
1026 /// sequentially consistent, and no other thread can access scratch
1027 /// memory.
1028
1029 /// Other address spaces do not have a cache.
1030 return false;
1031 }
1032
1033 bool Changed = false;
1034 switch (Scope) {
1035 case SIAtomicScope::SYSTEM:
1036 if (ST.hasGFX940Insts()) {
1037 // Set SC bits to indicate system scope.
1038 Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
1039 break;
1040 }
1041 [[fallthrough]];
1042 case SIAtomicScope::AGENT:
1043 if (ST.hasGFX940Insts()) {
1044 // Set SC bits to indicate agent scope.
1045 Changed |= enableCPolBits(MI, CPol::SC1);
1046 } else {
1047 // Set L1 cache policy to MISS_EVICT.
1048 // Note: there is no L2 cache bypass policy at the ISA level.
1049 Changed |= enableCPolBits(MI, CPol::GLC);
1050 }
1051 break;
1052 case SIAtomicScope::WORKGROUP:
1053 if (ST.hasGFX940Insts()) {
1054 // In threadgroup split mode the waves of a work-group can be executing
1055 // on different CUs. Therefore need to bypass the L1 which is per CU.
1056 // Otherwise in non-threadgroup split mode all waves of a work-group are
1057 // on the same CU, and so the L1 does not need to be bypassed. Setting
1058 // SC bits to indicate work-group scope will do this automatically.
1059 Changed |= enableCPolBits(MI, CPol::SC0);
1060 } else if (ST.hasGFX90AInsts()) {
1061 // In threadgroup split mode the waves of a work-group can be executing
1062 // on different CUs. Therefore need to bypass the L1 which is per CU.
1063 // Otherwise in non-threadgroup split mode all waves of a work-group are
1064 // on the same CU, and so the L1 does not need to be bypassed.
1065 if (ST.isTgSplitEnabled())
1066 Changed |= enableCPolBits(MI, CPol::GLC);
1067 }
1068 break;
1069 case SIAtomicScope::WAVEFRONT:
1070 case SIAtomicScope::SINGLETHREAD:
1071 // No cache to bypass.
1072 break;
1073 default:
1074 llvm_unreachable("Unsupported synchronization scope");
1075 }
1076
1077 return Changed;
1078}
1079
1080bool SIGfx6CacheControl::enableStoreCacheBypass(
1082 SIAtomicScope Scope,
1083 SIAtomicAddrSpace AddrSpace) const {
1084 assert(!MI->mayLoad() && MI->mayStore());
1085 bool Changed = false;
1086
1087 /// For targets other than GFX940, the L1 cache is write through so does not
1088 /// need to be bypassed. There is no bypass control for the L2 cache at the
1089 /// isa level.
1090
1091 if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
1092 switch (Scope) {
1093 case SIAtomicScope::SYSTEM:
1094 // Set SC bits to indicate system scope.
1095 Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
1096 break;
1097 case SIAtomicScope::AGENT:
1098 // Set SC bits to indicate agent scope.
1099 Changed |= enableCPolBits(MI, CPol::SC1);
1100 break;
1101 case SIAtomicScope::WORKGROUP:
1102 // Set SC bits to indicate workgroup scope.
1103 Changed |= enableCPolBits(MI, CPol::SC0);
1104 break;
1105 case SIAtomicScope::WAVEFRONT:
1106 case SIAtomicScope::SINGLETHREAD:
1107 // Leave SC bits unset to indicate wavefront scope.
1108 break;
1109 default:
1110 llvm_unreachable("Unsupported synchronization scope");
1111 }
1112
1113 /// The scratch address space does not need the global memory caches
1114 /// to be bypassed as all memory operations by the same thread are
1115 /// sequentially consistent, and no other thread can access scratch
1116 /// memory.
1117
1118 /// Other address spaces do not have a cache.
1119 }
1120
1121 return Changed;
1122}
1123
1124bool SIGfx6CacheControl::enableRMWCacheBypass(
1126 SIAtomicScope Scope,
1127 SIAtomicAddrSpace AddrSpace) const {
1128 assert(MI->mayLoad() && MI->mayStore());
1129 bool Changed = false;
1130
1131 /// For targets other than GFX940, do not set GLC for RMW atomic operations as
1132 /// L0/L1 cache is automatically bypassed, and the GLC bit is instead used to
1133 /// indicate if they are return or no-return. Note: there is no L2 cache
1134 /// coherent bypass control at the ISA level.
1135 /// For GFX90A+, RMW atomics implicitly bypass the L1 cache.
1136
1137 if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
1138 switch (Scope) {
1139 case SIAtomicScope::SYSTEM:
1140 // Set SC1 bit to indicate system scope.
1141 Changed |= enableCPolBits(MI, CPol::SC1);
1142 break;
1143 case SIAtomicScope::AGENT:
1144 case SIAtomicScope::WORKGROUP:
1145 case SIAtomicScope::WAVEFRONT:
1146 case SIAtomicScope::SINGLETHREAD:
1147 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1148 // to indicate system or agent scope. The SC0 bit is used to indicate if
1149 // they are return or no-return. Leave SC1 bit unset to indicate agent
1150 // scope.
1151 break;
1152 default:
1153 llvm_unreachable("Unsupported synchronization scope");
1154 }
1155 }
1156
1157 return Changed;
1158}
1159
/// Apply cache policy for a volatile and/or non-temporal load or store, and
/// for volatile accesses additionally wait for the access to complete at
/// system scope. \returns true if \p MI was modified.
/// IsLastUse is not used by this pre-GFX10 implementation.
bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // handle it as do not want to pessimize all atomics. Also they do not support
  // the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (ST.hasGFX940Insts()) {
      // Set SC bits to indicate system scope.
      Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
    } else if (Op == SIMemOp::LOAD) {
      // Set L1 cache policy to be MISS_EVICT for load instructions
      // and MISS_LRU for store instructions.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableCPolBits(MI, CPol::GLC);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER, AtomicOrdering::Unordered,
                          /*AtomicsOnly=*/false);

    return Changed;
  }

  if (IsNonTemporal) {
    if (ST.hasGFX940Insts()) {
      // GFX940 has a dedicated non-temporal cache-policy bit.
      Changed |= enableCPolBits(MI, CPol::NT);
    } else {
      // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
      // for both loads and stores, and the L2 cache policy to STREAM.
      Changed |= enableCPolBits(MI, CPol::SLC | CPol::GLC);
    }
    return Changed;
  }

  return Changed;
}
1212
1213bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1214 SIAtomicScope Scope,
1215 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1216 bool IsCrossAddrSpaceOrdering, Position Pos,
1217 AtomicOrdering Order,
1218 bool AtomicsOnly) const {
1219 bool Changed = false;
1220
1221 MachineBasicBlock &MBB = *MI->getParent();
1222 const DebugLoc &DL = MI->getDebugLoc();
1223
1224 if (Pos == Position::AFTER)
1225 ++MI;
1226
1227 // GFX90A+
1228 if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) {
1229 // In threadgroup split mode the waves of a work-group can be executing on
1230 // different CUs. Therefore need to wait for global or GDS memory operations
1231 // to complete to ensure they are visible to waves in the other CUs.
1232 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1233 // the same CU, so no need to wait for global memory as all waves in the
1234 // work-group access the same the L1, nor wait for GDS as access are ordered
1235 // on a CU.
1236 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1237 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1238 (Scope == SIAtomicScope::WORKGROUP)) {
1239 // Same as <GFX90A at AGENT scope;
1240 Scope = SIAtomicScope::AGENT;
1241 }
1242 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1243 // LDS memory operations.
1244 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1245 }
1246
1247 bool VMCnt = false;
1248 bool LGKMCnt = false;
1249
1250 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1251 SIAtomicAddrSpace::NONE) {
1252 switch (Scope) {
1253 case SIAtomicScope::SYSTEM:
1254 case SIAtomicScope::AGENT:
1255 VMCnt |= true;
1256 break;
1257 case SIAtomicScope::WORKGROUP:
1258 case SIAtomicScope::WAVEFRONT:
1259 case SIAtomicScope::SINGLETHREAD:
1260 // The L1 cache keeps all memory operations in order for
1261 // wavefronts in the same work-group.
1262 break;
1263 default:
1264 llvm_unreachable("Unsupported synchronization scope");
1265 }
1266 }
1267
1268 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1269 switch (Scope) {
1270 case SIAtomicScope::SYSTEM:
1271 case SIAtomicScope::AGENT:
1272 case SIAtomicScope::WORKGROUP:
1273 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1274 // not needed as LDS operations for all waves are executed in a total
1275 // global ordering as observed by all waves. Required if also
1276 // synchronizing with global/GDS memory as LDS operations could be
1277 // reordered with respect to later global/GDS memory operations of the
1278 // same wave.
1279 LGKMCnt |= IsCrossAddrSpaceOrdering;
1280 break;
1281 case SIAtomicScope::WAVEFRONT:
1282 case SIAtomicScope::SINGLETHREAD:
1283 // The LDS keeps all memory operations in order for
1284 // the same wavefront.
1285 break;
1286 default:
1287 llvm_unreachable("Unsupported synchronization scope");
1288 }
1289 }
1290
1291 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1292 switch (Scope) {
1293 case SIAtomicScope::SYSTEM:
1294 case SIAtomicScope::AGENT:
1295 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1296 // is not needed as GDS operations for all waves are executed in a total
1297 // global ordering as observed by all waves. Required if also
1298 // synchronizing with global/LDS memory as GDS operations could be
1299 // reordered with respect to later global/LDS memory operations of the
1300 // same wave.
1301 LGKMCnt |= IsCrossAddrSpaceOrdering;
1302 break;
1303 case SIAtomicScope::WORKGROUP:
1304 case SIAtomicScope::WAVEFRONT:
1305 case SIAtomicScope::SINGLETHREAD:
1306 // The GDS keeps all memory operations in order for
1307 // the same work-group.
1308 break;
1309 default:
1310 llvm_unreachable("Unsupported synchronization scope");
1311 }
1312 }
1313
1314 if (VMCnt || LGKMCnt) {
1315 unsigned WaitCntImmediate =
1317 VMCnt ? 0 : getVmcntBitMask(IV),
1319 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1320 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1321 .addImm(WaitCntImmediate);
1322 Changed = true;
1323 }
1324
1325 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1326 // at workgroup-scoped release operations that specify the LDS address space.
1327 // SIInsertWaitcnts will later replace this with a vmcnt().
1328 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1329 Scope == SIAtomicScope::WORKGROUP &&
1330 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1331 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1332 Changed = true;
1333 }
1334
1335 if (Pos == Position::AFTER)
1336 --MI;
1337
1338 return Changed;
1339}
1340
1342 if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
1343 return false;
1344 return !ST.isAmdPalOS() && !ST.isMesa3DOS();
1345}
1346
1347bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1348 SIAtomicScope Scope,
1349 SIAtomicAddrSpace AddrSpace,
1350 Position Pos) const {
1351 if (!InsertCacheInv)
1352 return false;
1353
1354 bool Changed = false;
1355
1356 MachineBasicBlock &MBB = *MI->getParent();
1357 const DebugLoc &DL = MI->getDebugLoc();
1358
1359 if (Pos == Position::AFTER)
1360 ++MI;
1361
1362 const unsigned InvalidateL1 = canUseBUFFER_WBINVL1_VOL(ST)
1363 ? AMDGPU::BUFFER_WBINVL1_VOL
1364 : AMDGPU::BUFFER_WBINVL1;
1365
1366 if (canAffectGlobalAddrSpace(AddrSpace)) {
1367 switch (Scope) {
1368 case SIAtomicScope::SYSTEM:
1369 if (ST.hasGFX940Insts()) {
1370 // Ensures that following loads will not see stale remote VMEM data or
1371 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
1372 // and CC will never be stale due to the local memory probes.
1373 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1374 // Set SC bits to indicate system scope.
1376 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1377 // hardware does not reorder memory operations by the same wave with
1378 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1379 // remove any cache lines of earlier writes by the same wave and ensures
1380 // later reads by the same wave will refetch the cache lines.
1381 Changed = true;
1382 break;
1383 }
1384
1385 if (ST.hasGFX90AInsts()) {
1386 // Ensures that following loads will not see stale remote VMEM data or
1387 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
1388 // and CC will never be stale due to the local memory probes.
1389 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1390 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1391 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1392 // hardware does not reorder memory operations by the same wave with
1393 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed
1394 // to remove any cache lines of earlier writes by the same wave and
1395 // ensures later reads by the same wave will refetch the cache lines.
1396 Changed = true;
1397 break;
1398 }
1399 [[fallthrough]];
1400 case SIAtomicScope::AGENT:
1401 if (ST.hasGFX940Insts()) {
1402 // Ensures that following loads will not see stale remote date or local
1403 // MTYPE NC global data. Local MTYPE RW and CC memory will never be
1404 // stale due to the memory probes.
1405 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1406 // Set SC bits to indicate agent scope.
1408 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1409 // does not reorder memory operations with respect to preceeding buffer
1410 // invalidate. The invalidate is guaranteed to remove any cache lines of
1411 // earlier writes and ensures later writes will refetch the cache lines.
1412 } else
1413 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1414 Changed = true;
1415 break;
1416 case SIAtomicScope::WORKGROUP:
1417 if (ST.isTgSplitEnabled()) {
1418 if (ST.hasGFX940Insts()) {
1419 // In threadgroup split mode the waves of a work-group can be
1420 // executing on different CUs. Therefore need to invalidate the L1
1421 // which is per CU. Otherwise in non-threadgroup split mode all waves
1422 // of a work-group are on the same CU, and so the L1 does not need to
1423 // be invalidated.
1424
1425 // Ensures L1 is invalidated if in threadgroup split mode. In
1426 // non-threadgroup split mode it is a NOP, but no point generating it
1427 // in that case if know not in that mode.
1428 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1429 // Set SC bits to indicate work-group scope.
1431 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1432 // does not reorder memory operations with respect to preceeding
1433 // buffer invalidate. The invalidate is guaranteed to remove any cache
1434 // lines of earlier writes and ensures later writes will refetch the
1435 // cache lines.
1436 Changed = true;
1437 } else if (ST.hasGFX90AInsts()) {
1438 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1439 Changed = true;
1440 }
1441 }
1442 break;
1443 case SIAtomicScope::WAVEFRONT:
1444 case SIAtomicScope::SINGLETHREAD:
1445 // For GFX940, we could generate "BUFFER_INV" but it would do nothing as
1446 // there are no caches to invalidate. All other targets have no cache to
1447 // invalidate.
1448 break;
1449 default:
1450 llvm_unreachable("Unsupported synchronization scope");
1451 }
1452 }
1453
1454 /// The scratch address space does not need the global memory cache
1455 /// to be flushed as all memory operations by the same thread are
1456 /// sequentially consistent, and no other thread can access scratch
1457 /// memory.
1458
1459 /// Other address spaces do not have a cache.
1460
1461 if (Pos == Position::AFTER)
1462 --MI;
1463
1464 return Changed;
1465}
1466
1467bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1468 SIAtomicScope Scope,
1469 SIAtomicAddrSpace AddrSpace,
1470 bool IsCrossAddrSpaceOrdering,
1471 Position Pos) const {
1472 bool Changed = false;
1473
1474 if (ST.hasGFX90AInsts()) {
1475 MachineBasicBlock &MBB = *MI->getParent();
1476 const DebugLoc &DL = MI->getDebugLoc();
1477
1478 if (Pos == Position::AFTER)
1479 ++MI;
1480
1481 if (canAffectGlobalAddrSpace(AddrSpace)) {
1482 switch (Scope) {
1483 case SIAtomicScope::SYSTEM:
1484 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1485 // hardware does not reorder memory operations by the same wave with
1486 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1487 // to initiate writeback of any dirty cache lines of earlier writes by
1488 // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1489 // writeback has completed.
1490 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1491 // Set SC bits to indicate system scope.
1493 Changed = true;
1494 break;
1495 case SIAtomicScope::AGENT:
1496 if (ST.hasGFX940Insts()) {
1497 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1498 // Set SC bits to indicate agent scope.
1500
1501 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1502 // SIAtomicScope::AGENT, the following insertWait will generate the
1503 // required "S_WAITCNT vmcnt(0)".
1504 Changed = true;
1505 }
1506 break;
1507 case SIAtomicScope::WORKGROUP:
1508 case SIAtomicScope::WAVEFRONT:
1509 case SIAtomicScope::SINGLETHREAD:
1510 // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it
1511 // would writeback, and would require an otherwise unnecessary
1512 // "S_WAITCNT vmcnt(0)".
1513 break;
1514 default:
1515 llvm_unreachable("Unsupported synchronization scope");
1516 }
1517 }
1518
1519 if (Pos == Position::AFTER)
1520 --MI;
1521 }
1522
1523 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1524 // S_WAITCNT needed.
1525 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1526 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
1527 /*AtomicsOnly=*/false);
1528
1529 return Changed;
1530}
1531
/// Make an atomic load bypass caches that are not coherent at \p Scope by
/// setting cache-policy bits on \p MI. \returns true if \p MI was modified.
bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if (canAffectGlobalAddrSpace(AddrSpace)) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L0 and L1 cache policies to MISS_EVICT.
      // Note: there is no L2 cache coherent bypass control at the ISA level.
      // For GFX10, set GLC+DLC, for GFX11, only set GLC.
      Changed |=
          enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0));
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableCPolBits(MI, CPol::GLC);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}
1574
/// Apply cache policy for a volatile and/or non-temporal load or store, and
/// for volatile accesses additionally wait for the access to complete at
/// system scope. \returns true if \p MI was modified.
/// IsLastUse is not used by this GFX10/GFX11 implementation.
bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // handle it as do not want to pessimize all atomics. Also they do not support
  // the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
    if (Op == SIMemOp::LOAD) {
      Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
    }

    // GFX11: Set MALL NOALLOC for both load and store instructions.
    if (AMDGPU::isGFX11(ST))
      Changed |= enableCPolBits(MI, CPol::DLC);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER, AtomicOrdering::Unordered,
                          /*AtomicsOnly=*/false);
    return Changed;
  }

  if (IsNonTemporal) {
    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
    // and L2 cache policy to STREAM.
    // For stores setting both GLC and SLC configures L0 and L1 cache policy
    // to MISS_EVICT and the L2 cache policy to STREAM.
    if (Op == SIMemOp::STORE)
      Changed |= enableCPolBits(MI, CPol::GLC);
    Changed |= enableCPolBits(MI, CPol::SLC);

    // GFX11: Set MALL NOALLOC for both load and store instructions.
    if (AMDGPU::isGFX11(ST))
      Changed |= enableCPolBits(MI, CPol::DLC);

    return Changed;
  }

  return Changed;
}
1633
1634bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1635 SIAtomicScope Scope,
1636 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1637 bool IsCrossAddrSpaceOrdering,
1638 Position Pos, AtomicOrdering Order,
1639 bool AtomicsOnly) const {
1640 bool Changed = false;
1641
1642 MachineBasicBlock &MBB = *MI->getParent();
1643 const DebugLoc &DL = MI->getDebugLoc();
1644
1645 if (Pos == Position::AFTER)
1646 ++MI;
1647
1648 bool VMCnt = false;
1649 bool VSCnt = false;
1650 bool LGKMCnt = false;
1651
1652 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1653 SIAtomicAddrSpace::NONE) {
1654 switch (Scope) {
1655 case SIAtomicScope::SYSTEM:
1656 case SIAtomicScope::AGENT:
1657 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1658 VMCnt |= true;
1659 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1660 VSCnt |= true;
1661 break;
1662 case SIAtomicScope::WORKGROUP:
1663 // In WGP mode the waves of a work-group can be executing on either CU of
1664 // the WGP. Therefore need to wait for operations to complete to ensure
1665 // they are visible to waves in the other CU as the L0 is per CU.
1666 // Otherwise in CU mode and all waves of a work-group are on the same CU
1667 // which shares the same L0. Note that we still need to wait when
1668 // performing a release in this mode to respect the transitivity of
1669 // happens-before, e.g. other waves of the workgroup must be able to
1670 // release the memory from another wave at a wider scope.
1671 if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) {
1672 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1673 VMCnt |= true;
1674 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1675 VSCnt |= true;
1676 }
1677 break;
1678 case SIAtomicScope::WAVEFRONT:
1679 case SIAtomicScope::SINGLETHREAD:
1680 // The L0 cache keeps all memory operations in order for
1681 // work-items in the same wavefront.
1682 break;
1683 default:
1684 llvm_unreachable("Unsupported synchronization scope");
1685 }
1686 }
1687
1688 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1689 switch (Scope) {
1690 case SIAtomicScope::SYSTEM:
1691 case SIAtomicScope::AGENT:
1692 case SIAtomicScope::WORKGROUP:
1693 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1694 // not needed as LDS operations for all waves are executed in a total
1695 // global ordering as observed by all waves. Required if also
1696 // synchronizing with global/GDS memory as LDS operations could be
1697 // reordered with respect to later global/GDS memory operations of the
1698 // same wave.
1699 LGKMCnt |= IsCrossAddrSpaceOrdering;
1700 break;
1701 case SIAtomicScope::WAVEFRONT:
1702 case SIAtomicScope::SINGLETHREAD:
1703 // The LDS keeps all memory operations in order for
1704 // the same wavefront.
1705 break;
1706 default:
1707 llvm_unreachable("Unsupported synchronization scope");
1708 }
1709 }
1710
1711 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1712 switch (Scope) {
1713 case SIAtomicScope::SYSTEM:
1714 case SIAtomicScope::AGENT:
1715 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1716 // is not needed as GDS operations for all waves are executed in a total
1717 // global ordering as observed by all waves. Required if also
1718 // synchronizing with global/LDS memory as GDS operations could be
1719 // reordered with respect to later global/LDS memory operations of the
1720 // same wave.
1721 LGKMCnt |= IsCrossAddrSpaceOrdering;
1722 break;
1723 case SIAtomicScope::WORKGROUP:
1724 case SIAtomicScope::WAVEFRONT:
1725 case SIAtomicScope::SINGLETHREAD:
1726 // The GDS keeps all memory operations in order for
1727 // the same work-group.
1728 break;
1729 default:
1730 llvm_unreachable("Unsupported synchronization scope");
1731 }
1732 }
1733
1734 if (VMCnt || LGKMCnt) {
1735 unsigned WaitCntImmediate =
1737 VMCnt ? 0 : getVmcntBitMask(IV),
1739 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1740 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1741 .addImm(WaitCntImmediate);
1742 Changed = true;
1743 }
1744
1745 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1746 // at workgroup-scoped release operations that specify the LDS address space.
1747 // SIInsertWaitcnts will later replace this with a vmcnt().
1748 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1749 Scope == SIAtomicScope::WORKGROUP &&
1750 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1751 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1752 Changed = true;
1753 }
1754
1755 if (VSCnt) {
1756 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
1757 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1758 .addImm(0);
1759 Changed = true;
1760 }
1761
1762 if (Pos == Position::AFTER)
1763 --MI;
1764
1765 return Changed;
1766}
1767
1768bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1769 SIAtomicScope Scope,
1770 SIAtomicAddrSpace AddrSpace,
1771 Position Pos) const {
1772 if (!InsertCacheInv)
1773 return false;
1774
1775 bool Changed = false;
1776
1777 MachineBasicBlock &MBB = *MI->getParent();
1778 const DebugLoc &DL = MI->getDebugLoc();
1779
1780 if (Pos == Position::AFTER)
1781 ++MI;
1782
1783 if (canAffectGlobalAddrSpace(AddrSpace)) {
1784 switch (Scope) {
1785 case SIAtomicScope::SYSTEM:
1786 case SIAtomicScope::AGENT:
1787 // The order of invalidates matter here. We must invalidate "outer in"
1788 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
1789 // invalidated.
1790 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1791 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1792 Changed = true;
1793 break;
1794 case SIAtomicScope::WORKGROUP:
1795 // In WGP mode the waves of a work-group can be executing on either CU of
1796 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1797 // in CU mode and all waves of a work-group are on the same CU, and so the
1798 // L0 does not need to be invalidated.
1799 if (!ST.isCuModeEnabled()) {
1800 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1801 Changed = true;
1802 }
1803 break;
1804 case SIAtomicScope::WAVEFRONT:
1805 case SIAtomicScope::SINGLETHREAD:
1806 // No cache to invalidate.
1807 break;
1808 default:
1809 llvm_unreachable("Unsupported synchronization scope");
1810 }
1811 }
1812
1813 /// The scratch address space does not need the global memory cache
1814 /// to be flushed as all memory operations by the same thread are
1815 /// sequentially consistent, and no other thread can access scratch
1816 /// memory.
1817
1818 /// Other address spaces do not have a cache.
1819
1820 if (Pos == Position::AFTER)
1821 --MI;
1822
1823 return Changed;
1824}
1825
1826bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
1827 AMDGPU::CPol::CPol Value) const {
1828 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
1829 if (!CPol)
1830 return false;
1831
1832 uint64_t NewTH = Value & AMDGPU::CPol::TH;
1833 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
1834 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
1835 return true;
1836 }
1837
1838 return false;
1839}
1840
1841bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
1842 AMDGPU::CPol::CPol Value) const {
1843 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
1844 if (!CPol)
1845 return false;
1846
1847 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
1848 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
1849 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
1850 return true;
1851 }
1852
1853 return false;
1854}
1855
1856bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
1857 const MachineBasicBlock::iterator MI) const {
1858 // TODO: implement flag for frontend to give us a hint not to insert waits.
1859
1860 MachineBasicBlock &MBB = *MI->getParent();
1861 const DebugLoc &DL = MI->getDebugLoc();
1862
1863 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
1864 if (ST.hasImageInsts()) {
1865 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
1866 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
1867 }
1868 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
1869 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
1870
1871 return true;
1872}
1873
/// Insert the S_WAIT_*_soft instructions needed to order prior memory
/// operations at \p Scope for the address spaces in \p AddrSpace.
/// GFX12 splits the legacy counters: loads are tracked by loadcnt (and
/// bvh/samplecnt for image operations), stores by storecnt, and LDS by dscnt.
/// \p AtomicsOnly restricts the wait to counters that can track atomics
/// (acquire sequences only need to pair with a prior atomic; see below).
/// Returns true if any instruction was inserted.
bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos, AtomicOrdering Order,
                                     bool AtomicsOnly) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();

  // Counters that must be waited down to zero.
  bool LOADCnt = false;
  bool DSCnt = false;
  bool STORECnt = false;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::CLUSTER:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        LOADCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        STORECnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // GFX12.0:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP. Therefore need to wait for operations to complete to
      // ensure they are visible to waves in the other CU as the L0 is per CU.
      //
      // Otherwise in CU mode and all waves of a work-group are on the same CU
      // which shares the same L0. Note that we still need to wait when
      // performing a release in this mode to respect the transitivity of
      // happens-before, e.g. other waves of the workgroup must be able to
      // release the memory from another wave at a wider scope.
      //
      // GFX12.5:
      // CU$ has two ports. To ensure operations are visible at the workgroup
      // level, we need to ensure all operations in this port have completed
      // so the other SIMDs in the WG can see them. There is no ordering
      // guarantee between the ports.
      if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() ||
          isReleaseOrStronger(Order)) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          LOADCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          STORECnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::CLUSTER:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      DSCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (LOADCnt) {
    // Acquire sequences only need to wait on the previous atomic operation.
    // e.g. a typical sequence looks like
    //    atomic load
    //    (wait)
    //    global_inv
    //
    // We do not have BVH or SAMPLE atomics, so the atomic load is always going
    // to be tracked using loadcnt.
    //
    // This also applies to fences. Fences cannot pair with an instruction
    // tracked with bvh/samplecnt as we don't have any atomics that do that.
    if (!AtomicsOnly && ST.hasImageInsts()) {
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
    }
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
    Changed = true;
  }

  if (STORECnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
    Changed = true;
  }

  if (DSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
    Changed = true;
  }

  // Restore the iterator when the waits were inserted after MI.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
1997
1998bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1999 SIAtomicScope Scope,
2000 SIAtomicAddrSpace AddrSpace,
2001 Position Pos) const {
2002 if (!InsertCacheInv)
2003 return false;
2004
2005 MachineBasicBlock &MBB = *MI->getParent();
2006 const DebugLoc &DL = MI->getDebugLoc();
2007
2008 /// The scratch address space does not need the global memory cache
2009 /// to be flushed as all memory operations by the same thread are
2010 /// sequentially consistent, and no other thread can access scratch
2011 /// memory.
2012
2013 /// Other address spaces do not have a cache.
2014 if (!canAffectGlobalAddrSpace(AddrSpace))
2015 return false;
2016
2018 switch (Scope) {
2019 case SIAtomicScope::SYSTEM:
2020 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2021 break;
2022 case SIAtomicScope::AGENT:
2023 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2024 break;
2025 case SIAtomicScope::CLUSTER:
2026 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2027 break;
2028 case SIAtomicScope::WORKGROUP:
2029 // GFX12.0:
2030 // In WGP mode the waves of a work-group can be executing on either CU of
2031 // the WGP. Therefore we need to invalidate the L0 which is per CU.
2032 // Otherwise in CU mode all waves of a work-group are on the same CU, and
2033 // so the L0 does not need to be invalidated.
2034 //
2035 // GFX12.5 has a shared WGP$, so no invalidates are required.
2036 if (ST.isCuModeEnabled())
2037 return false;
2038
2039 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2040 break;
2041 case SIAtomicScope::WAVEFRONT:
2042 case SIAtomicScope::SINGLETHREAD:
2043 // No cache to invalidate.
2044 return false;
2045 default:
2046 llvm_unreachable("Unsupported synchronization scope");
2047 }
2048
2049 if (Pos == Position::AFTER)
2050 ++MI;
2051
2052 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2053
2054 if (Pos == Position::AFTER)
2055 --MI;
2056
2057 // Target requires a waitcnt to ensure that the proceeding INV has completed
2058 // as it may get reorded with following load instructions.
2059 if (ST.hasINVWBL2WaitCntRequirement() && Scope > SIAtomicScope::CLUSTER) {
2060 insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD,
2061 /*IsCrossAddrSpaceOrdering=*/false, Pos, AtomicOrdering::Acquire,
2062 /*AtomicsOnly=*/false);
2063
2064 if (Pos == Position::AFTER)
2065 --MI;
2066 }
2067
2068 return true;
2069}
2070
/// Release-side sequence for GFX12: optionally emit a GLOBAL_WB (system scope
/// on GFX12.0; also device scope on GFX12.5), then wait for prior loads and
/// stores (and the WB itself) to complete. Returns true if anything was
/// inserted.
bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        bool IsCrossAddrSpaceOrdering,
                                        Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();

  // The scratch address space does not need the global memory cache
  // writeback as all memory operations by the same thread are
  // sequentially consistent, and no other thread can access scratch
  // memory.
  if (canAffectGlobalAddrSpace(AddrSpace)) {
    if (Pos == Position::AFTER)
      ++MI;

    // global_wb is only necessary at system scope for GFX12.0,
    // they're also necessary at device scope for GFX12.5 as stores
    // cannot report completion earlier than L2.
    //
    // Emitting it for lower scopes is a slow no-op, so we omit it
    // for performance.
    std::optional<AMDGPU::CPol::CPol> NeedsWB;
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      NeedsWB = AMDGPU::CPol::SCOPE_SYS;
      break;
    case SIAtomicScope::AGENT:
      // GFX12.5 may have >1 L2 per device so we must emit a device scope WB.
      if (ST.hasGFX1250Insts())
        NeedsWB = AMDGPU::CPol::SCOPE_DEV;
      break;
    case SIAtomicScope::CLUSTER:
    case SIAtomicScope::WORKGROUP:
      // No WB necessary, but we still have to wait.
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No WB or wait necessary here, but insertWait takes care of that.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }

    if (NeedsWB) {
      // Target requires a waitcnt to ensure that the preceding store/rmw
      // operations have completed in L2 so their data will be written back
      // by the WB instruction.
      if (ST.hasINVWBL2WaitCntRequirement())
        insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                   /*IsCrossAddrSpaceOrdering=*/false, Pos,
                   AtomicOrdering::Release,
                   /*AtomicsOnly=*/false);

      BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(*NeedsWB);
      Changed = true;
    }

    if (Pos == Position::AFTER)
      --MI;
  }

  // We always have to wait for previous memory operations (load/store) to
  // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
  // we of course need to wait for that as well.
  Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                        IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
                        /*AtomicsOnly=*/false);

  return Changed;
}
2143
2144bool SIGfx12CacheControl::handleNonVolatile(MachineInstr &MI) const {
2145 // On GFX12.5, set the NV CPol bit.
2146 if (!ST.hasGFX1250Insts())
2147 return false;
2148 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2149 if (!CPol)
2150 return false;
2151 CPol->setImm(CPol->getImm() | AMDGPU::CPol::NV);
2152 return true;
2153}
2154
/// Apply volatile / nontemporal / last-use cache policy to a GFX12 load or
/// store (including LDS DMA). Volatile operations are forced to system scope
/// and followed by a full wait so they become globally visible in order.
/// Returns true if the instruction (or surrounding code) was changed.
bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {

  // Only handle load and store, not atomic read-modify-write instructions.
  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // handle it as do not want to pessimize all atomics. Also they do not support
  // the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsLastUse) {
    // Set last-use hint.
    Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
  } else if (IsNonTemporal) {
    // Set non-temporal hint for all cache levels.
    Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
  }

  if (IsVolatile) {
    Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);

    if (ST.requiresWaitXCntForSingleAccessInstructions() &&
        // NOTE(review): the remainder of this condition (and the opening
        // brace) is missing from this copy of the file — a line was lost in
        // extraction; confirm against the upstream source before building.
      MachineBasicBlock &MBB = *MI->getParent();
      BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
      Changed = true;
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER, AtomicOrdering::Unordered,
                          /*AtomicsOnly=*/false);
  }

  return Changed;
}
2200
/// Post-processing applied to every expanded store (atomic or not) on GFX12:
/// inserts the XCNT wait required by some subtargets and, on GFX12.0 only,
/// the extra waits needed before non-atomic system-scope stores.
/// Returns true if anything was changed or inserted.
bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
  assert(MI.mayStore() && "Not a Store inst");
  // Atomic read-modify-write operations also report mayLoad().
  const bool IsRMW = (MI.mayLoad() && MI.mayStore());
  bool Changed = false;

  if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() &&
      // NOTE(review): the remainder of this condition (and the opening brace)
      // is missing from this copy of the file — a line was lost in
      // extraction; confirm against the upstream source before building.
    MachineBasicBlock &MBB = *MI.getParent();
    BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
    Changed = true;
  }

  // Remaining fixes do not apply to RMWs.
  if (IsRMW)
    return Changed;

  MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
  if (!CPol) // Some vmem operations do not have a scope and are not concerned.
    return Changed;
  const unsigned Scope = CPol->getImm() & CPol::SCOPE;

  // GFX12.0 only: Extra waits needed before system scope stores.
  if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic &&
      Scope == CPol::SCOPE_SYS)
    Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());

  return Changed;
}
2229
2230bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
2231 if (!ST.hasGFX1250Insts())
2232 return false;
2233
2234 // Cooperative atomics need to be SCOPE_DEV or higher.
2235 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2236 assert(CPol && "No CPol operand?");
2237 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2238 if (Scope < CPol::SCOPE_DEV)
2239 return setScope(MI, CPol::SCOPE_DEV);
2240 return false;
2241}
2242
2243bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2244 SIAtomicScope Scope,
2245 SIAtomicAddrSpace AddrSpace) const {
2246 bool Changed = false;
2247
2248 if (canAffectGlobalAddrSpace(AddrSpace)) {
2249 switch (Scope) {
2250 case SIAtomicScope::SYSTEM:
2251 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2252 break;
2253 case SIAtomicScope::AGENT:
2254 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2255 break;
2256 case SIAtomicScope::CLUSTER:
2257 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2258 break;
2259 case SIAtomicScope::WORKGROUP:
2260 // In workgroup mode, SCOPE_SE is needed as waves can executes on
2261 // different CUs that access different L0s.
2262 if (!ST.isCuModeEnabled())
2263 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2264 break;
2265 case SIAtomicScope::WAVEFRONT:
2266 case SIAtomicScope::SINGLETHREAD:
2267 // No cache to bypass.
2268 break;
2269 default:
2270 llvm_unreachable("Unsupported synchronization scope");
2271 }
2272 }
2273
2274 // The scratch address space does not need the global memory caches
2275 // to be bypassed as all memory operations by the same thread are
2276 // sequentially consistent, and no other thread can access scratch
2277 // memory.
2278
2279 // Other address spaces do not have a cache.
2280
2281 return Changed;
2282}
2283
2284bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2285 if (AtomicPseudoMIs.empty())
2286 return false;
2287
2288 for (auto &MI : AtomicPseudoMIs)
2289 MI->eraseFromParent();
2290
2291 AtomicPseudoMIs.clear();
2292 return true;
2293}
2294
2295bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2297 assert(MI->mayLoad() && !MI->mayStore());
2298
2299 LLVM_DEBUG(dbgs() << "Expanding load: " << *MI);
2300
2301 bool Changed = false;
2302
2303 if (MOI.isAtomic()) {
2304 LLVM_DEBUG(dbgs() << " Atomic: ordering=" << toIRString(MOI.getOrdering())
2305 << ", scope=" << toString(MOI.getScope())
2306 << ", ordering-AS=" << MOI.getOrderingAddrSpace()
2307 << ", instr-AS=" << MOI.getInstrAddrSpace() << "\n");
2308 const AtomicOrdering Order = MOI.getOrdering();
2309 if (Order == AtomicOrdering::Monotonic ||
2310 Order == AtomicOrdering::Acquire ||
2311 Order == AtomicOrdering::SequentiallyConsistent) {
2312 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2313 MOI.getOrderingAddrSpace());
2314 }
2315
2316 // Handle cooperative atomics after cache bypass step, as it may override
2317 // the scope of the instruction to a greater scope.
2318 if (MOI.isCooperative())
2319 Changed |= CC->handleCooperativeAtomic(*MI);
2320
2321 if (Order == AtomicOrdering::SequentiallyConsistent)
2322 Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2323 SIMemOp::LOAD | SIMemOp::STORE,
2324 MOI.getIsCrossAddressSpaceOrdering(),
2325 Position::BEFORE, Order, /*AtomicsOnly=*/false);
2326
2327 if (Order == AtomicOrdering::Acquire ||
2328 Order == AtomicOrdering::SequentiallyConsistent) {
2329 // The wait below only needs to wait on the prior atomic.
2330 Changed |=
2331 CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2332 SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(),
2333 Position::AFTER, Order, /*AtomicsOnly=*/true);
2334 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2335 MOI.getOrderingAddrSpace(),
2336 Position::AFTER);
2337 }
2338
2339 return Changed;
2340 }
2341
2342 // Atomic instructions already bypass caches to the scope specified by the
2343 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2344 // instructions need additional treatment.
2345 Changed |= CC->enableVolatileAndOrNonTemporal(
2346 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2347 MOI.isNonTemporal(), MOI.isLastUse());
2348
2349 return Changed;
2350}
2351
2352bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2354 assert(!MI->mayLoad() && MI->mayStore());
2355
2356 LLVM_DEBUG(dbgs() << "Expanding store: " << *MI);
2357
2358 bool Changed = false;
2359 // FIXME: Necessary hack because iterator can lose track of the store.
2360 MachineInstr &StoreMI = *MI;
2361
2362 if (MOI.isAtomic()) {
2363 LLVM_DEBUG(dbgs() << " Atomic: ordering=" << toIRString(MOI.getOrdering())
2364 << ", scope=" << toString(MOI.getScope())
2365 << ", ordering-AS=" << MOI.getOrderingAddrSpace()
2366 << ", instr-AS=" << MOI.getInstrAddrSpace() << "\n");
2367 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2368 MOI.getOrdering() == AtomicOrdering::Release ||
2369 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2370 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2371 MOI.getOrderingAddrSpace());
2372 }
2373
2374 // Handle cooperative atomics after cache bypass step, as it may override
2375 // the scope of the instruction to a greater scope.
2376 if (MOI.isCooperative())
2377 Changed |= CC->handleCooperativeAtomic(*MI);
2378
2379 if (MOI.getOrdering() == AtomicOrdering::Release ||
2380 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2381 Changed |= CC->insertRelease(MI, MOI.getScope(),
2382 MOI.getOrderingAddrSpace(),
2383 MOI.getIsCrossAddressSpaceOrdering(),
2384 Position::BEFORE);
2385
2386 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
2387 return Changed;
2388 }
2389
2390 // Atomic instructions already bypass caches to the scope specified by the
2391 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2392 // need additional treatment.
2393 Changed |= CC->enableVolatileAndOrNonTemporal(
2394 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2395 MOI.isNonTemporal());
2396
2397 // GFX12 specific, scope(desired coherence domain in cache hierarchy) is
2398 // instruction field, do not confuse it with atomic scope.
2399 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
2400 return Changed;
2401}
2402
2403bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2405 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2406
2407 LLVM_DEBUG(dbgs() << "Expanding atomic fence: " << *MI);
2408
2409 AtomicPseudoMIs.push_back(MI);
2410 bool Changed = false;
2411
2412 const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
2413
2414 if (MOI.isAtomic()) {
2415 LLVM_DEBUG(dbgs() << " Atomic: ordering=" << toIRString(MOI.getOrdering())
2416 << ", scope=" << toString(MOI.getScope())
2417 << ", ordering-AS=" << OrderingAddrSpace << "\n");
2418 const AtomicOrdering Order = MOI.getOrdering();
2419 if (Order == AtomicOrdering::Acquire) {
2420 // Acquire fences only need to wait on the previous atomic they pair with.
2421 Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
2422 SIMemOp::LOAD | SIMemOp::STORE,
2423 MOI.getIsCrossAddressSpaceOrdering(),
2424 Position::BEFORE, Order, /*AtomicsOnly=*/true);
2425 }
2426
2427 if (Order == AtomicOrdering::Release ||
2428 Order == AtomicOrdering::AcquireRelease ||
2429 Order == AtomicOrdering::SequentiallyConsistent)
2430 /// TODO: This relies on a barrier always generating a waitcnt
2431 /// for LDS to ensure it is not reordered with the completion of
2432 /// the proceeding LDS operations. If barrier had a memory
2433 /// ordering and memory scope, then library does not need to
2434 /// generate a fence. Could add support in this file for
2435 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2436 /// adding S_WAITCNT before a S_BARRIER.
2437 Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2438 MOI.getIsCrossAddressSpaceOrdering(),
2439 Position::BEFORE);
2440
2441 // TODO: If both release and invalidate are happening they could be combined
2442 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2443 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2444 // track cache invalidate and write back instructions.
2445
2446 if (Order == AtomicOrdering::Acquire ||
2447 Order == AtomicOrdering::AcquireRelease ||
2448 Order == AtomicOrdering::SequentiallyConsistent)
2449 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2450 Position::BEFORE);
2451
2452 return Changed;
2453 }
2454
2455 return Changed;
2456}
2457
2458bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2460 assert(MI->mayLoad() && MI->mayStore());
2461
2462 LLVM_DEBUG(dbgs() << "Expanding atomic cmpxchg/rmw: " << *MI);
2463
2464 bool Changed = false;
2465 MachineInstr &RMWMI = *MI;
2466
2467 if (MOI.isAtomic()) {
2468 LLVM_DEBUG(dbgs() << " Atomic: ordering=" << toIRString(MOI.getOrdering())
2469 << ", failure-ordering="
2470 << toIRString(MOI.getFailureOrdering())
2471 << ", scope=" << toString(MOI.getScope())
2472 << ", ordering-AS=" << MOI.getOrderingAddrSpace()
2473 << ", instr-AS=" << MOI.getInstrAddrSpace() << "\n");
2474 const AtomicOrdering Order = MOI.getOrdering();
2475 if (Order == AtomicOrdering::Monotonic ||
2476 Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
2477 Order == AtomicOrdering::AcquireRelease ||
2478 Order == AtomicOrdering::SequentiallyConsistent) {
2479 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2480 MOI.getInstrAddrSpace());
2481 }
2482
2483 if (Order == AtomicOrdering::Release ||
2484 Order == AtomicOrdering::AcquireRelease ||
2485 Order == AtomicOrdering::SequentiallyConsistent ||
2486 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2487 Changed |= CC->insertRelease(MI, MOI.getScope(),
2488 MOI.getOrderingAddrSpace(),
2489 MOI.getIsCrossAddressSpaceOrdering(),
2490 Position::BEFORE);
2491
2492 if (Order == AtomicOrdering::Acquire ||
2493 Order == AtomicOrdering::AcquireRelease ||
2494 Order == AtomicOrdering::SequentiallyConsistent ||
2495 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2496 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2497 // Only wait on the previous atomic.
2498 Changed |=
2499 CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2500 isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2501 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER,
2502 Order, /*AtomicsOnly=*/true);
2503 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2504 MOI.getOrderingAddrSpace(),
2505 Position::AFTER);
2506 }
2507
2508 Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
2509 return Changed;
2510 }
2511
2512 return Changed;
2513}
2514
2515bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
2517 assert(MI->mayLoad() && MI->mayStore());
2518
2519 LLVM_DEBUG(dbgs() << "Expanding LDS DMA: " << *MI);
2520
2521 // The volatility or nontemporal-ness of the operation is a
2522 // function of the global memory, not the LDS.
2523 SIMemOp OpKind =
2524 SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE;
2525
2526 // Handle volatile and/or nontemporal markers on direct-to-LDS loads and
2527 // stores. The operation is treated as a volatile/nontemporal store
2528 // to its second argument.
2529 return CC->enableVolatileAndOrNonTemporal(
2530 MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(),
2531 MOI.isNonTemporal(), MOI.isLastUse());
2532}
2533
2534bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2535 const MachineModuleInfo &MMI =
2536 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2537 return SIMemoryLegalizer(MMI).run(MF);
2538}
2539
2540PreservedAnalyses
2544 .getCachedResult<MachineModuleAnalysis>(
2545 *MF.getFunction().getParent());
2546 assert(MMI && "MachineModuleAnalysis must be available");
2547 if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
2548 return PreservedAnalyses::all();
2550}
2551
2552bool SIMemoryLegalizer::run(MachineFunction &MF) {
2553 bool Changed = false;
2554
2555 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2556 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST,
2557 MF.getFunction());
2558 CC = SICacheControl::create(ST);
2559
2560 for (auto &MBB : MF) {
2561 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2562
2563 // Unbundle instructions after the post-RA scheduler.
2564 if (MI->isBundle() && MI->mayLoadOrStore()) {
2565 MachineBasicBlock::instr_iterator II(MI->getIterator());
2566 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2567 I != E && I->isBundledWithPred(); ++I) {
2568 I->unbundleFromPred();
2569 for (MachineOperand &MO : I->operands())
2570 if (MO.isReg())
2571 MO.setIsInternalRead(false);
2572 }
2573
2574 MI = MI->eraseFromParent();
2575 }
2576
2577 if (MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic) {
2578 if (const auto &MOI = MOA.getLoadInfo(MI))
2579 Changed |= expandLoad(*MOI, MI);
2580 else if (const auto &MOI = MOA.getStoreInfo(MI))
2581 Changed |= expandStore(*MOI, MI);
2582 else if (const auto &MOI = MOA.getLDSDMAInfo(MI))
2583 Changed |= expandLDSDMA(*MOI, MI);
2584 else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2585 Changed |= expandAtomicFence(*MOI, MI);
2586 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2587 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2588 }
2589
2591 Changed |= CC->handleNonVolatile(*MI);
2592 }
2593 }
2594
2595 Changed |= removeAtomicPseudoMIs();
2596 return Changed;
2597}
2598
2599INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)
2600
2601char SIMemoryLegalizerLegacy::ID = 0;
2602char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
2603
2605 return new SIMemoryLegalizerLegacy();
2606}
static std::optional< LoadInfo > getLoadInfo(const MachineInstr &MI)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU address space definition.
Provides AMDGPU specific target descriptions.
AMDGPU Machine Module Info.
AMDGPU promote alloca to vector or LDS
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Atomic ordering constants.
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
This header defines various interfaces for pass management in LLVM.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file provides utility for Memory Model Relaxation Annotations (MMRAs).
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
static cl::opt< bool > AmdgcnSkipCacheInvalidations("amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, cl::desc("Use this to skip inserting cache invalidating instructions."))
static bool isNonVolatileMemoryAccess(const MachineInstr &MI)
#define PASS_NAME
static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST)
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
#define PASS_NAME
static const uint32_t IV[8]
Definition blake3_impl.h:83
SyncScope::ID getClusterOneAddressSpaceSSID() const
std::optional< bool > isSyncScopeInclusion(SyncScope::ID A, SyncScope::ID B) const
In AMDGPU target synchronization scopes are inclusive, meaning a larger synchronization scope is inclusive of a smaller synchronization scope.
SyncScope::ID getAgentOneAddressSpaceSSID() const
SyncScope::ID getSingleThreadOneAddressSpaceSSID() const
SyncScope::ID getWavefrontOneAddressSpaceSSID() const
SyncScope::ID getSystemOneAddressSpaceSSID() const
SyncScope::ID getWorkgroupOneAddressSpaceSSID() const
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Diagnostic information for unsupported feature in backend.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
A helper class to return the specified delimiter string after the first invocation of operator String...
Helper class to manipulate !mmra metadata nodes.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
A description of a memory reference used in the backend.
Ty & getObjFileInfo()
Keep track of various per-module pieces of information for backends that would like to do so.
MachineOperand class - Representation of each machine instruction operand.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
static bool isVMEM(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isBUF(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isAtomic(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:882
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an SmallVector or SmallString.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
bool isGFX10(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded)
unsigned getVmcntBitMask(const IsaVersion &Version)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
char & SIMemoryLegalizerID
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
bool isReleaseOrStronger(AtomicOrdering AO)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
const char * toIRString(AtomicOrdering ao)
String used by LLVM IR to represent atomic ordering.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
AtomicOrdering getMergedAtomicOrdering(AtomicOrdering AO, AtomicOrdering Other)
Return a single atomic ordering that is at least as strong as both the AO and Other orderings for an ...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
AtomicOrdering
Atomic ordering for LLVM's memory model.
DWARFExpression::Operation Op
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
FunctionPass * createSIMemoryLegalizerPass()
bool isStrongerThan(AtomicOrdering AO, AtomicOrdering Other)
Returns true if ao is stronger than other as defined by the AtomicOrdering lattice,...