1//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Memory legalizer - implements memory model. More information can be
11/// found here:
12/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13//
14//===----------------------------------------------------------------------===//
15
16#include "AMDGPU.h"
18#include "GCNSubtarget.h"
27#include "llvm/IR/PassManager.h"
31
32using namespace llvm;
33using namespace llvm::AMDGPU;
34
35#define DEBUG_TYPE "si-memory-legalizer"
36#define PASS_NAME "SI Memory Legalizer"
37
39 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
40 cl::desc("Use this to skip inserting cache invalidating instructions."));
41
42namespace {
43
44LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
45
46/// Memory operation flags. Can be ORed together.
47enum class SIMemOp {
48 NONE = 0u,
49 LOAD = 1u << 0,
50 STORE = 1u << 1,
51 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
52};
53
54/// Position to insert a new instruction relative to an existing
55/// instruction.
56enum class Position {
57 BEFORE,
58 AFTER
59};
60
61/// The atomic synchronization scopes supported by the AMDGPU target.
62enum class SIAtomicScope {
63 NONE,
64 SINGLETHREAD,
65 WAVEFRONT,
66 WORKGROUP,
67 CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
68 AGENT,
69 SYSTEM
70};
71
72/// The distinct address spaces supported by the AMDGPU target for
73/// atomic memory operations. Can be ORed together.
74enum class SIAtomicAddrSpace {
75 NONE = 0u,
76 GLOBAL = 1u << 0,
77 LDS = 1u << 1,
78 SCRATCH = 1u << 2,
79 GDS = 1u << 3,
80 OTHER = 1u << 4,
81
82 /// The address spaces that can be accessed by a FLAT instruction.
83 FLAT = GLOBAL | LDS | SCRATCH,
84
85 /// The address spaces that support atomic instructions.
86 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
87
88 /// All address spaces.
89 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
90
91 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
92};
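// Illustrative usage of the two bitmask enums above (a sketch, not code from
// this pass): LLVM_MARK_AS_BITMASK_ENUM makes the bitwise operators well formed
// for these enum classes, so flags can be combined and tested directly, e.g.
//
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS;
//   bool TouchesLDS = (AS & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE;
//   SIMemOp Op = SIMemOp::LOAD | SIMemOp::STORE;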
93
94class SIMemOpInfo final {
95private:
96
97 friend class SIMemOpAccess;
98
99 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
100 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
101 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
102 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
103 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
104 bool IsCrossAddressSpaceOrdering = false;
105 bool IsVolatile = false;
106 bool IsNonTemporal = false;
107 bool IsLastUse = false;
108 bool IsCooperative = false;
109
110 // TODO: Should we assume Cooperative=true if no MMO is present?
111 SIMemOpInfo(
112 const GCNSubtarget &ST,
113 AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
114 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
115 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
116 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
117 bool IsCrossAddressSpaceOrdering = true,
118 AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
119 bool IsVolatile = false, bool IsNonTemporal = false,
120 bool IsLastUse = false, bool IsCooperative = false)
121 : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
122 OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
123 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
124 IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
125 IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
126
127 if (Ordering == AtomicOrdering::NotAtomic) {
128 assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
129 assert(Scope == SIAtomicScope::NONE &&
130 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
131 !IsCrossAddressSpaceOrdering &&
132 FailureOrdering == AtomicOrdering::NotAtomic);
133 return;
134 }
135
136 assert(Scope != SIAtomicScope::NONE &&
137 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
138 SIAtomicAddrSpace::NONE &&
139 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
140 SIAtomicAddrSpace::NONE);
141
142 // There is also no cross address space ordering if the ordering
143 // address space is the same as the instruction address space and
144 // only contains a single address space.
145 if ((OrderingAddrSpace == InstrAddrSpace) &&
146 isPowerOf2_32(uint32_t(InstrAddrSpace)))
147 this->IsCrossAddressSpaceOrdering = false;
148
149 // Limit the scope to the maximum supported by the instruction's address
150 // spaces.
151 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
152 SIAtomicAddrSpace::NONE) {
153 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
154 } else if ((InstrAddrSpace &
155 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
156 SIAtomicAddrSpace::NONE) {
157 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
158 } else if ((InstrAddrSpace &
159 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
160 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
161 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
162 }
163
164 // On targets that have no concept of a workgroup cluster, use
165 // AGENT scope as a conservatively correct alternative.
166 if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
167 this->Scope = SIAtomicScope::AGENT;
168 }
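  // Worked example for the clamping above (illustrative, not from the original
  // source): an LDS-only atomic (InstrAddrSpace == SIAtomicAddrSpace::LDS)
  // requested at SYSTEM scope ends up with Scope == WORKGROUP, since LDS is
  // only shared within a work-group; a scratch-only access is clamped all the
  // way down to SINGLETHREAD.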
169
170public:
171 /// \returns Atomic synchronization scope of the machine instruction used to
172 /// create this SIMemOpInfo.
173 SIAtomicScope getScope() const {
174 return Scope;
175 }
176
177 /// \returns Ordering constraint of the machine instruction used to
178 /// create this SIMemOpInfo.
179 AtomicOrdering getOrdering() const {
180 return Ordering;
181 }
182
183 /// \returns Failure ordering constraint of the machine instruction used to
184 /// create this SIMemOpInfo.
185 AtomicOrdering getFailureOrdering() const {
186 return FailureOrdering;
187 }
188
189 /// \returns The address spaces accessed by the machine
190 /// instruction used to create this SIMemOpInfo.
191 SIAtomicAddrSpace getInstrAddrSpace() const {
192 return InstrAddrSpace;
193 }
194
195 /// \returns The address spaces that must be ordered by the machine
196 /// instruction used to create this SIMemOpInfo.
197 SIAtomicAddrSpace getOrderingAddrSpace() const {
198 return OrderingAddrSpace;
199 }
200
201 /// \returns True iff memory ordering of operations on
202 /// different address spaces is required.
203 bool getIsCrossAddressSpaceOrdering() const {
204 return IsCrossAddressSpaceOrdering;
205 }
206
207 /// \returns True if memory access of the machine instruction used to
208 /// create this SIMemOpInfo is volatile, false otherwise.
209 bool isVolatile() const {
210 return IsVolatile;
211 }
212
213 /// \returns True if memory access of the machine instruction used to
214 /// create this SIMemOpInfo is nontemporal, false otherwise.
215 bool isNonTemporal() const {
216 return IsNonTemporal;
217 }
218
219 /// \returns True if memory access of the machine instruction used to
220 /// create this SIMemOpInfo is last use, false otherwise.
221 bool isLastUse() const { return IsLastUse; }
222
223 /// \returns True if this is a cooperative load or store atomic.
224 bool isCooperative() const { return IsCooperative; }
225
226 /// \returns True if ordering constraint of the machine instruction used to
227 /// create this SIMemOpInfo is unordered or higher, false otherwise.
228 bool isAtomic() const {
229 return Ordering != AtomicOrdering::NotAtomic;
230 }
231
232};
233
234class SIMemOpAccess final {
235private:
236 const AMDGPUMachineModuleInfo *MMI = nullptr;
237 const GCNSubtarget &ST;
238
239 /// Reports unsupported message \p Msg for \p MI to LLVM context.
240 void reportUnsupported(const MachineBasicBlock::iterator &MI,
241 const char *Msg) const;
242
243 /// Inspects the target synchronization scope \p SSID and determines
244 /// the SI atomic scope it corresponds to, the address spaces it
245 /// covers, and whether the memory ordering applies between address
246 /// spaces.
247 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
248 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
249
250 /// \return Return a bit set of the address spaces accessed by \p AS.
251 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
252
253 /// \returns Info constructed from \p MI, which has at least machine memory
254 /// operand.
255 std::optional<SIMemOpInfo>
256 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
257
258public:
259 /// Construct class to support accessing the machine memory operands
260 /// of instructions in the machine function \p MF.
261 SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
262
263 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
264 std::optional<SIMemOpInfo>
265 getLoadInfo(const MachineBasicBlock::iterator &MI) const;
266
267 /// \returns Store info if \p MI is a store operation, "std::nullopt"
268 /// otherwise.
269 std::optional<SIMemOpInfo>
270 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
271
272 /// \returns Atomic fence info if \p MI is an atomic fence operation,
273 /// "std::nullopt" otherwise.
274 std::optional<SIMemOpInfo>
275 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
276
277 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
278 /// rmw operation, "std::nullopt" otherwise.
279 std::optional<SIMemOpInfo>
280 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
281
282 /// \returns DMA to LDS info if \p MI is a direct-to/from-LDS load/store,
283 /// along with an indication of whether this is a load or store. If it is not
284 /// a direct-to-LDS operation, returns std::nullopt.
285 std::optional<SIMemOpInfo>
286 getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
287};
288
289class SICacheControl {
290protected:
291
292 /// AMDGPU subtarget info.
293 const GCNSubtarget &ST;
294
295 /// Instruction info.
296 const SIInstrInfo *TII = nullptr;
297
298 IsaVersion IV;
299
300 /// Whether to insert cache invalidating instructions.
301 bool InsertCacheInv;
302
303 SICacheControl(const GCNSubtarget &ST);
304
305 /// Sets CPol \p Bits to "true" if present in instruction \p MI.
306 /// \returns Returns true if \p MI is modified, false otherwise.
307 bool enableCPolBits(const MachineBasicBlock::iterator MI,
308 unsigned Bits) const;
309
310 /// Check if any atomic operation on AS can affect memory accessible via the
311 /// global address space.
312 bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
313
314public:
315 using CPol = AMDGPU::CPol::CPol;
316
317 /// Create a cache control for the subtarget \p ST.
318 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
319
320 /// Update \p MI memory load instruction to bypass any caches up to
321 /// the \p Scope memory scope for address spaces \p
322 /// AddrSpace. Return true iff the instruction was modified.
323 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
324 SIAtomicScope Scope,
325 SIAtomicAddrSpace AddrSpace) const = 0;
326
327 /// Update \p MI memory store instruction to bypass any caches up to
328 /// the \p Scope memory scope for address spaces \p
329 /// AddrSpace. Return true iff the instruction was modified.
330 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
331 SIAtomicScope Scope,
332 SIAtomicAddrSpace AddrSpace) const = 0;
333
334 /// Update \p MI memory read-modify-write instruction to bypass any caches up
335 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
336 /// iff the instruction was modified.
337 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
338 SIAtomicScope Scope,
339 SIAtomicAddrSpace AddrSpace) const = 0;
340
341 /// Update \p MI memory instruction of kind \p Op associated with address
342 /// spaces \p AddrSpace to indicate it is volatile and/or
343 /// nontemporal/last-use. Return true iff the instruction was modified.
344 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
345 SIAtomicAddrSpace AddrSpace,
346 SIMemOp Op, bool IsVolatile,
347 bool IsNonTemporal,
348 bool IsLastUse = false) const = 0;
349
350 /// Add final touches to a `mayStore` instruction \p MI, which may be a
351 /// Store or RMW instruction.
352 /// FIXME: This takes a MI because iterators aren't handled properly. When
353 /// this is called, they often point to entirely different insts. Thus we back
354 /// up the inst early and pass it here instead.
355 virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
356 return false;
357 };
358
359 /// Handle cooperative load/store atomics.
360 virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
362 "cooperative atomics are not available on this architecture");
363 }
364
365 /// Inserts any necessary instructions at position \p Pos relative
366 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
367 /// \p Op associated with address spaces \p AddrSpace have completed. Used
368 /// between memory instructions to enforce the order they become visible as
369 /// observed by other memory instructions executing in memory scope \p Scope.
370 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
371 /// address spaces. If \p AtomicsOnly is true, only insert waits for counters
372 /// that are used by atomic instructions.
373 /// Returns true iff any instructions inserted.
374 virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
375 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
376 bool IsCrossAddrSpaceOrdering, Position Pos,
377 AtomicOrdering Order, bool AtomicsOnly) const = 0;
378
379 /// Inserts any necessary instructions at position \p Pos relative to
380 /// instruction \p MI to ensure any subsequent memory instructions of this
381 /// thread with address spaces \p AddrSpace will observe the previous memory
382 /// operations by any thread for memory scopes up to memory scope \p Scope.
383 /// Returns true iff any instructions inserted.
384 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
385 SIAtomicScope Scope,
386 SIAtomicAddrSpace AddrSpace,
387 Position Pos) const = 0;
388
389 /// Inserts any necessary instructions at position \p Pos relative to
390 /// instruction \p MI to ensure previous memory instructions by this thread
391 /// with address spaces \p AddrSpace have completed and can be observed by
392 /// subsequent memory instructions by any thread executing in memory scope \p
393 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
394 /// between address spaces. Returns true iff any instructions inserted.
395 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
396 SIAtomicScope Scope,
397 SIAtomicAddrSpace AddrSpace,
398 bool IsCrossAddrSpaceOrdering,
399 Position Pos) const = 0;
400
401 /// Virtual destructor to allow derivations to be deleted.
402 virtual ~SICacheControl() = default;
403};
404
405/// Generates code sequences for the memory model of all GFX targets below
406/// GFX10.
407class SIGfx6CacheControl final : public SICacheControl {
408public:
409
410 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
411
412 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
413 SIAtomicScope Scope,
414 SIAtomicAddrSpace AddrSpace) const override;
415
416 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
417 SIAtomicScope Scope,
418 SIAtomicAddrSpace AddrSpace) const override;
419
420 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
421 SIAtomicScope Scope,
422 SIAtomicAddrSpace AddrSpace) const override;
423
424 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
425 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
426 bool IsVolatile, bool IsNonTemporal,
427 bool IsLastUse) const override;
428
429 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
430 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
431 bool IsCrossAddrSpaceOrdering, Position Pos,
432 AtomicOrdering Order, bool AtomicsOnly) const override;
433
434 bool insertAcquire(MachineBasicBlock::iterator &MI,
435 SIAtomicScope Scope,
436 SIAtomicAddrSpace AddrSpace,
437 Position Pos) const override;
438
439 bool insertRelease(MachineBasicBlock::iterator &MI,
440 SIAtomicScope Scope,
441 SIAtomicAddrSpace AddrSpace,
442 bool IsCrossAddrSpaceOrdering,
443 Position Pos) const override;
444};
445
446/// Generates code sequences for the memory model of GFX10/11.
447class SIGfx10CacheControl final : public SICacheControl {
448public:
449 SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
450
451 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
452 SIAtomicScope Scope,
453 SIAtomicAddrSpace AddrSpace) const override;
454
455 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
456 SIAtomicScope Scope,
457 SIAtomicAddrSpace AddrSpace) const override {
458 return false;
459 }
460
461 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
462 SIAtomicScope Scope,
463 SIAtomicAddrSpace AddrSpace) const override {
464 return false;
465 }
466
467 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
468 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
469 bool IsVolatile, bool IsNonTemporal,
470 bool IsLastUse) const override;
471
472 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
473 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
474 bool IsCrossAddrSpaceOrdering, Position Pos,
475 AtomicOrdering Order, bool AtomicsOnly) const override;
476
477 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
478 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
479
480 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
481 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
482 Position Pos) const override {
483 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
484 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
485 /*AtomicsOnly=*/false);
486 }
487};
488
489class SIGfx12CacheControl final : public SICacheControl {
490protected:
491 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
492 // \returns Returns true if \p MI is modified, false otherwise.
493 bool setTH(const MachineBasicBlock::iterator MI,
495
496 // Sets Scope policy to \p Value if CPol operand is present in instruction \p
497 // MI. \returns Returns true if \p MI is modified, false otherwise.
498 bool setScope(const MachineBasicBlock::iterator MI,
500
501 // Stores with system scope (SCOPE_SYS) need to wait for:
502 // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
503 // - non-returning-atomics - wait for STORECNT==0
504 // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
505 // since it does not distinguish atomics-with-return from regular stores.
506 // There is no need to wait if memory is cached (mtype != UC).
507 bool
508 insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
509
510 bool setAtomicScope(const MachineBasicBlock::iterator &MI,
511 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
512
513public:
514 SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {
515 // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
516 // the behavior is the same if assuming GFX12.0 in CU mode.
517 assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
518 }
519
520 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
521 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
522 bool IsCrossAddrSpaceOrdering, Position Pos,
523 AtomicOrdering Order, bool AtomicsOnly) const override;
524
525 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
526 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
527
528 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
529 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
530 bool IsVolatile, bool IsNonTemporal,
531 bool IsLastUse) const override;
532
533 bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
534
535 bool handleCooperativeAtomic(MachineInstr &MI) const override;
536
537 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
538 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
539 Position Pos) const override;
540
541 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
542 SIAtomicScope Scope,
543 SIAtomicAddrSpace AddrSpace) const override {
544 return setAtomicScope(MI, Scope, AddrSpace);
545 }
546
547 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
548 SIAtomicScope Scope,
549 SIAtomicAddrSpace AddrSpace) const override {
550 return setAtomicScope(MI, Scope, AddrSpace);
551 }
552
553 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
554 SIAtomicScope Scope,
555 SIAtomicAddrSpace AddrSpace) const override {
556 return setAtomicScope(MI, Scope, AddrSpace);
557 }
558};
559
560class SIMemoryLegalizer final {
561private:
562 const MachineModuleInfo &MMI;
563 /// Cache Control.
564 std::unique_ptr<SICacheControl> CC = nullptr;
565
566 /// List of atomic pseudo instructions.
567 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
568
569 /// Return true iff instruction \p MI is a atomic instruction that
570 /// returns a result.
571 bool isAtomicRet(const MachineInstr &MI) const {
572 return SIInstrInfo::isAtomicRet(MI);
573 }
574
575 /// Removes all processed atomic pseudo instructions from the current
576 /// function. Returns true if current function is modified, false otherwise.
577 bool removeAtomicPseudoMIs();
578
579 /// Expands load operation \p MI. Returns true if instructions are
580 /// added/deleted or \p MI is modified, false otherwise.
581 bool expandLoad(const SIMemOpInfo &MOI,
582 MachineBasicBlock::iterator &MI);
583 /// Expands store operation \p MI. Returns true if instructions are
584 /// added/deleted or \p MI is modified, false otherwise.
585 bool expandStore(const SIMemOpInfo &MOI,
586 MachineBasicBlock::iterator &MI);
587 /// Expands atomic fence operation \p MI. Returns true if
588 /// instructions are added/deleted or \p MI is modified, false otherwise.
589 bool expandAtomicFence(const SIMemOpInfo &MOI,
590 MachineBasicBlock::iterator &MI);
591 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
592 /// instructions are added/deleted or \p MI is modified, false otherwise.
593 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
594 MachineBasicBlock::iterator &MI);
595 /// Expands LDS DMA operation \p MI. Returns true if instructions are
596 /// added/deleted or \p MI is modified, false otherwise.
597 bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);
598
599public:
600 SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
601 bool run(MachineFunction &MF);
602};
603
604class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
605public:
606 static char ID;
607
608 SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {}
609
610 void getAnalysisUsage(AnalysisUsage &AU) const override {
611 AU.setPreservesCFG();
612 MachineFunctionPass::getAnalysisUsage(AU);
613 }
614
615 StringRef getPassName() const override {
616 return PASS_NAME;
617 }
618
619 bool runOnMachineFunction(MachineFunction &MF) override;
620};
621
622static const StringMap<SIAtomicAddrSpace> ASNames = {{
623 {"global", SIAtomicAddrSpace::GLOBAL},
624 {"local", SIAtomicAddrSpace::LDS},
625}};
626
627void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
628 const MachineFunction *MF = MI.getMF();
629 const Function &Fn = MF->getFunction();
630 SmallString<128> Str;
631 raw_svector_ostream OS(Str);
632 OS << "unknown address space '" << AS << "'; expected one of ";
633 ListSeparator LS;
634 for (const auto &[Name, Val] : ASNames)
635 OS << LS << '\'' << Name << '\'';
636 Fn.getContext().diagnose(
637 DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
638}
639
640/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
641/// If this tag isn't present, or if it has no meaningful values, returns
642/// \p none, otherwise returns the address spaces specified by the MD.
643static std::optional<SIAtomicAddrSpace>
644getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
645 static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
646
647 auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
648 if (!MMRA)
649 return std::nullopt;
650
651 SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
652 for (const auto &[Prefix, Suffix] : MMRA) {
653 if (Prefix != FenceASPrefix)
654 continue;
655
656 if (auto It = ASNames.find(Suffix); It != ASNames.end())
657 Result |= It->second;
658 else
659 diagnoseUnknownMMRAASName(MI, Suffix);
660 }
661
662 if (Result == SIAtomicAddrSpace::NONE)
663 return std::nullopt;
664
665 return Result;
666}
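// Example of the IR this helper consumes (shape assumed for illustration; the
// MMRA documentation is authoritative):
//
//   fence syncscope("workgroup") release, !mmra !0
//   ...
//   !0 = !{!"amdgpu-synchronize-as", !"local"}
//
// With that tag the helper returns SIAtomicAddrSpace::LDS, so the fence only
// needs to order the local (LDS) address space.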
667
668} // end anonymous namespace
669
670void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
671 const char *Msg) const {
672 const Function &Func = MI->getMF()->getFunction();
673 Func.getContext().diagnose(
674 DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
675}
676
677std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
678SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
679 SIAtomicAddrSpace InstrAddrSpace) const {
680 if (SSID == SyncScope::System)
681 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
682 if (SSID == MMI->getAgentSSID())
683 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
684 if (SSID == MMI->getClusterSSID())
685 return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
686 if (SSID == MMI->getWorkgroupSSID())
687 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
688 true);
689 if (SSID == MMI->getWavefrontSSID())
690 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
691 true);
692 if (SSID == SyncScope::SingleThread)
693 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
694 true);
695 if (SSID == MMI->getSystemOneAddressSpaceSSID())
696 return std::tuple(SIAtomicScope::SYSTEM,
697 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
698 if (SSID == MMI->getAgentOneAddressSpaceSSID())
699 return std::tuple(SIAtomicScope::AGENT,
700 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
701 if (SSID == MMI->getClusterOneAddressSpaceSSID())
702 return std::tuple(SIAtomicScope::CLUSTER,
703 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
704 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
705 return std::tuple(SIAtomicScope::WORKGROUP,
706 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
707 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
708 return std::tuple(SIAtomicScope::WAVEFRONT,
709 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
710 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
711 return std::tuple(SIAtomicScope::SINGLETHREAD,
712 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
713 return std::nullopt;
714}
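// Note (restating the mapping above): the "*-one-as" synchronization scopes
// yield the same SIAtomicScope as their plain counterparts, but the ordered
// address spaces are intersected with the instruction's own address spaces and
// cross-address-space ordering is reported as false.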
715
716SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
717 if (AS == AMDGPUAS::FLAT_ADDRESS)
718 return SIAtomicAddrSpace::FLAT;
719 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
720 return SIAtomicAddrSpace::GLOBAL;
721 if (AS == AMDGPUAS::LOCAL_ADDRESS)
722 return SIAtomicAddrSpace::LDS;
723 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
724 return SIAtomicAddrSpace::SCRATCH;
725 if (AS == AMDGPUAS::REGION_ADDRESS)
726 return SIAtomicAddrSpace::GDS;
727 if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
728 AS == AMDGPUAS::BUFFER_STRIDED_POINTER)
729 return SIAtomicAddrSpace::GLOBAL;
730
731 return SIAtomicAddrSpace::OTHER;
732}
733
734SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
735 const GCNSubtarget &ST)
736 : MMI(&MMI_), ST(ST) {}
737
738std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
739 const MachineBasicBlock::iterator &MI) const {
740 assert(MI->getNumMemOperands() > 0);
741
742 SyncScope::ID SSID = SyncScope::SingleThread;
743 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
744 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
745 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
746 bool IsNonTemporal = true;
747 bool IsVolatile = false;
748 bool IsLastUse = false;
749 bool IsCooperative = false;
750
751 // Validator should check whether or not MMOs cover the entire set of
752 // locations accessed by the memory instruction.
753 for (const auto &MMO : MI->memoperands()) {
754 IsNonTemporal &= MMO->isNonTemporal();
755 IsVolatile |= MMO->isVolatile();
756 IsLastUse |= MMO->getFlags() & MOLastUse;
757 IsCooperative |= MMO->getFlags() & MOCooperative;
758 InstrAddrSpace |=
759 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
760 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
761 if (OpOrdering != AtomicOrdering::NotAtomic) {
762 const auto &IsSyncScopeInclusion =
763 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
764 if (!IsSyncScopeInclusion) {
765 reportUnsupported(MI,
766 "Unsupported non-inclusive atomic synchronization scope");
767 return std::nullopt;
768 }
769
770 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
771 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
772 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
773 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
774 FailureOrdering =
775 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
776 }
777 }
778
779 // FIXME: The MMO of buffer atomic instructions does not always have an atomic
780 // ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it
781 // here, but the lowering should really be cleaned up at some point.
782 if ((ST.getGeneration() >= GCNSubtarget::GFX12) && SIInstrInfo::isBUF(*MI) &&
783 SIInstrInfo::isAtomic(*MI) && Ordering == AtomicOrdering::NotAtomic)
784 Ordering = AtomicOrdering::Monotonic;
785
786 SIAtomicScope Scope = SIAtomicScope::NONE;
787 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
788 bool IsCrossAddressSpaceOrdering = false;
789 if (Ordering != AtomicOrdering::NotAtomic) {
790 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
791 if (!ScopeOrNone) {
792 reportUnsupported(MI, "Unsupported atomic synchronization scope");
793 return std::nullopt;
794 }
795 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
796 *ScopeOrNone;
797 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
798 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
799 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
800 reportUnsupported(MI, "Unsupported atomic address space");
801 return std::nullopt;
802 }
803 }
804 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
805 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
806 IsNonTemporal, IsLastUse, IsCooperative);
807}
808
809std::optional<SIMemOpInfo>
810SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
811 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
812
813 if (!(MI->mayLoad() && !MI->mayStore()))
814 return std::nullopt;
815
816 // Be conservative if there are no memory operands.
817 if (MI->getNumMemOperands() == 0)
818 return SIMemOpInfo(ST);
819
820 return constructFromMIWithMMO(MI);
821}
822
823std::optional<SIMemOpInfo>
824SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
825 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
826
827 if (!(!MI->mayLoad() && MI->mayStore()))
828 return std::nullopt;
829
830 // Be conservative if there are no memory operands.
831 if (MI->getNumMemOperands() == 0)
832 return SIMemOpInfo(ST);
833
834 return constructFromMIWithMMO(MI);
835}
836
837std::optional<SIMemOpInfo>
838SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
839 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
840
841 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
842 return std::nullopt;
843
844 AtomicOrdering Ordering =
845 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
846
847 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
848 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
849 if (!ScopeOrNone) {
850 reportUnsupported(MI, "Unsupported atomic synchronization scope");
851 return std::nullopt;
852 }
853
854 SIAtomicScope Scope = SIAtomicScope::NONE;
855 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
856 bool IsCrossAddressSpaceOrdering = false;
857 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
858 *ScopeOrNone;
859
860 if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
861 // We currently expect refineOrderingAS to be the only place that
862 // can refine the AS ordered by the fence.
863 // If that changes, we need to review the semantics of that function
864 // in case it needs to preserve certain address spaces.
865 reportUnsupported(MI, "Unsupported atomic address space");
866 return std::nullopt;
867 }
868
869 auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI);
870 if (SynchronizeAS)
871 OrderingAddrSpace = *SynchronizeAS;
872
873 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
874 SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
875 AtomicOrdering::NotAtomic);
876}
877
878std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
879 const MachineBasicBlock::iterator &MI) const {
880 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
881
882 if (!(MI->mayLoad() && MI->mayStore()))
883 return std::nullopt;
884
885 // Be conservative if there are no memory operands.
886 if (MI->getNumMemOperands() == 0)
887 return SIMemOpInfo(ST);
888
889 return constructFromMIWithMMO(MI);
890}
891
892std::optional<SIMemOpInfo>
893SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
894 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
895
896 if (!SIInstrInfo::isLDSDMA(*MI))
897 return std::nullopt;
898
899 return constructFromMIWithMMO(MI);
900}
901
902SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
903 TII = ST.getInstrInfo();
904 IV = getIsaVersion(ST.getCPU());
905 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
906}
907
908bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI,
909 unsigned Bits) const {
910 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
911 if (!CPol)
912 return false;
913
914 CPol->setImm(CPol->getImm() | Bits);
915 return true;
916}
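// Usage sketch (illustrative): the per-target code below calls this as, e.g.,
//   enableCPolBits(MI, CPol::GLC);              // e.g. L1/L0 bypass on GFX6-GFX11
//   enableCPolBits(MI, CPol::SC0 | CPol::SC1);  // system scope on GFX940
// For instructions without a cpol operand (e.g. DS instructions) it returns
// false and leaves the instruction untouched.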
917
918bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
919 assert((!ST.hasGloballyAddressableScratch() ||
920 (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE ||
921 (AS & SIAtomicAddrSpace::SCRATCH) == SIAtomicAddrSpace::NONE) &&
922 "scratch instructions should already be replaced by flat "
923 "instructions if GloballyAddressableScratch is enabled");
924 return (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;
925}
926
927/* static */
928std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
929 GCNSubtarget::Generation Generation = ST.getGeneration();
930 if (Generation < AMDGPUSubtarget::GFX10)
931 return std::make_unique<SIGfx6CacheControl>(ST);
932 if (Generation < AMDGPUSubtarget::GFX12)
933 return std::make_unique<SIGfx10CacheControl>(ST);
934 return std::make_unique<SIGfx12CacheControl>(ST);
935}
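// Dispatch summary (restates the generation checks above): pre-GFX10 targets,
// including GFX90A and GFX940, use SIGfx6CacheControl; GFX10 and GFX11 use
// SIGfx10CacheControl; GFX12 and later use SIGfx12CacheControl.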
936
937bool SIGfx6CacheControl::enableLoadCacheBypass(
939 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
940 SIAtomicAddrSpace AddrSpace) const {
941 assert(MI->mayLoad() && !MI->mayStore());
942
943 if (!canAffectGlobalAddrSpace(AddrSpace)) {
944 /// The scratch address space does not need the global memory caches
945 /// to be bypassed as all memory operations by the same thread are
946 /// sequentially consistent, and no other thread can access scratch
947 /// memory.
948
949 /// Other address spaces do not have a cache.
950 return false;
951 }
952
953 bool Changed = false;
954 switch (Scope) {
955 case SIAtomicScope::SYSTEM:
956 if (ST.hasGFX940Insts()) {
957 // Set SC bits to indicate system scope.
958 Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
959 break;
960 }
961 [[fallthrough]];
962 case SIAtomicScope::AGENT:
963 if (ST.hasGFX940Insts()) {
964 // Set SC bits to indicate agent scope.
965 Changed |= enableCPolBits(MI, CPol::SC1);
966 } else {
967 // Set L1 cache policy to MISS_EVICT.
968 // Note: there is no L2 cache bypass policy at the ISA level.
969 Changed |= enableCPolBits(MI, CPol::GLC);
970 }
971 break;
972 case SIAtomicScope::WORKGROUP:
973 if (ST.hasGFX940Insts()) {
974 // In threadgroup split mode the waves of a work-group can be executing
975 // on different CUs. Therefore need to bypass the L1 which is per CU.
976 // Otherwise in non-threadgroup split mode all waves of a work-group are
977 // on the same CU, and so the L1 does not need to be bypassed. Setting
978 // SC bits to indicate work-group scope will do this automatically.
979 Changed |= enableCPolBits(MI, CPol::SC0);
980 } else if (ST.hasGFX90AInsts()) {
981 // In threadgroup split mode the waves of a work-group can be executing
982 // on different CUs. Therefore need to bypass the L1 which is per CU.
983 // Otherwise in non-threadgroup split mode all waves of a work-group are
984 // on the same CU, and so the L1 does not need to be bypassed.
985 if (ST.isTgSplitEnabled())
986 Changed |= enableCPolBits(MI, CPol::GLC);
987 }
988 break;
989 case SIAtomicScope::WAVEFRONT:
990 case SIAtomicScope::SINGLETHREAD:
991 // No cache to bypass.
992 break;
993 default:
994 llvm_unreachable("Unsupported synchronization scope");
995 }
996
997 return Changed;
998}
999
1000bool SIGfx6CacheControl::enableStoreCacheBypass(
1001 const MachineBasicBlock::iterator &MI,
1002 SIAtomicScope Scope,
1003 SIAtomicAddrSpace AddrSpace) const {
1004 assert(!MI->mayLoad() && MI->mayStore());
1005 bool Changed = false;
1006
1007 /// For targets other than GFX940, the L1 cache is write through so does not
1008 /// need to be bypassed. There is no bypass control for the L2 cache at the
1009 /// isa level.
1010
1011 if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
1012 switch (Scope) {
1013 case SIAtomicScope::SYSTEM:
1014 // Set SC bits to indicate system scope.
1015 Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
1016 break;
1017 case SIAtomicScope::AGENT:
1018 // Set SC bits to indicate agent scope.
1019 Changed |= enableCPolBits(MI, CPol::SC1);
1020 break;
1021 case SIAtomicScope::WORKGROUP:
1022 // Set SC bits to indicate workgroup scope.
1023 Changed |= enableCPolBits(MI, CPol::SC0);
1024 break;
1025 case SIAtomicScope::WAVEFRONT:
1026 case SIAtomicScope::SINGLETHREAD:
1027 // Leave SC bits unset to indicate wavefront scope.
1028 break;
1029 default:
1030 llvm_unreachable("Unsupported synchronization scope");
1031 }
1032
1033 /// The scratch address space does not need the global memory caches
1034 /// to be bypassed as all memory operations by the same thread are
1035 /// sequentially consistent, and no other thread can access scratch
1036 /// memory.
1037
1038 /// Other address spaces do not have a cache.
1039 }
1040
1041 return Changed;
1042}
1043
1044bool SIGfx6CacheControl::enableRMWCacheBypass(
1045 const MachineBasicBlock::iterator &MI,
1046 SIAtomicScope Scope,
1047 SIAtomicAddrSpace AddrSpace) const {
1048 assert(MI->mayLoad() && MI->mayStore());
1049 bool Changed = false;
1050
1051 /// For targets other than GFX940, do not set GLC for RMW atomic operations as
1052 /// L0/L1 cache is automatically bypassed, and the GLC bit is instead used to
1053 /// indicate if they are return or no-return. Note: there is no L2 cache
1054 /// coherent bypass control at the ISA level.
1055 /// For GFX90A+, RMW atomics implicitly bypass the L1 cache.
1056
1057 if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
1058 switch (Scope) {
1059 case SIAtomicScope::SYSTEM:
1060 // Set SC1 bit to indicate system scope.
1061 Changed |= enableCPolBits(MI, CPol::SC1);
1062 break;
1063 case SIAtomicScope::AGENT:
1064 case SIAtomicScope::WORKGROUP:
1065 case SIAtomicScope::WAVEFRONT:
1066 case SIAtomicScope::SINGLETHREAD:
1067 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1068 // to indicate system or agent scope. The SC0 bit is used to indicate if
1069 // they are return or no-return. Leave SC1 bit unset to indicate agent
1070 // scope.
1071 break;
1072 default:
1073 llvm_unreachable("Unsupported synchronization scope");
1074 }
1075 }
1076
1077 return Changed;
1078}
1079
1080bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1081 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1082 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1083 // Only handle load and store, not atomic read-modify-write instructions. The
1084 // latter use glc to indicate if the atomic returns a result and so must not
1085 // be used for cache control.
1086 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
1087
1088 // Only update load and store, not LLVM IR atomic read-modify-write
1089 // instructions. The latter are always marked as volatile, so handling them
1090 // here would pessimize all atomics. They also do not support
1091 // the nontemporal attribute.
1092 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1093
1094 bool Changed = false;
1095
1096 if (IsVolatile) {
1097 if (ST.hasGFX940Insts()) {
1098 // Set SC bits to indicate system scope.
1099 Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
1100 } else if (Op == SIMemOp::LOAD) {
1101 // Set L1 cache policy to be MISS_EVICT for load instructions
1102 // and MISS_LRU for store instructions.
1103 // Note: there is no L2 cache bypass policy at the ISA level.
1104 Changed |= enableCPolBits(MI, CPol::GLC);
1105 }
1106
1107 // Ensure operation has completed at system scope to cause all volatile
1108 // operations to be visible outside the program in a global order. Do not
1109 // request cross address space as only the global address space can be
1110 // observable outside the program, so no need to cause a waitcnt for LDS
1111 // address space operations.
1112 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1113 Position::AFTER, AtomicOrdering::Unordered,
1114 /*AtomicsOnly=*/false);
1115
1116 return Changed;
1117 }
1118
1119 if (IsNonTemporal) {
1120 if (ST.hasGFX940Insts()) {
1121 Changed |= enableCPolBits(MI, CPol::NT);
1122 } else {
1123 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1124 // for both loads and stores, and the L2 cache policy to STREAM.
1125 Changed |= enableCPolBits(MI, CPol::SLC | CPol::GLC);
1126 }
1127 return Changed;
1128 }
1129
1130 return Changed;
1131}
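// Illustrative outcome (a sketch, not from the original source): on a
// pre-GFX940 target a volatile global load is given the glc bit and is
// followed by a wait, roughly
//   BUFFER_LOAD_DWORD ... glc
//   S_WAITCNT vmcnt(0)
// while a nontemporal access gets glc+slc and no extra wait; exact operands
// and the soft-waitcnt form depend on the subtarget.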
1132
1133bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1134 SIAtomicScope Scope,
1135 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1136 bool IsCrossAddrSpaceOrdering, Position Pos,
1137 AtomicOrdering Order,
1138 bool AtomicsOnly) const {
1139 bool Changed = false;
1140
1141 MachineBasicBlock &MBB = *MI->getParent();
1142 DebugLoc DL = MI->getDebugLoc();
1143
1144 if (Pos == Position::AFTER)
1145 ++MI;
1146
1147 // GFX90A+
1148 if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) {
1149 // In threadgroup split mode the waves of a work-group can be executing on
1150 // different CUs. Therefore need to wait for global or GDS memory operations
1151 // to complete to ensure they are visible to waves in the other CUs.
1152 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1153 // the same CU, so no need to wait for global memory as all waves in the
1154 // work-group access the same L1, nor wait for GDS as accesses are ordered
1155 // on a CU.
1156 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1157 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1158 (Scope == SIAtomicScope::WORKGROUP)) {
1159 // Same as pre-GFX90A at AGENT scope.
1160 Scope = SIAtomicScope::AGENT;
1161 }
1162 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1163 // LDS memory operations.
1164 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1165 }
1166
1167 bool VMCnt = false;
1168 bool LGKMCnt = false;
1169
1170 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1171 SIAtomicAddrSpace::NONE) {
1172 switch (Scope) {
1173 case SIAtomicScope::SYSTEM:
1174 case SIAtomicScope::AGENT:
1175 VMCnt |= true;
1176 break;
1177 case SIAtomicScope::WORKGROUP:
1178 case SIAtomicScope::WAVEFRONT:
1179 case SIAtomicScope::SINGLETHREAD:
1180 // The L1 cache keeps all memory operations in order for
1181 // wavefronts in the same work-group.
1182 break;
1183 default:
1184 llvm_unreachable("Unsupported synchronization scope");
1185 }
1186 }
1187
1188 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1189 switch (Scope) {
1190 case SIAtomicScope::SYSTEM:
1191 case SIAtomicScope::AGENT:
1192 case SIAtomicScope::WORKGROUP:
1193 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1194 // not needed as LDS operations for all waves are executed in a total
1195 // global ordering as observed by all waves. Required if also
1196 // synchronizing with global/GDS memory as LDS operations could be
1197 // reordered with respect to later global/GDS memory operations of the
1198 // same wave.
1199 LGKMCnt |= IsCrossAddrSpaceOrdering;
1200 break;
1201 case SIAtomicScope::WAVEFRONT:
1202 case SIAtomicScope::SINGLETHREAD:
1203 // The LDS keeps all memory operations in order for
1204 // the same wavefront.
1205 break;
1206 default:
1207 llvm_unreachable("Unsupported synchronization scope");
1208 }
1209 }
1210
1211 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1212 switch (Scope) {
1213 case SIAtomicScope::SYSTEM:
1214 case SIAtomicScope::AGENT:
1215 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1216 // is not needed as GDS operations for all waves are executed in a total
1217 // global ordering as observed by all waves. Required if also
1218 // synchronizing with global/LDS memory as GDS operations could be
1219 // reordered with respect to later global/LDS memory operations of the
1220 // same wave.
1221 LGKMCnt |= IsCrossAddrSpaceOrdering;
1222 break;
1223 case SIAtomicScope::WORKGROUP:
1224 case SIAtomicScope::WAVEFRONT:
1225 case SIAtomicScope::SINGLETHREAD:
1226 // The GDS keeps all memory operations in order for
1227 // the same work-group.
1228 break;
1229 default:
1230 llvm_unreachable("Unsupported synchronization scope");
1231 }
1232 }
1233
1234 if (VMCnt || LGKMCnt) {
1235 unsigned WaitCntImmediate =
1236 AMDGPU::encodeWaitcnt(IV,
1237 VMCnt ? 0 : getVmcntBitMask(IV),
1238 getExpcntBitMask(IV),
1239 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1240 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1241 .addImm(WaitCntImmediate);
1242 Changed = true;
1243 }
1244
1245 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1246 // at workgroup-scoped release operations that specify the LDS address space.
1247 // SIInsertWaitcnts will later replace this with a vmcnt().
1248 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1249 Scope == SIAtomicScope::WORKGROUP &&
1250 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1251 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1252 Changed = true;
1253 }
1254
1255 if (Pos == Position::AFTER)
1256 --MI;
1257
1258 return Changed;
1259}
1260
1261static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST) {
1262 if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
1263 return false;
1264 return !ST.isAmdPalOS() && !ST.isMesa3DOS();
1265}
1266
1267bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1268 SIAtomicScope Scope,
1269 SIAtomicAddrSpace AddrSpace,
1270 Position Pos) const {
1271 if (!InsertCacheInv)
1272 return false;
1273
1274 bool Changed = false;
1275
1276 MachineBasicBlock &MBB = *MI->getParent();
1277 DebugLoc DL = MI->getDebugLoc();
1278
1279 if (Pos == Position::AFTER)
1280 ++MI;
1281
1282 const unsigned InvalidateL1 = canUseBUFFER_WBINVL1_VOL(ST)
1283 ? AMDGPU::BUFFER_WBINVL1_VOL
1284 : AMDGPU::BUFFER_WBINVL1;
1285
1286 if (canAffectGlobalAddrSpace(AddrSpace)) {
1287 switch (Scope) {
1288 case SIAtomicScope::SYSTEM:
1289 if (ST.hasGFX940Insts()) {
1290 // Ensures that following loads will not see stale remote VMEM data or
1291 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
1292 // and CC will never be stale due to the local memory probes.
1293 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1294 // Set SC bits to indicate system scope.
1295 .addImm(CPol::SC0 | CPol::SC1);
1296 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1297 // hardware does not reorder memory operations by the same wave with
1298 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1299 // remove any cache lines of earlier writes by the same wave and ensures
1300 // later reads by the same wave will refetch the cache lines.
1301 Changed = true;
1302 break;
1303 }
1304
1305 if (ST.hasGFX90AInsts()) {
1306 // Ensures that following loads will not see stale remote VMEM data or
1307 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
1308 // and CC will never be stale due to the local memory probes.
1309 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1310 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1311 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1312 // hardware does not reorder memory operations by the same wave with
1313 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed
1314 // to remove any cache lines of earlier writes by the same wave and
1315 // ensures later reads by the same wave will refetch the cache lines.
1316 Changed = true;
1317 break;
1318 }
1319 [[fallthrough]];
1320 case SIAtomicScope::AGENT:
1321 if (ST.hasGFX940Insts()) {
1322 // Ensures that following loads will not see stale remote data or local
1323 // MTYPE NC global data. Local MTYPE RW and CC memory will never be
1324 // stale due to the memory probes.
1325 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1326 // Set SC bits to indicate agent scope.
1327 .addImm(CPol::SC1);
1328 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1329 // does not reorder memory operations with respect to a preceding buffer
1330 // invalidate. The invalidate is guaranteed to remove any cache lines of
1331 // earlier writes and ensures later reads will refetch the cache lines.
1332 } else
1333 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1334 Changed = true;
1335 break;
1336 case SIAtomicScope::WORKGROUP:
1337 if (ST.isTgSplitEnabled()) {
1338 if (ST.hasGFX940Insts()) {
1339 // In threadgroup split mode the waves of a work-group can be
1340 // executing on different CUs. Therefore need to invalidate the L1
1341 // which is per CU. Otherwise in non-threadgroup split mode all waves
1342 // of a work-group are on the same CU, and so the L1 does not need to
1343 // be invalidated.
1344
1345 // Ensures L1 is invalidated if in threadgroup split mode. In
1346 // non-threadgroup split mode it is a NOP, but no point generating it
1347 // in that case if know not in that mode.
1348 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1349 // Set SC bits to indicate work-group scope.
1350 .addImm(CPol::SC0);
1351 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1352 // does not reorder memory operations with respect to a preceding
1353 // buffer invalidate. The invalidate is guaranteed to remove any cache
1354 // lines of earlier writes and ensures later reads will refetch the
1355 // cache lines.
1356 Changed = true;
1357 } else if (ST.hasGFX90AInsts()) {
1358 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1359 Changed = true;
1360 }
1361 }
1362 break;
1363 case SIAtomicScope::WAVEFRONT:
1364 case SIAtomicScope::SINGLETHREAD:
1365 // For GFX940, we could generate "BUFFER_INV" but it would do nothing as
1366 // there are no caches to invalidate. All other targets have no cache to
1367 // invalidate.
1368 break;
1369 default:
1370 llvm_unreachable("Unsupported synchronization scope");
1371 }
1372 }
1373
1374 /// The scratch address space does not need the global memory cache
1375 /// to be flushed as all memory operations by the same thread are
1376 /// sequentially consistent, and no other thread can access scratch
1377 /// memory.
1378
1379 /// Other address spaces do not have a cache.
1380
1381 if (Pos == Position::AFTER)
1382 --MI;
1383
1384 return Changed;
1385}
1386
1387bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1388 SIAtomicScope Scope,
1389 SIAtomicAddrSpace AddrSpace,
1390 bool IsCrossAddrSpaceOrdering,
1391 Position Pos) const {
1392 bool Changed = false;
1393
1394 if (ST.hasGFX90AInsts()) {
1395 MachineBasicBlock &MBB = *MI->getParent();
1396 const DebugLoc &DL = MI->getDebugLoc();
1397
1398 if (Pos == Position::AFTER)
1399 ++MI;
1400
1401 if (canAffectGlobalAddrSpace(AddrSpace)) {
1402 switch (Scope) {
1403 case SIAtomicScope::SYSTEM:
1404 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1405 // hardware does not reorder memory operations by the same wave with
1406 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1407 // to initiate writeback of any dirty cache lines of earlier writes by
1408 // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1409 // writeback has completed.
1410 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1411 // Set SC bits to indicate system scope.
1412 .addImm(CPol::SC0 | CPol::SC1);
1413 Changed = true;
1414 break;
1415 case SIAtomicScope::AGENT:
1416 if (ST.hasGFX940Insts()) {
1417 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1418 // Set SC bits to indicate agent scope.
1419 .addImm(CPol::SC1);
1420
1421 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1422 // SIAtomicScope::AGENT, the following insertWait will generate the
1423 // required "S_WAITCNT vmcnt(0)".
1424 Changed = true;
1425 }
1426 break;
1427 case SIAtomicScope::WORKGROUP:
1428 case SIAtomicScope::WAVEFRONT:
1429 case SIAtomicScope::SINGLETHREAD:
1430 // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it
1431 // would writeback, and would require an otherwise unnecessary
1432 // "S_WAITCNT vmcnt(0)".
1433 break;
1434 default:
1435 llvm_unreachable("Unsupported synchronization scope");
1436 }
1437 }
1438
1439 if (Pos == Position::AFTER)
1440 --MI;
1441 }
1442
1443 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1444 // S_WAITCNT needed.
1445 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1446 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
1447 /*AtomicsOnly=*/false);
1448
1449 return Changed;
1450}
1451
1452bool SIGfx10CacheControl::enableLoadCacheBypass(
1453 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1454 SIAtomicAddrSpace AddrSpace) const {
1455 assert(MI->mayLoad() && !MI->mayStore());
1456 bool Changed = false;
1457
1458 if (canAffectGlobalAddrSpace(AddrSpace)) {
1459 switch (Scope) {
1460 case SIAtomicScope::SYSTEM:
1461 case SIAtomicScope::AGENT:
1462 // Set the L0 and L1 cache policies to MISS_EVICT.
1463 // Note: there is no L2 cache coherent bypass control at the ISA level.
1464 // For GFX10, set GLC+DLC, for GFX11, only set GLC.
1465 Changed |=
1466 enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0));
1467 break;
1468 case SIAtomicScope::WORKGROUP:
1469 // In WGP mode the waves of a work-group can be executing on either CU of
1470 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1471 // CU mode all waves of a work-group are on the same CU, and so the L0
1472 // does not need to be bypassed.
1473 if (!ST.isCuModeEnabled())
1474 Changed |= enableCPolBits(MI, CPol::GLC);
1475 break;
1476 case SIAtomicScope::WAVEFRONT:
1477 case SIAtomicScope::SINGLETHREAD:
1478 // No cache to bypass.
1479 break;
1480 default:
1481 llvm_unreachable("Unsupported synchronization scope");
1482 }
1483 }
1484
1485 /// The scratch address space does not need the global memory caches
1486 /// to be bypassed as all memory operations by the same thread are
1487 /// sequentially consistent, and no other thread can access scratch
1488 /// memory.
1489
1490 /// Other address spaces do not have a cache.
1491
1492 return Changed;
1493}
1494
1495bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1496 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1497 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1498
1499 // Only handle load and store, not atomic read-modify-write instructions. The
1500 // latter use glc to indicate if the atomic returns a result and so must not
1501 // be used for cache control.
1502 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
1503
1504 // Only update load and store, not LLVM IR atomic read-modify-write
1505 // instructions. The latter are always marked as volatile, so handling them
1506 // here would pessimize all atomics. They also do not support
1507 // the nontemporal attribute.
1508 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1509
1510 bool Changed = false;
1511
1512 if (IsVolatile) {
1513 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1514 // and MISS_LRU for store instructions.
1515 // Note: there is no L2 cache coherent bypass control at the ISA level.
1516 if (Op == SIMemOp::LOAD) {
1517 Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
1518 }
1519
1520 // GFX11: Set MALL NOALLOC for both load and store instructions.
1521 if (AMDGPU::isGFX11(ST))
1522 Changed |= enableCPolBits(MI, CPol::DLC);
1523
1524 // Ensure operation has completed at system scope to cause all volatile
1525 // operations to be visible outside the program in a global order. Do not
1526 // request cross address space as only the global address space can be
1527 // observable outside the program, so no need to cause a waitcnt for LDS
1528 // address space operations.
1529 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1530 Position::AFTER, AtomicOrdering::Unordered,
1531 /*AtomicsOnly=*/false);
1532 return Changed;
1533 }
1534
1535 if (IsNonTemporal) {
1536 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1537 // and L2 cache policy to STREAM.
1538 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1539 // to MISS_EVICT and the L2 cache policy to STREAM.
1540 if (Op == SIMemOp::STORE)
1541 Changed |= enableCPolBits(MI, CPol::GLC);
1542 Changed |= enableCPolBits(MI, CPol::SLC);
1543
1544 // GFX11: Set MALL NOALLOC for both load and store instructions.
1545 if (AMDGPU::isGFX11(ST))
1546 Changed |= enableCPolBits(MI, CPol::DLC);
1547
1548 return Changed;
1549 }
1550
1551 return Changed;
1552}
1553
1554bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1555 SIAtomicScope Scope,
1556 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1557 bool IsCrossAddrSpaceOrdering,
1558 Position Pos, AtomicOrdering Order,
1559 bool AtomicsOnly) const {
1560 bool Changed = false;
1561
1562 MachineBasicBlock &MBB = *MI->getParent();
1563 DebugLoc DL = MI->getDebugLoc();
1564
1565 if (Pos == Position::AFTER)
1566 ++MI;
1567
1568 bool VMCnt = false;
1569 bool VSCnt = false;
1570 bool LGKMCnt = false;
1571
1572 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1573 SIAtomicAddrSpace::NONE) {
1574 switch (Scope) {
1575 case SIAtomicScope::SYSTEM:
1576 case SIAtomicScope::AGENT:
1577 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1578 VMCnt |= true;
1579 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1580 VSCnt |= true;
1581 break;
1582 case SIAtomicScope::WORKGROUP:
1583 // In WGP mode the waves of a work-group can be executing on either CU of
1584 // the WGP. Therefore we need to wait for operations to complete to ensure
1585 // they are visible to waves in the other CU, as the L0 is per CU.
1586 // Otherwise, in CU mode, all waves of a work-group are on the same CU,
1587 // which shares the same L0. Note that we still need to wait when
1588 // performing a release in this mode to respect the transitivity of
1589 // happens-before, e.g. other waves of the workgroup must be able to
1590 // release the memory from another wave at a wider scope.
1591 if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) {
1592 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1593 VMCnt |= true;
1594 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1595 VSCnt |= true;
1596 }
1597 break;
1598 case SIAtomicScope::WAVEFRONT:
1599 case SIAtomicScope::SINGLETHREAD:
1600 // The L0 cache keeps all memory operations in order for
1601 // work-items in the same wavefront.
1602 break;
1603 default:
1604 llvm_unreachable("Unsupported synchronization scope");
1605 }
1606 }
1607
1608 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1609 switch (Scope) {
1610 case SIAtomicScope::SYSTEM:
1611 case SIAtomicScope::AGENT:
1612 case SIAtomicScope::WORKGROUP:
1613 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1614 // not needed as LDS operations for all waves are executed in a total
1615 // global ordering as observed by all waves. Required if also
1616 // synchronizing with global/GDS memory as LDS operations could be
1617 // reordered with respect to later global/GDS memory operations of the
1618 // same wave.
1619 LGKMCnt |= IsCrossAddrSpaceOrdering;
1620 break;
1621 case SIAtomicScope::WAVEFRONT:
1622 case SIAtomicScope::SINGLETHREAD:
1623 // The LDS keeps all memory operations in order for
1624 // the same wavefront.
1625 break;
1626 default:
1627 llvm_unreachable("Unsupported synchronization scope");
1628 }
1629 }
1630
1631 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1632 switch (Scope) {
1633 case SIAtomicScope::SYSTEM:
1634 case SIAtomicScope::AGENT:
1635 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1636 // is not needed as GDS operations for all waves are executed in a total
1637 // global ordering as observed by all waves. Required if also
1638 // synchronizing with global/LDS memory as GDS operations could be
1639 // reordered with respect to later global/LDS memory operations of the
1640 // same wave.
1641 LGKMCnt |= IsCrossAddrSpaceOrdering;
1642 break;
1643 case SIAtomicScope::WORKGROUP:
1644 case SIAtomicScope::WAVEFRONT:
1645 case SIAtomicScope::SINGLETHREAD:
1646 // The GDS keeps all memory operations in order for
1647 // the same work-group.
1648 break;
1649 default:
1650 llvm_unreachable("Unsupported synchronization scope");
1651 }
1652 }
1653
1654 if (VMCnt || LGKMCnt) {
1655 unsigned WaitCntImmediate =
1656 AMDGPU::encodeWaitcnt(IV,
1657 VMCnt ? 0 : getVmcntBitMask(IV),
1658 getExpcntBitMask(IV),
1659 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1660 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1661 .addImm(WaitCntImmediate);
1662 Changed = true;
1663 }
1664
1665 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1666 // at workgroup-scoped release operations that specify the LDS address space.
1667 // SIInsertWaitcnts will later replace this with a vmcnt().
1668 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1669 Scope == SIAtomicScope::WORKGROUP &&
1670 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1671 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1672 Changed = true;
1673 }
1674
1675 if (VSCnt) {
1676 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
1677 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1678 .addImm(0);
1679 Changed = true;
1680 }
1681
1682 if (Pos == Position::AFTER)
1683 --MI;
1684
1685 return Changed;
1686}
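As a reference for the encoding step above, the sketch below shows how a wait-count immediate that only forces vmcnt to zero can be assembled; it is an illustrative standalone helper (the name encodeVmcnt0 is assumed here), using the same AMDGPU::encodeWaitcnt and bit-mask query functions as insertWait.

// Illustrative sketch only: encode an "S_WAITCNT vmcnt(0)" immediate for the
// given ISA version, leaving expcnt and lgkmcnt at their "no wait" masks.
static unsigned encodeVmcnt0(const AMDGPU::IsaVersion &IV) {
  return AMDGPU::encodeWaitcnt(IV,
                               /*Vmcnt=*/0,
                               /*Expcnt=*/AMDGPU::getExpcntBitMask(IV),
                               /*Lgkmcnt=*/AMDGPU::getLgkmcntBitMask(IV));
}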
1687
1688bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1689 SIAtomicScope Scope,
1690 SIAtomicAddrSpace AddrSpace,
1691 Position Pos) const {
1692 if (!InsertCacheInv)
1693 return false;
1694
1695 bool Changed = false;
1696
1697 MachineBasicBlock &MBB = *MI->getParent();
1698 DebugLoc DL = MI->getDebugLoc();
1699
1700 if (Pos == Position::AFTER)
1701 ++MI;
1702
1703 if (canAffectGlobalAddrSpace(AddrSpace)) {
1704 switch (Scope) {
1705 case SIAtomicScope::SYSTEM:
1706 case SIAtomicScope::AGENT:
1707 // The order of invalidates matters here. We must invalidate "outer in",
1708 // i.e. L1 then L0, to avoid L0 pulling in stale data from L1 before L1
1709 // is invalidated.
1710 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1711 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1712 Changed = true;
1713 break;
1714 case SIAtomicScope::WORKGROUP:
1715 // In WGP mode the waves of a work-group can be executing on either CU of
1716 // the WGP. Therefore the L0, which is per CU, needs to be invalidated.
1717 // Otherwise, in CU mode, all waves of a work-group are on the same CU,
1718 // and so the L0 does not need to be invalidated.
1719 if (!ST.isCuModeEnabled()) {
1720 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1721 Changed = true;
1722 }
1723 break;
1724 case SIAtomicScope::WAVEFRONT:
1725 case SIAtomicScope::SINGLETHREAD:
1726 // No cache to invalidate.
1727 break;
1728 default:
1729 llvm_unreachable("Unsupported synchronization scope");
1730 }
1731 }
1732
1733 /// The scratch address space does not need the global memory cache
1734 /// to be flushed as all memory operations by the same thread are
1735 /// sequentially consistent, and no other thread can access scratch
1736 /// memory.
1737
1738 /// Other address spaces do not have a cache.
1739
1740 if (Pos == Position::AFTER)
1741 --MI;
1742
1743 return Changed;
1744}
1745
1746bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
1747 AMDGPU::CPol::CPol Value) const {
1748 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
1749 if (!CPol)
1750 return false;
1751
1752 uint64_t NewTH = Value & AMDGPU::CPol::TH;
1753 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
1754 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
1755 return true;
1756 }
1757
1758 return false;
1759}
1760
1761bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
1762 AMDGPU::CPol::CPol Value) const {
1763 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
1764 if (!CPol)
1765 return false;
1766
1767 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
1768 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
1769 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
1770 return true;
1771 }
1772
1773 return false;
1774}
1775
1776bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
1777 const MachineBasicBlock::iterator MI) const {
1778 // TODO: implement flag for frontend to give us a hint not to insert waits.
1779
1780 MachineBasicBlock &MBB = *MI->getParent();
1781 const DebugLoc &DL = MI->getDebugLoc();
1782
1783 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
1784 if (ST.hasImageInsts()) {
1785 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
1786 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
1787 }
1788 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
1789 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
1790
1791 return true;
1792}
1793
1794bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1795 SIAtomicScope Scope,
1796 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1797 bool IsCrossAddrSpaceOrdering,
1798 Position Pos, AtomicOrdering Order,
1799 bool AtomicsOnly) const {
1800 bool Changed = false;
1801
1802 MachineBasicBlock &MBB = *MI->getParent();
1803 DebugLoc DL = MI->getDebugLoc();
1804
1805 bool LOADCnt = false;
1806 bool DSCnt = false;
1807 bool STORECnt = false;
1808
1809 if (Pos == Position::AFTER)
1810 ++MI;
1811
1812 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1813 SIAtomicAddrSpace::NONE) {
1814 switch (Scope) {
1815 case SIAtomicScope::SYSTEM:
1816 case SIAtomicScope::AGENT:
1817 case SIAtomicScope::CLUSTER:
1818 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1819 LOADCnt |= true;
1820 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1821 STORECnt |= true;
1822 break;
1823 case SIAtomicScope::WORKGROUP:
1824 // GFX12.0:
1825 // In WGP mode the waves of a work-group can be executing on either CU
1826 // of the WGP. Therefore need to wait for operations to complete to
1827 // ensure they are visible to waves in the other CU as the L0 is per CU.
1828 //
1829 // Otherwise, in CU mode, all waves of a work-group are on the same CU,
1830 // which shares the same L0. Note that we still need to wait when
1831 // performing a release in this mode to respect the transitivity of
1832 // happens-before, e.g. other waves of the workgroup must be able to
1833 // release the memory from another wave at a wider scope.
1834 //
1835 // GFX12.5:
1836 // CU$ has two ports. To ensure operations are visible at the workgroup
1837 // level, we need to ensure all operations in this port have completed
1838 // so the other SIMDs in the WG can see them. There is no ordering
1839 // guarantee between the ports.
1840 if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() ||
1841 isReleaseOrStronger(Order)) {
1842 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1843 LOADCnt |= true;
1844 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1845 STORECnt |= true;
1846 }
1847 break;
1848 case SIAtomicScope::WAVEFRONT:
1849 case SIAtomicScope::SINGLETHREAD:
1850 // The L0 cache keeps all memory operations in order for
1851 // work-items in the same wavefront.
1852 break;
1853 default:
1854 llvm_unreachable("Unsupported synchronization scope");
1855 }
1856 }
1857
1858 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1859 switch (Scope) {
1860 case SIAtomicScope::SYSTEM:
1861 case SIAtomicScope::AGENT:
1862 case SIAtomicScope::CLUSTER:
1863 case SIAtomicScope::WORKGROUP:
1864 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1865 // not needed as LDS operations for all waves are executed in a total
1866 // global ordering as observed by all waves. Required if also
1867 // synchronizing with global/GDS memory as LDS operations could be
1868 // reordered with respect to later global/GDS memory operations of the
1869 // same wave.
1870 DSCnt |= IsCrossAddrSpaceOrdering;
1871 break;
1872 case SIAtomicScope::WAVEFRONT:
1873 case SIAtomicScope::SINGLETHREAD:
1874 // The LDS keeps all memory operations in order for
1875 // the same wavefront.
1876 break;
1877 default:
1878 llvm_unreachable("Unsupported synchronization scope");
1879 }
1880 }
1881
1882 if (LOADCnt) {
1883 // Acquire sequences only need to wait on the previous atomic operation.
1884 // e.g. a typical sequence looks like
1885 // atomic load
1886 // (wait)
1887 // global_inv
1888 //
1889 // We do not have BVH or SAMPLE atomics, so the atomic load is always going
1890 // to be tracked using loadcnt.
1891 //
1892 // This also applies to fences. Fences cannot pair with an instruction
1893 // tracked with bvh/samplecnt as we don't have any atomics that do that.
1894 if (!AtomicsOnly && ST.hasImageInsts()) {
1895 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
1896 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
1897 }
1898 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
1899 Changed = true;
1900 }
1901
1902 if (STORECnt) {
1903 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
1904 Changed = true;
1905 }
1906
1907 if (DSCnt) {
1908 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
1909 Changed = true;
1910 }
1911
1912 if (Pos == Position::AFTER)
1913 --MI;
1914
1915 return Changed;
1916}
1917
1918bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1919 SIAtomicScope Scope,
1920 SIAtomicAddrSpace AddrSpace,
1921 Position Pos) const {
1922 if (!InsertCacheInv)
1923 return false;
1924
1925 MachineBasicBlock &MBB = *MI->getParent();
1926 DebugLoc DL = MI->getDebugLoc();
1927
1928 /// The scratch address space does not need the global memory cache
1929 /// to be flushed as all memory operations by the same thread are
1930 /// sequentially consistent, and no other thread can access scratch
1931 /// memory.
1932
1933 /// Other address spaces do not have a cache.
1934 if (!canAffectGlobalAddrSpace(AddrSpace))
1935 return false;
1936
1937 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
1938 switch (Scope) {
1939 case SIAtomicScope::SYSTEM:
1940 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
1941 break;
1942 case SIAtomicScope::AGENT:
1943 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
1944 break;
1945 case SIAtomicScope::CLUSTER:
1946 ScopeImm = AMDGPU::CPol::SCOPE_SE;
1947 break;
1948 case SIAtomicScope::WORKGROUP:
1949 // GFX12.0:
1950 // In WGP mode the waves of a work-group can be executing on either CU of
1951 // the WGP. Therefore we need to invalidate the L0 which is per CU.
1952 // Otherwise in CU mode all waves of a work-group are on the same CU, and
1953 // so the L0 does not need to be invalidated.
1954 //
1955 // GFX12.5 has a shared WGP$, so no invalidates are required.
1956 if (ST.isCuModeEnabled())
1957 return false;
1958
1959 ScopeImm = AMDGPU::CPol::SCOPE_SE;
1960 break;
1961 case SIAtomicScope::WAVEFRONT:
1962 case SIAtomicScope::SINGLETHREAD:
1963 // No cache to invalidate.
1964 return false;
1965 default:
1966 llvm_unreachable("Unsupported synchronization scope");
1967 }
1968
1969 if (Pos == Position::AFTER)
1970 ++MI;
1971
1972 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
1973
1974 if (Pos == Position::AFTER)
1975 --MI;
1976
1977 return true;
1978}
1979
1980bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1981 SIAtomicScope Scope,
1982 SIAtomicAddrSpace AddrSpace,
1983 bool IsCrossAddrSpaceOrdering,
1984 Position Pos) const {
1985 bool Changed = false;
1986
1987 MachineBasicBlock &MBB = *MI->getParent();
1988 DebugLoc DL = MI->getDebugLoc();
1989
1990 // The scratch address space does not need the global memory cache
1991 // writeback as all memory operations by the same thread are
1992 // sequentially consistent, and no other thread can access scratch
1993 // memory.
1994 if (canAffectGlobalAddrSpace(AddrSpace)) {
1995 if (Pos == Position::AFTER)
1996 ++MI;
1997
1998 // global_wb is only necessary at system scope for GFX12.0. It is
1999 // also necessary at device scope for GFX12.5, as stores there
2000 // cannot report completion earlier than the L2.
2001 //
2002 // Emitting it for lower scopes is a slow no-op, so we omit it
2003 // for performance.
2004 switch (Scope) {
2005 case SIAtomicScope::SYSTEM:
2006 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2007 .addImm(AMDGPU::CPol::SCOPE_SYS);
2008 Changed = true;
2009 break;
2010 case SIAtomicScope::AGENT:
2011 // GFX12.5 may have >1 L2 per device so we must emit a device scope WB.
2012 if (ST.hasGFX1250Insts()) {
2013 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2014 .addImm(AMDGPU::CPol::SCOPE_DEV);
2015 Changed = true;
2016 }
2017 break;
2018 case SIAtomicScope::CLUSTER:
2019 case SIAtomicScope::WORKGROUP:
2020 // No WB necessary, but we still have to wait.
2021 case SIAtomicScope::WAVEFRONT:
2022 case SIAtomicScope::SINGLETHREAD:
2023 // No WB or wait necessary here, but insertWait takes care of that.
2024 break;
2025 default:
2026 llvm_unreachable("Unsupported synchronization scope");
2027 }
2028
2029 if (Pos == Position::AFTER)
2030 --MI;
2031 }
2032
2033 // We always have to wait for previous memory operations (load/store) to
2034 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2035 // we of course need to wait for that as well.
2036 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2037 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
2038 /*AtomicsOnly=*/false);
2039
2040 return Changed;
2041}
2042
2043bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2044 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2045 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2046
2047 // Only handle load and store, not atomic read-modify-write instructions.
2048 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
2049
2050 // Only update load and store, not LLVM IR atomic read-modify-write
2051 // instructions. The latter are always marked as volatile so cannot sensibly
2052 // handle it as do not want to pessimize all atomics. Also they do not support
2053 // the nontemporal attribute.
2054 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2055
2056 bool Changed = false;
2057
2058 if (IsLastUse) {
2059 // Set last-use hint.
2060 Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2061 } else if (IsNonTemporal) {
2062 // Set non-temporal hint for all cache levels.
2063 Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2064 }
2065
2066 if (IsVolatile) {
2067 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2068
2069 if (ST.requiresWaitXCntForSingleAccessInstructions() &&
2070 SIInstrInfo::isVMEM(*MI)) {
2071 MachineBasicBlock &MBB = *MI->getParent();
2072 BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2073 Changed = true;
2074 }
2075
2076 // Ensure operation has completed at system scope to cause all volatile
2077 // operations to be visible outside the program in a global order. Do not
2078 // request cross address space as only the global address space can be
2079 // observable outside the program, so no need to cause a waitcnt for LDS
2080 // address space operations.
2081 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2082 Position::AFTER, AtomicOrdering::Unordered,
2083 /*AtomicsOnly=*/false);
2084 }
2085
2086 return Changed;
2087}
2088
2089bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2090 assert(MI.mayStore() && "Not a Store inst");
2091 const bool IsRMW = (MI.mayLoad() && MI.mayStore());
2092 bool Changed = false;
2093
2094 if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() &&
2095 SIInstrInfo::isVMEM(MI)) {
2096 MachineBasicBlock &MBB = *MI.getParent();
2097 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2098 Changed = true;
2099 }
2100
2101 // Remaining fixes do not apply to RMWs.
2102 if (IsRMW)
2103 return Changed;
2104
2105 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2106 if (!CPol) // Some vmem operations do not have a scope and are not concerned.
2107 return Changed;
2108 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2109
2110 // GFX12.0 only: Extra waits needed before system scope stores.
2111 if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic &&
2112 Scope == CPol::SCOPE_SYS)
2113 Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
2114
2115 return Changed;
2116}
2117
2118bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
2119 if (!ST.hasGFX1250Insts())
2120 return false;
2121
2122 // Cooperative atomics need to be SCOPE_DEV or higher.
2123 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2124 assert(CPol && "No CPol operand?");
2125 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2126 if (Scope < CPol::SCOPE_DEV)
2127 return setScope(MI, CPol::SCOPE_DEV);
2128 return false;
2129}
2130
2131bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2132 SIAtomicScope Scope,
2133 SIAtomicAddrSpace AddrSpace) const {
2134 bool Changed = false;
2135
2136 if (canAffectGlobalAddrSpace(AddrSpace)) {
2137 switch (Scope) {
2138 case SIAtomicScope::SYSTEM:
2139 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2140 break;
2141 case SIAtomicScope::AGENT:
2142 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2143 break;
2144 case SIAtomicScope::CLUSTER:
2145 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2146 break;
2147 case SIAtomicScope::WORKGROUP:
2148 // In workgroup mode, SCOPE_SE is needed as waves can execute on
2149 // different CUs that access different L0s.
2150 if (!ST.isCuModeEnabled())
2151 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2152 break;
2153 case SIAtomicScope::WAVEFRONT:
2154 case SIAtomicScope::SINGLETHREAD:
2155 // No cache to bypass.
2156 break;
2157 default:
2158 llvm_unreachable("Unsupported synchronization scope");
2159 }
2160 }
2161
2162 // The scratch address space does not need the global memory caches
2163 // to be bypassed as all memory operations by the same thread are
2164 // sequentially consistent, and no other thread can access scratch
2165 // memory.
2166
2167 // Other address spaces do not have a cache.
2168
2169 return Changed;
2170}
2171
2172bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2173 if (AtomicPseudoMIs.empty())
2174 return false;
2175
2176 for (auto &MI : AtomicPseudoMIs)
2177 MI->eraseFromParent();
2178
2179 AtomicPseudoMIs.clear();
2180 return true;
2181}
2182
2183bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2184 MachineBasicBlock::iterator &MI) {
2185 assert(MI->mayLoad() && !MI->mayStore());
2186
2187 bool Changed = false;
2188
2189 if (MOI.isAtomic()) {
2190 const AtomicOrdering Order = MOI.getOrdering();
2191 if (Order == AtomicOrdering::Monotonic ||
2192 Order == AtomicOrdering::Acquire ||
2193 Order == AtomicOrdering::SequentiallyConsistent) {
2194 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2195 MOI.getOrderingAddrSpace());
2196 }
2197
2198 // Handle cooperative atomics after the cache bypass step, as it may widen
2199 // the scope of the instruction.
2200 if (MOI.isCooperative())
2201 Changed |= CC->handleCooperativeAtomic(*MI);
2202
2203 if (Order == AtomicOrdering::SequentiallyConsistent)
2204 Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2205 SIMemOp::LOAD | SIMemOp::STORE,
2206 MOI.getIsCrossAddressSpaceOrdering(),
2207 Position::BEFORE, Order, /*AtomicsOnly=*/false);
2208
2209 if (Order == AtomicOrdering::Acquire ||
2210 Order == AtomicOrdering::SequentiallyConsistent) {
2211 // The wait below only needs to wait on the prior atomic.
2212 Changed |=
2213 CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2214 SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(),
2215 Position::AFTER, Order, /*AtomicsOnly=*/true);
2216 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2217 MOI.getOrderingAddrSpace(),
2218 Position::AFTER);
2219 }
2220
2221 return Changed;
2222 }
2223
2224 // Atomic instructions already bypass caches to the scope specified by the
2225 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2226 // instructions need additional treatment.
2227 Changed |= CC->enableVolatileAndOrNonTemporal(
2228 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2229 MOI.isNonTemporal(), MOI.isLastUse());
2230
2231 return Changed;
2232}
2233
2234bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2235 MachineBasicBlock::iterator &MI) {
2236 assert(!MI->mayLoad() && MI->mayStore());
2237
2238 bool Changed = false;
2239 // FIXME: Necessary hack because iterator can lose track of the store.
2240 MachineInstr &StoreMI = *MI;
2241
2242 if (MOI.isAtomic()) {
2243 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2244 MOI.getOrdering() == AtomicOrdering::Release ||
2245 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2246 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2247 MOI.getOrderingAddrSpace());
2248 }
2249
2250 // Handle cooperative atomics after the cache bypass step, as it may widen
2251 // the scope of the instruction.
2252 if (MOI.isCooperative())
2253 Changed |= CC->handleCooperativeAtomic(*MI);
2254
2255 if (MOI.getOrdering() == AtomicOrdering::Release ||
2256 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2257 Changed |= CC->insertRelease(MI, MOI.getScope(),
2258 MOI.getOrderingAddrSpace(),
2259 MOI.getIsCrossAddressSpaceOrdering(),
2260 Position::BEFORE);
2261
2262 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
2263 return Changed;
2264 }
2265
2266 // Atomic instructions already bypass caches to the scope specified by the
2267 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2268 // need additional treatment.
2269 Changed |= CC->enableVolatileAndOrNonTemporal(
2270 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2271 MOI.isNonTemporal());
2272
2273 // GFX12 specific: scope (the desired coherence domain in the cache
2274 // hierarchy) is an instruction field; do not confuse it with atomic scope.
2275 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
2276 return Changed;
2277}
2278
2279bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2280 MachineBasicBlock::iterator &MI) {
2281 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2282
2283 AtomicPseudoMIs.push_back(MI);
2284 bool Changed = false;
2285
2286 const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
2287
2288 if (MOI.isAtomic()) {
2289 const AtomicOrdering Order = MOI.getOrdering();
2290 if (Order == AtomicOrdering::Acquire) {
2291 // Acquire fences only need to wait on the previous atomic they pair with.
2292 Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
2293 SIMemOp::LOAD | SIMemOp::STORE,
2294 MOI.getIsCrossAddressSpaceOrdering(),
2295 Position::BEFORE, Order, /*AtomicsOnly=*/true);
2296 }
2297
2298 if (Order == AtomicOrdering::Release ||
2299 Order == AtomicOrdering::AcquireRelease ||
2300 Order == AtomicOrdering::SequentiallyConsistent)
2301 /// TODO: This relies on a barrier always generating a waitcnt
2302 /// for LDS to ensure it is not reordered with the completion of
2303 /// the preceding LDS operations. If the barrier had a memory
2304 /// ordering and memory scope, then the library would not need to
2305 /// generate a fence. Could add support in this file for
2306 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2307 /// adding S_WAITCNT before a S_BARRIER.
2308 Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2309 MOI.getIsCrossAddressSpaceOrdering(),
2310 Position::BEFORE);
2311
2312 // TODO: If both release and invalidate are happening they could be combined
2313 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2314 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2315 // track cache invalidate and write back instructions.
2316
2317 if (Order == AtomicOrdering::Acquire ||
2318 Order == AtomicOrdering::AcquireRelease ||
2319 Order == AtomicOrdering::SequentiallyConsistent)
2320 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2321 Position::BEFORE);
2322
2323 return Changed;
2324 }
2325
2326 return Changed;
2327}
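To summarize the ordering-dependent branches above, the sketch below mirrors which steps expandAtomicFence performs for each atomic ordering; it is an illustrative helper (fenceStepsSketch is an assumed name) and not part of this pass.

// Illustrative sketch only: the legalizer steps taken by expandAtomicFence
// for a given fence ordering.
static void fenceStepsSketch(AtomicOrdering Order, bool &WaitOnPriorAtomic,
                             bool &NeedsRelease, bool &NeedsAcquire) {
  // An acquire fence only waits on the previous atomic it pairs with.
  WaitOnPriorAtomic = Order == AtomicOrdering::Acquire;
  // Release-or-stronger fences insert a release (writeback plus waits).
  NeedsRelease = Order == AtomicOrdering::Release ||
                 Order == AtomicOrdering::AcquireRelease ||
                 Order == AtomicOrdering::SequentiallyConsistent;
  // Acquire-or-stronger fences insert an acquire (cache invalidation).
  NeedsAcquire = Order == AtomicOrdering::Acquire ||
                 Order == AtomicOrdering::AcquireRelease ||
                 Order == AtomicOrdering::SequentiallyConsistent;
}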
2328
2329bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2330 MachineBasicBlock::iterator &MI) {
2331 assert(MI->mayLoad() && MI->mayStore());
2332
2333 bool Changed = false;
2334 MachineInstr &RMWMI = *MI;
2335
2336 if (MOI.isAtomic()) {
2337 const AtomicOrdering Order = MOI.getOrdering();
2338 if (Order == AtomicOrdering::Monotonic ||
2339 Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
2340 Order == AtomicOrdering::AcquireRelease ||
2341 Order == AtomicOrdering::SequentiallyConsistent) {
2342 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2343 MOI.getInstrAddrSpace());
2344 }
2345
2346 if (Order == AtomicOrdering::Release ||
2347 Order == AtomicOrdering::AcquireRelease ||
2348 Order == AtomicOrdering::SequentiallyConsistent ||
2349 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2350 Changed |= CC->insertRelease(MI, MOI.getScope(),
2351 MOI.getOrderingAddrSpace(),
2352 MOI.getIsCrossAddressSpaceOrdering(),
2353 Position::BEFORE);
2354
2355 if (Order == AtomicOrdering::Acquire ||
2356 Order == AtomicOrdering::AcquireRelease ||
2357 Order == AtomicOrdering::SequentiallyConsistent ||
2358 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2359 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2360 // Only wait on the previous atomic.
2361 Changed |=
2362 CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2363 isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2364 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER,
2365 Order, /*AtomicsOnly=*/true);
2366 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2367 MOI.getOrderingAddrSpace(),
2368 Position::AFTER);
2369 }
2370
2371 Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
2372 return Changed;
2373 }
2374
2375 return Changed;
2376}
2377
2378bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
2379 MachineBasicBlock::iterator &MI) {
2380 assert(MI->mayLoad() && MI->mayStore());
2381
2382 // The volatility or nontemporal-ness of the operation is a
2383 // function of the global memory, not the LDS.
2384 SIMemOp OpKind =
2385 SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE;
2386
2387 // Handle volatile and/or nontemporal markers on direct-to-LDS loads and
2388 // stores. The operation is treated as a volatile/nontemporal store
2389 // to its second argument.
2390 return CC->enableVolatileAndOrNonTemporal(
2391 MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(),
2392 MOI.isNonTemporal(), MOI.isLastUse());
2393}
2394
2395bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2396 const MachineModuleInfo &MMI =
2397 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2398 return SIMemoryLegalizer(MMI).run(MF);
2399}
2400
2401 PreservedAnalyses
2402 SIMemoryLegalizerPass::run(MachineFunction &MF,
2403 MachineFunctionAnalysisManager &MFAM) {
2404 auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
2405 .getCachedResult<MachineModuleAnalysis>(
2406 *MF.getFunction().getParent());
2407 assert(MMI && "MachineModuleAnalysis must be available");
2408 if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
2409 return PreservedAnalyses::all();
2410 return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
2411}
2412
2413bool SIMemoryLegalizer::run(MachineFunction &MF) {
2414 bool Changed = false;
2415
2416 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2417 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
2418 CC = SICacheControl::create(ST);
2419
2420 for (auto &MBB : MF) {
2421 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2422
2423 // Unbundle instructions after the post-RA scheduler.
2424 if (MI->isBundle() && MI->mayLoadOrStore()) {
2425 MachineBasicBlock::instr_iterator II(MI->getIterator());
2426 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2427 I != E && I->isBundledWithPred(); ++I) {
2428 I->unbundleFromPred();
2429 for (MachineOperand &MO : I->operands())
2430 if (MO.isReg())
2431 MO.setIsInternalRead(false);
2432 }
2433
2434 MI->eraseFromParent();
2435 MI = II->getIterator();
2436 }
2437
2438 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2439 continue;
2440
2441 if (const auto &MOI = MOA.getLoadInfo(MI)) {
2442 Changed |= expandLoad(*MOI, MI);
2443 } else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2444 Changed |= expandStore(*MOI, MI);
2445 } else if (const auto &MOI = MOA.getLDSDMAInfo(MI)) {
2446 Changed |= expandLDSDMA(*MOI, MI);
2447 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) {
2448 Changed |= expandAtomicFence(*MOI, MI);
2449 } else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) {
2450 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2451 }
2452 }
2453 }
2454
2455 Changed |= removeAtomicPseudoMIs();
2456 return Changed;
2457}
2458
2459INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)
2460
2461char SIMemoryLegalizerLegacy::ID = 0;
2462char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
2463
2464 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2465 return new SIMemoryLegalizerLegacy();
2466}