SIMemoryLegalizer.cpp
1//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Memory legalizer - implements memory model. More information can be
11/// found here:
12/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13//
14//===----------------------------------------------------------------------===//
15
16#include "AMDGPU.h"
18#include "GCNSubtarget.h"
27#include "llvm/IR/PassManager.h"
31
32using namespace llvm;
33using namespace llvm::AMDGPU;
34
35#define DEBUG_TYPE "si-memory-legalizer"
36#define PASS_NAME "SI Memory Legalizer"
37
38static cl::opt<bool> AmdgcnSkipCacheInvalidations(
39 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
40 cl::desc("Use this to skip inserting cache invalidating instructions."));
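// Illustrative usage of the flag above (an assumed command line, not part of
// this file): it is a hidden codegen option, e.g.
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgcn-skip-cache-invalidations in.ll
// Skipping invalidations is intended for experiments only and can violate the
// memory model described in AMDGPUUsage.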
41
42namespace {
43
44LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
45
46/// Memory operation flags. Can be ORed together.
47enum class SIMemOp {
48 NONE = 0u,
49 LOAD = 1u << 0,
50 STORE = 1u << 1,
51 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
52};
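// Illustrative use of the bitmask enum above (not part of this file): the
// LLVM_MARK_AS_BITMASK_ENUM marker enables |, & and ~ on SIMemOp, e.g.
//   SIMemOp Op = SIMemOp::LOAD | SIMemOp::STORE;
//   bool OrdersLoads = (Op & SIMemOp::LOAD) != SIMemOp::NONE;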
53
54/// Position to insert a new instruction relative to an existing
55/// instruction.
56enum class Position {
57 BEFORE,
58 AFTER
59};
60
61/// The atomic synchronization scopes supported by the AMDGPU target.
62enum class SIAtomicScope {
63 NONE,
64 SINGLETHREAD,
65 WAVEFRONT,
66 WORKGROUP,
67 CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
68 AGENT,
69 SYSTEM
70};
71
72/// The distinct address spaces supported by the AMDGPU target for
73/// atomic memory operation. Can be ORed together.
74enum class SIAtomicAddrSpace {
75 NONE = 0u,
76 GLOBAL = 1u << 0,
77 LDS = 1u << 1,
78 SCRATCH = 1u << 2,
79 GDS = 1u << 3,
80 OTHER = 1u << 4,
81
82 /// The address spaces that can be accessed by a FLAT instruction.
83 FLAT = GLOBAL | LDS | SCRATCH,
84
85 /// The address spaces that support atomic instructions.
86 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
87
88 /// All address spaces.
89 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
90
91 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
92};
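// Illustrative use of the address-space mask above (not part of this file).
// The legalizer repeatedly asks whether an operation can touch a given space:
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS;
//   bool TouchesGlobal = (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;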
93
94class SIMemOpInfo final {
95private:
96
97 friend class SIMemOpAccess;
98
99 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
100 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
101 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
102 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
103 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
104 bool IsCrossAddressSpaceOrdering = false;
105 bool IsVolatile = false;
106 bool IsNonTemporal = false;
107 bool IsLastUse = false;
108 bool IsCooperative = false;
109
110 // TODO: Should we assume Cooperative=true if no MMO is present?
111 SIMemOpInfo(
112 const GCNSubtarget &ST,
113 AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
114 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
115 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
116 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
117 bool IsCrossAddressSpaceOrdering = true,
118 AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
119 bool IsVolatile = false, bool IsNonTemporal = false,
120 bool IsLastUse = false, bool IsCooperative = false)
121 : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
122 OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
123 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
124 IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
125 IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
126
127 if (Ordering == AtomicOrdering::NotAtomic) {
128 assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
129 assert(Scope == SIAtomicScope::NONE &&
130 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
131 !IsCrossAddressSpaceOrdering &&
132 FailureOrdering == AtomicOrdering::NotAtomic);
133 return;
134 }
135
136 assert(Scope != SIAtomicScope::NONE &&
137 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
138 SIAtomicAddrSpace::NONE &&
139 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
140 SIAtomicAddrSpace::NONE);
141
142 // There is also no cross address space ordering if the ordering
143 // address space is the same as the instruction address space and
144 // only contains a single address space.
145 if ((OrderingAddrSpace == InstrAddrSpace) &&
146 isPowerOf2_32(uint32_t(InstrAddrSpace)))
147 this->IsCrossAddressSpaceOrdering = false;
148
149 // Limit the scope to the maximum supported by the instruction's address
150 // spaces.
151 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
152 SIAtomicAddrSpace::NONE) {
153 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
154 } else if ((InstrAddrSpace &
155 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
156 SIAtomicAddrSpace::NONE) {
157 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
158 } else if ((InstrAddrSpace &
159 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
160 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
161 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
162 }
163
164 // On targets that have no concept of a workgroup cluster, use
165 // AGENT scope as a conservatively correct alternative.
166 if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
167 this->Scope = SIAtomicScope::AGENT;
168 }
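 // Illustrative consequence of the clamping above (not part of this file): an
 // atomic whose InstrAddrSpace covers only LDS (and possibly SCRATCH) cannot
 // be observed beyond its work-group, so a requested SIAtomicScope::AGENT is
 // reduced to WORKGROUP, and a CLUSTER request on a target without clusters
 // is widened to AGENT.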
169
170public:
171 /// \returns Atomic synchronization scope of the machine instruction used to
172 /// create this SIMemOpInfo.
173 SIAtomicScope getScope() const {
174 return Scope;
175 }
176
177 /// \returns Ordering constraint of the machine instruction used to
178 /// create this SIMemOpInfo.
179 AtomicOrdering getOrdering() const {
180 return Ordering;
181 }
182
183 /// \returns Failure ordering constraint of the machine instruction used to
184 /// create this SIMemOpInfo.
185 AtomicOrdering getFailureOrdering() const {
186 return FailureOrdering;
187 }
188
189 /// \returns The address spaces accessed by the machine
190 /// instruction used to create this SIMemOpInfo.
191 SIAtomicAddrSpace getInstrAddrSpace() const {
192 return InstrAddrSpace;
193 }
194
195 /// \returns The address spaces that must be ordered by the machine
196 /// instruction used to create this SIMemOpInfo.
197 SIAtomicAddrSpace getOrderingAddrSpace() const {
198 return OrderingAddrSpace;
199 }
200
201 /// \returns Return true iff memory ordering of operations on
202 /// different address spaces is required.
203 bool getIsCrossAddressSpaceOrdering() const {
204 return IsCrossAddressSpaceOrdering;
205 }
206
207 /// \returns True if memory access of the machine instruction used to
208 /// create this SIMemOpInfo is volatile, false otherwise.
209 bool isVolatile() const {
210 return IsVolatile;
211 }
212
213 /// \returns True if memory access of the machine instruction used to
214 /// create this SIMemOpInfo is nontemporal, false otherwise.
215 bool isNonTemporal() const {
216 return IsNonTemporal;
217 }
218
219 /// \returns True if memory access of the machine instruction used to
220 /// create this SIMemOpInfo is last use, false otherwise.
221 bool isLastUse() const { return IsLastUse; }
222
223 /// \returns True if this is a cooperative load or store atomic.
224 bool isCooperative() const { return IsCooperative; }
225
226 /// \returns True if ordering constraint of the machine instruction used to
227 /// create this SIMemOpInfo is unordered or higher, false otherwise.
228 bool isAtomic() const {
229 return Ordering != AtomicOrdering::NotAtomic;
230 }
231
232};
233
234class SIMemOpAccess final {
235private:
236 const AMDGPUMachineModuleInfo *MMI = nullptr;
237 const GCNSubtarget &ST;
238
239 /// Reports unsupported message \p Msg for \p MI to LLVM context.
240 void reportUnsupported(const MachineBasicBlock::iterator &MI,
241 const char *Msg) const;
242
243 /// Inspects the target synchronization scope \p SSID and determines
244 /// the SI atomic scope it corresponds to, the address spaces it
245 /// covers, and whether the memory ordering applies between address
246 /// spaces.
247 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
248 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
249
250 /// \return Return a bit set of the address spaces accessed by \p AS.
251 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
252
253 /// \returns Info constructed from \p MI, which has at least one machine memory
254 /// operand.
255 std::optional<SIMemOpInfo>
256 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
257
258public:
259 /// Construct class to support accessing the machine memory operands
260 /// of instructions in the machine function \p MF.
261 SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
262
263 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
264 std::optional<SIMemOpInfo>
265 getLoadInfo(const MachineBasicBlock::iterator &MI) const;
266
267 /// \returns Store info if \p MI is a store operation, "std::nullopt"
268 /// otherwise.
269 std::optional<SIMemOpInfo>
270 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
271
272 /// \returns Atomic fence info if \p MI is an atomic fence operation,
273 /// "std::nullopt" otherwise.
274 std::optional<SIMemOpInfo>
275 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
276
277 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
278 /// rmw operation, "std::nullopt" otherwise.
279 std::optional<SIMemOpInfo>
280 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
281
282 /// \returns DMA to LDS info if \p MI is a direct-to/from-LDS load/store,
283 /// along with an indication of whether this is a load or store. If it is not
284 /// a direct-to-LDS operation, returns std::nullopt.
285 std::optional<SIMemOpInfo>
286 getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
287};
288
289class SICacheControl {
290protected:
291
292 /// AMDGPU subtarget info.
293 const GCNSubtarget &ST;
294
295 /// Instruction info.
296 const SIInstrInfo *TII = nullptr;
297
298 IsaVersion IV;
299
300 /// Whether to insert cache invalidating instructions.
301 bool InsertCacheInv;
302
303 SICacheControl(const GCNSubtarget &ST);
304
305 /// Sets CPol \p Bits to "true" if present in instruction \p MI.
306 /// \returns Returns true if \p MI is modified, false otherwise.
307 bool enableCPolBits(const MachineBasicBlock::iterator MI,
308 unsigned Bits) const;
309
310 /// Check if any atomic operation on AS can affect memory accessible via the
311 /// global address space.
312 bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
313
314public:
315 using CPol = AMDGPU::CPol::CPol;
316
317 /// Create a cache control for the subtarget \p ST.
318 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
319
320 /// Update \p MI memory load instruction to bypass any caches up to
321 /// the \p Scope memory scope for address spaces \p
322 /// AddrSpace. Return true iff the instruction was modified.
323 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
324 SIAtomicScope Scope,
325 SIAtomicAddrSpace AddrSpace) const = 0;
326
327 /// Update \p MI memory store instruction to bypass any caches up to
328 /// the \p Scope memory scope for address spaces \p
329 /// AddrSpace. Return true iff the instruction was modified.
330 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
331 SIAtomicScope Scope,
332 SIAtomicAddrSpace AddrSpace) const = 0;
333
334 /// Update \p MI memory read-modify-write instruction to bypass any caches up
335 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
336 /// iff the instruction was modified.
337 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
338 SIAtomicScope Scope,
339 SIAtomicAddrSpace AddrSpace) const = 0;
340
341 /// Update \p MI memory instruction of kind \p Op associated with address
342 /// spaces \p AddrSpace to indicate it is volatile and/or
343 /// nontemporal/last-use. Return true iff the instruction was modified.
344 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
345 SIAtomicAddrSpace AddrSpace,
346 SIMemOp Op, bool IsVolatile,
347 bool IsNonTemporal,
348 bool IsLastUse = false) const = 0;
349
350 /// Add final touches to a `mayStore` instruction \p MI, which may be a
351 /// Store or RMW instruction.
352 /// FIXME: This takes a MI because iterators aren't handled properly. When
353 /// this is called, they often point to entirely different insts. Thus we back
354 /// up the inst early and pass it here instead.
355 virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
356 return false;
357 };
358
359 /// Handle cooperative load/store atomics.
360 virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
361 reportFatalUsageError(
362 "cooperative atomics are not available on this architecture");
363 }
364
365 /// Inserts any necessary instructions at position \p Pos relative
366 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
367 /// \p Op associated with address spaces \p AddrSpace have completed. Used
368 /// between memory instructions to enforce the order they become visible as
369 /// observed by other memory instructions executing in memory scope \p Scope.
370 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
371 /// address spaces. If \p AtomicsOnly is true, only insert waits for counters
372 /// that are used by atomic instructions.
373 /// Returns true iff any instructions inserted.
374 virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
375 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
376 bool IsCrossAddrSpaceOrdering, Position Pos,
377 AtomicOrdering Order, bool AtomicsOnly) const = 0;
378
379 /// Inserts any necessary instructions at position \p Pos relative to
380 /// instruction \p MI to ensure any subsequent memory instructions of this
381 /// thread with address spaces \p AddrSpace will observe the previous memory
382 /// operations by any thread for memory scopes up to memory scope \p Scope .
383 /// Returns true iff any instructions inserted.
384 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
385 SIAtomicScope Scope,
386 SIAtomicAddrSpace AddrSpace,
387 Position Pos) const = 0;
388
389 /// Inserts any necessary instructions at position \p Pos relative to
390 /// instruction \p MI to ensure previous memory instructions by this thread
391 /// with address spaces \p AddrSpace have completed and can be observed by
392 /// subsequent memory instructions by any thread executing in memory scope \p
393 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
394 /// between address spaces. Returns true iff any instructions inserted.
395 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
396 SIAtomicScope Scope,
397 SIAtomicAddrSpace AddrSpace,
398 bool IsCrossAddrSpaceOrdering,
399 Position Pos) const = 0;
400
401 /// Virtual destructor to allow derivations to be deleted.
402 virtual ~SICacheControl() = default;
403};
404
405/// Generates code sequences for the memory model of all GFX targets below
406/// GFX10.
407class SIGfx6CacheControl final : public SICacheControl {
408public:
409
410 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
411
412 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
413 SIAtomicScope Scope,
414 SIAtomicAddrSpace AddrSpace) const override;
415
416 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
417 SIAtomicScope Scope,
418 SIAtomicAddrSpace AddrSpace) const override;
419
420 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
421 SIAtomicScope Scope,
422 SIAtomicAddrSpace AddrSpace) const override;
423
424 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
425 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
426 bool IsVolatile, bool IsNonTemporal,
427 bool IsLastUse) const override;
428
429 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
430 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
431 bool IsCrossAddrSpaceOrdering, Position Pos,
432 AtomicOrdering Order, bool AtomicsOnly) const override;
433
434 bool insertAcquire(MachineBasicBlock::iterator &MI,
435 SIAtomicScope Scope,
436 SIAtomicAddrSpace AddrSpace,
437 Position Pos) const override;
438
439 bool insertRelease(MachineBasicBlock::iterator &MI,
440 SIAtomicScope Scope,
441 SIAtomicAddrSpace AddrSpace,
442 bool IsCrossAddrSpaceOrdering,
443 Position Pos) const override;
444};
445
446/// Generates code sequences for the memory model of GFX10/11.
447class SIGfx10CacheControl final : public SICacheControl {
448public:
449 SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
450
451 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
452 SIAtomicScope Scope,
453 SIAtomicAddrSpace AddrSpace) const override;
454
455 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
456 SIAtomicScope Scope,
457 SIAtomicAddrSpace AddrSpace) const override {
458 return false;
459 }
460
461 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
462 SIAtomicScope Scope,
463 SIAtomicAddrSpace AddrSpace) const override {
464 return false;
465 }
466
467 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
468 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
469 bool IsVolatile, bool IsNonTemporal,
470 bool IsLastUse) const override;
471
472 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
473 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
474 bool IsCrossAddrSpaceOrdering, Position Pos,
475 AtomicOrdering Order, bool AtomicsOnly) const override;
476
477 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
478 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
479
480 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
481 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
482 Position Pos) const override {
483 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
484 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
485 /*AtomicsOnly=*/false);
486 }
487};
488
489class SIGfx12CacheControl final : public SICacheControl {
490protected:
491 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
492 // \returns Returns true if \p MI is modified, false otherwise.
493 bool setTH(const MachineBasicBlock::iterator MI,
494 AMDGPU::CPol::CPol Value) const;
495
496 // Sets Scope policy to \p Value if CPol operand is present in instruction \p
497 // MI. \returns Returns true if \p MI is modified, false otherwise.
498 bool setScope(const MachineBasicBlock::iterator MI,
499 AMDGPU::CPol::CPol Value) const;
500
501 // Stores with system scope (SCOPE_SYS) need to wait for:
502 // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
503 // - non-returning-atomics - wait for STORECNT==0
504 // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
505 // since it does not distinguish atomics-with-return from regular stores.
506 // There is no need to wait if memory is cached (mtype != UC).
507 bool
508 insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
509
510 bool setAtomicScope(const MachineBasicBlock::iterator &MI,
511 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
512
513public:
514 SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {
515 // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
516 // the behavior is the same if assuming GFX12.0 in CU mode.
517 assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
518 }
519
520 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
521 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
522 bool IsCrossAddrSpaceOrdering, Position Pos,
523 AtomicOrdering Order, bool AtomicsOnly) const override;
524
525 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
526 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
527
528 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
529 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
530 bool IsVolatile, bool IsNonTemporal,
531 bool IsLastUse) const override;
532
533 bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
534
535 bool handleCooperativeAtomic(MachineInstr &MI) const override;
536
537 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
538 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
539 Position Pos) const override;
540
541 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
542 SIAtomicScope Scope,
543 SIAtomicAddrSpace AddrSpace) const override {
544 return setAtomicScope(MI, Scope, AddrSpace);
545 }
546
547 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
548 SIAtomicScope Scope,
549 SIAtomicAddrSpace AddrSpace) const override {
550 return setAtomicScope(MI, Scope, AddrSpace);
551 }
552
553 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
554 SIAtomicScope Scope,
555 SIAtomicAddrSpace AddrSpace) const override {
556 return setAtomicScope(MI, Scope, AddrSpace);
557 }
558};
559
560class SIMemoryLegalizer final {
561private:
562 const MachineModuleInfo &MMI;
563 /// Cache Control.
564 std::unique_ptr<SICacheControl> CC = nullptr;
565
566 /// List of atomic pseudo instructions.
567 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
568
569 /// Return true iff instruction \p MI is an atomic instruction that
570 /// returns a result.
571 bool isAtomicRet(const MachineInstr &MI) const {
572 return SIInstrInfo::isAtomicRet(MI);
573 }
574
575 /// Removes all processed atomic pseudo instructions from the current
576 /// function. Returns true if current function is modified, false otherwise.
577 bool removeAtomicPseudoMIs();
578
579 /// Expands load operation \p MI. Returns true if instructions are
580 /// added/deleted or \p MI is modified, false otherwise.
581 bool expandLoad(const SIMemOpInfo &MOI,
582 MachineBasicBlock::iterator &MI);
583 /// Expands store operation \p MI. Returns true if instructions are
584 /// added/deleted or \p MI is modified, false otherwise.
585 bool expandStore(const SIMemOpInfo &MOI,
587 /// Expands atomic fence operation \p MI. Returns true if
588 /// instructions are added/deleted or \p MI is modified, false otherwise.
589 bool expandAtomicFence(const SIMemOpInfo &MOI,
590 MachineBasicBlock::iterator &MI);
591 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
592 /// instructions are added/deleted or \p MI is modified, false otherwise.
593 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
594 MachineBasicBlock::iterator &MI);
595 /// Expands LDS DMA operation \p MI. Returns true if instructions are
596 /// added/deleted or \p MI is modified, false otherwise.
597 bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);
598
599public:
600 SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
601 bool run(MachineFunction &MF);
602};
603
604class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
605public:
606 static char ID;
607
608 SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {}
609
610 void getAnalysisUsage(AnalysisUsage &AU) const override {
611 AU.setPreservesCFG();
612 MachineFunctionPass::getAnalysisUsage(AU);
613 }
614
615 StringRef getPassName() const override {
616 return PASS_NAME;
617 }
618
619 bool runOnMachineFunction(MachineFunction &MF) override;
620};
621
622static const StringMap<SIAtomicAddrSpace> ASNames = {{
623 {"global", SIAtomicAddrSpace::GLOBAL},
624 {"local", SIAtomicAddrSpace::LDS},
625}};
626
627void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
628 const MachineFunction *MF = MI.getMF();
629 const Function &Fn = MF->getFunction();
630 SmallString<128> Str;
631 raw_svector_ostream OS(Str);
632 OS << "unknown address space '" << AS << "'; expected one of ";
633 ListSeparator LS;
634 for (const auto &[Name, Val] : ASNames)
635 OS << LS << '\'' << Name << '\'';
636 Fn.getContext().diagnose(
637 DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
638}
639
640/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
641/// If this tag isn't present, or if it has no meaningful values, returns
642/// \p none, otherwise returns the address spaces specified by the MD.
643static std::optional<SIAtomicAddrSpace>
644getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
645 static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
646
647 auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
648 if (!MMRA)
649 return std::nullopt;
650
651 SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
652 for (const auto &[Prefix, Suffix] : MMRA) {
653 if (Prefix != FenceASPrefix)
654 continue;
655
656 if (auto It = ASNames.find(Suffix); It != ASNames.end())
657 Result |= It->second;
658 else
659 diagnoseUnknownMMRAASName(MI, Suffix);
660 }
661
662 if (Result == SIAtomicAddrSpace::NONE)
663 return std::nullopt;
664
665 return Result;
666}
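// Illustrative IR that produces the MMRA parsed above (an assumed example, not
// from this file): a fence tagged so that only the local (LDS) address space
// needs to be synchronized:
//   fence syncscope("workgroup") release, !mmra !0
//   !0 = !{!"amdgpu-synchronize-as", !"local"}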
667
668} // end anonymous namespace
669
670void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
671 const char *Msg) const {
672 const Function &Func = MI->getMF()->getFunction();
673 Func.getContext().diagnose(
674 DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
675}
676
677std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
678SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
679 SIAtomicAddrSpace InstrAddrSpace) const {
680 if (SSID == SyncScope::System)
681 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
682 if (SSID == MMI->getAgentSSID())
683 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
684 if (SSID == MMI->getClusterSSID())
685 return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
686 if (SSID == MMI->getWorkgroupSSID())
687 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
688 true);
689 if (SSID == MMI->getWavefrontSSID())
690 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
691 true);
692 if (SSID == SyncScope::SingleThread)
693 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
694 true);
695 if (SSID == MMI->getSystemOneAddressSpaceSSID())
696 return std::tuple(SIAtomicScope::SYSTEM,
697 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
698 if (SSID == MMI->getAgentOneAddressSpaceSSID())
699 return std::tuple(SIAtomicScope::AGENT,
700 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
701 if (SSID == MMI->getClusterOneAddressSpaceSSID())
702 return std::tuple(SIAtomicScope::CLUSTER,
703 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
704 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
705 return std::tuple(SIAtomicScope::WORKGROUP,
706 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
707 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
708 return std::tuple(SIAtomicScope::WAVEFRONT,
709 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
710 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
711 return std::tuple(SIAtomicScope::SINGLETHREAD,
712 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
713 return std::nullopt;
714}
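// Illustrative mapping (an assumed example, not from this file): the
// "one-address-space" scope names select the same SI scope but drop
// cross-address-space ordering, e.g.
//   store atomic i32 0, ptr addrspace(1) %p syncscope("agent-one-as") release, align 4
// maps to SIAtomicScope::AGENT with the cross-address-space flag set to false.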
715
716SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
717 if (AS == AMDGPUAS::FLAT_ADDRESS)
718 return SIAtomicAddrSpace::FLAT;
719 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
720 return SIAtomicAddrSpace::GLOBAL;
721 if (AS == AMDGPUAS::LOCAL_ADDRESS)
722 return SIAtomicAddrSpace::LDS;
723 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
724 return SIAtomicAddrSpace::SCRATCH;
725 if (AS == AMDGPUAS::REGION_ADDRESS)
726 return SIAtomicAddrSpace::GDS;
727 if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
728 AS == AMDGPUAS::BUFFER_STRIDED_POINTER)
729 return SIAtomicAddrSpace::GLOBAL;
730
731 return SIAtomicAddrSpace::OTHER;
732}
733
734SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
735 const GCNSubtarget &ST)
736 : MMI(&MMI_), ST(ST) {}
737
738std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
739 const MachineBasicBlock::iterator &MI) const {
740 assert(MI->getNumMemOperands() > 0);
741
742 SyncScope::ID SSID = SyncScope::SingleThread;
743 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
744 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
745 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
746 bool IsNonTemporal = true;
747 bool IsVolatile = false;
748 bool IsLastUse = false;
749 bool IsCooperative = false;
750
751 // Validator should check whether or not MMOs cover the entire set of
752 // locations accessed by the memory instruction.
753 for (const auto &MMO : MI->memoperands()) {
754 IsNonTemporal &= MMO->isNonTemporal();
755 IsVolatile |= MMO->isVolatile();
756 IsLastUse |= MMO->getFlags() & MOLastUse;
757 IsCooperative |= MMO->getFlags() & MOCooperative;
758 InstrAddrSpace |=
759 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
760 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
761 if (OpOrdering != AtomicOrdering::NotAtomic) {
762 const auto &IsSyncScopeInclusion =
763 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
764 if (!IsSyncScopeInclusion) {
765 reportUnsupported(MI,
766 "Unsupported non-inclusive atomic synchronization scope");
767 return std::nullopt;
768 }
769
770 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
771 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
772 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
773 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
774 FailureOrdering =
775 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
776 }
777 }
778
779 SIAtomicScope Scope = SIAtomicScope::NONE;
780 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
781 bool IsCrossAddressSpaceOrdering = false;
782 if (Ordering != AtomicOrdering::NotAtomic) {
783 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
784 if (!ScopeOrNone) {
785 reportUnsupported(MI, "Unsupported atomic synchronization scope");
786 return std::nullopt;
787 }
788 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
789 *ScopeOrNone;
790 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
791 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
792 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
793 reportUnsupported(MI, "Unsupported atomic address space");
794 return std::nullopt;
795 }
796 }
797 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
798 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
799 IsNonTemporal, IsLastUse, IsCooperative);
800}
801
802std::optional<SIMemOpInfo>
803SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
804 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
805
806 if (!(MI->mayLoad() && !MI->mayStore()))
807 return std::nullopt;
808
809 // Be conservative if there are no memory operands.
810 if (MI->getNumMemOperands() == 0)
811 return SIMemOpInfo(ST);
812
813 return constructFromMIWithMMO(MI);
814}
815
816std::optional<SIMemOpInfo>
817SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
818 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
819
820 if (!(!MI->mayLoad() && MI->mayStore()))
821 return std::nullopt;
822
823 // Be conservative if there are no memory operands.
824 if (MI->getNumMemOperands() == 0)
825 return SIMemOpInfo(ST);
826
827 return constructFromMIWithMMO(MI);
828}
829
830std::optional<SIMemOpInfo>
831SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
832 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
833
834 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
835 return std::nullopt;
836
837 AtomicOrdering Ordering =
838 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
839
840 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
841 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
842 if (!ScopeOrNone) {
843 reportUnsupported(MI, "Unsupported atomic synchronization scope");
844 return std::nullopt;
845 }
846
847 SIAtomicScope Scope = SIAtomicScope::NONE;
848 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
849 bool IsCrossAddressSpaceOrdering = false;
850 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
851 *ScopeOrNone;
852
853 if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
854 // We currently expect refineOrderingAS to be the only place that
855 // can refine the AS ordered by the fence.
856 // If that changes, we need to review the semantics of that function
857 // in case it needs to preserve certain address spaces.
858 reportUnsupported(MI, "Unsupported atomic address space");
859 return std::nullopt;
860 }
861
862 auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI);
863 if (SynchronizeAS)
864 OrderingAddrSpace = *SynchronizeAS;
865
866 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
867 SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
868 AtomicOrdering::NotAtomic);
869}
870
871std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
872 const MachineBasicBlock::iterator &MI) const {
873 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
874
875 if (!(MI->mayLoad() && MI->mayStore()))
876 return std::nullopt;
877
878 // Be conservative if there are no memory operands.
879 if (MI->getNumMemOperands() == 0)
880 return SIMemOpInfo(ST);
881
882 return constructFromMIWithMMO(MI);
883}
884
885std::optional<SIMemOpInfo>
886SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
887 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
888
889 if (!SIInstrInfo::isLDSDMA(*MI))
890 return std::nullopt;
891
892 return constructFromMIWithMMO(MI);
893}
894
895SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
896 TII = ST.getInstrInfo();
897 IV = getIsaVersion(ST.getCPU());
898 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
899}
900
901bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI,
902 unsigned Bits) const {
903 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
904 if (!CPol)
905 return false;
906
907 CPol->setImm(CPol->getImm() | Bits);
908 return true;
909}
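// Illustrative effect of enableCPolBits (not part of this file): ORing
// CPol::GLC into a load's cpol operand is what ultimately prints as the "glc"
// assembler modifier on pre-GFX12 targets, which is how the cache-bypass
// decisions below reach the ISA encoding.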
910
911bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
912 assert((!ST.hasGloballyAddressableScratch() ||
913 (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE ||
914 (AS & SIAtomicAddrSpace::SCRATCH) == SIAtomicAddrSpace::NONE) &&
915 "scratch instructions should already be replaced by flat "
916 "instructions if GloballyAddressableScratch is enabled");
917 return (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;
918}
919
920/* static */
921std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
922 GCNSubtarget::Generation Generation = ST.getGeneration();
923 if (Generation < AMDGPUSubtarget::GFX10)
924 return std::make_unique<SIGfx6CacheControl>(ST);
925 if (Generation < AMDGPUSubtarget::GFX12)
926 return std::make_unique<SIGfx10CacheControl>(ST);
927 return std::make_unique<SIGfx12CacheControl>(ST);
928}
929
930bool SIGfx6CacheControl::enableLoadCacheBypass(
931 const MachineBasicBlock::iterator &MI,
932 SIAtomicScope Scope,
933 SIAtomicAddrSpace AddrSpace) const {
934 assert(MI->mayLoad() && !MI->mayStore());
935
936 if (!canAffectGlobalAddrSpace(AddrSpace)) {
937 /// The scratch address space does not need the global memory caches
938 /// to be bypassed as all memory operations by the same thread are
939 /// sequentially consistent, and no other thread can access scratch
940 /// memory.
941
942 /// Other address spaces do not have a cache.
943 return false;
944 }
945
946 bool Changed = false;
947 switch (Scope) {
948 case SIAtomicScope::SYSTEM:
949 if (ST.hasGFX940Insts()) {
950 // Set SC bits to indicate system scope.
951 Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
952 break;
953 }
954 [[fallthrough]];
955 case SIAtomicScope::AGENT:
956 if (ST.hasGFX940Insts()) {
957 // Set SC bits to indicate agent scope.
958 Changed |= enableCPolBits(MI, CPol::SC1);
959 } else {
960 // Set L1 cache policy to MISS_EVICT.
961 // Note: there is no L2 cache bypass policy at the ISA level.
962 Changed |= enableCPolBits(MI, CPol::GLC);
963 }
964 break;
965 case SIAtomicScope::WORKGROUP:
966 if (ST.hasGFX940Insts()) {
967 // In threadgroup split mode the waves of a work-group can be executing
968 // on different CUs. Therefore need to bypass the L1 which is per CU.
969 // Otherwise in non-threadgroup split mode all waves of a work-group are
970 // on the same CU, and so the L1 does not need to be bypassed. Setting
971 // SC bits to indicate work-group scope will do this automatically.
972 Changed |= enableCPolBits(MI, CPol::SC0);
973 } else if (ST.hasGFX90AInsts()) {
974 // In threadgroup split mode the waves of a work-group can be executing
975 // on different CUs. Therefore need to bypass the L1 which is per CU.
976 // Otherwise in non-threadgroup split mode all waves of a work-group are
977 // on the same CU, and so the L1 does not need to be bypassed.
978 if (ST.isTgSplitEnabled())
979 Changed |= enableCPolBits(MI, CPol::GLC);
980 }
981 break;
982 case SIAtomicScope::WAVEFRONT:
983 case SIAtomicScope::SINGLETHREAD:
984 // No cache to bypass.
985 break;
986 default:
987 llvm_unreachable("Unsupported synchronization scope");
988 }
989
990 return Changed;
991}
992
993bool SIGfx6CacheControl::enableStoreCacheBypass(
994 const MachineBasicBlock::iterator &MI,
995 SIAtomicScope Scope,
996 SIAtomicAddrSpace AddrSpace) const {
997 assert(!MI->mayLoad() && MI->mayStore());
998 bool Changed = false;
999
1000 /// For targets other than GFX940, the L1 cache is write through so does not
1001 /// need to be bypassed. There is no bypass control for the L2 cache at the
1002 /// isa level.
1003
1004 if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
1005 switch (Scope) {
1006 case SIAtomicScope::SYSTEM:
1007 // Set SC bits to indicate system scope.
1008 Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
1009 break;
1010 case SIAtomicScope::AGENT:
1011 // Set SC bits to indicate agent scope.
1012 Changed |= enableCPolBits(MI, CPol::SC1);
1013 break;
1014 case SIAtomicScope::WORKGROUP:
1015 // Set SC bits to indicate workgroup scope.
1016 Changed |= enableCPolBits(MI, CPol::SC0);
1017 break;
1018 case SIAtomicScope::WAVEFRONT:
1019 case SIAtomicScope::SINGLETHREAD:
1020 // Leave SC bits unset to indicate wavefront scope.
1021 break;
1022 default:
1023 llvm_unreachable("Unsupported synchronization scope");
1024 }
1025
1026 /// The scratch address space does not need the global memory caches
1027 /// to be bypassed as all memory operations by the same thread are
1028 /// sequentially consistent, and no other thread can access scratch
1029 /// memory.
1030
1031 /// Other address spaces do not have a cache.
1032 }
1033
1034 return Changed;
1035}
1036
1037bool SIGfx6CacheControl::enableRMWCacheBypass(
1038 const MachineBasicBlock::iterator &MI,
1039 SIAtomicScope Scope,
1040 SIAtomicAddrSpace AddrSpace) const {
1041 assert(MI->mayLoad() && MI->mayStore());
1042 bool Changed = false;
1043
1044 /// For targets other than GFX940, do not set GLC for RMW atomic operations as
1045 /// L0/L1 cache is automatically bypassed, and the GLC bit is instead used to
1046 /// indicate if they are return or no-return. Note: there is no L2 cache
1047 /// coherent bypass control at the ISA level.
1048 /// For GFX90A+, RMW atomics implicitly bypass the L1 cache.
1049
1050 if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
1051 switch (Scope) {
1052 case SIAtomicScope::SYSTEM:
1053 // Set SC1 bit to indicate system scope.
1054 Changed |= enableCPolBits(MI, CPol::SC1);
1055 break;
1056 case SIAtomicScope::AGENT:
1057 case SIAtomicScope::WORKGROUP:
1058 case SIAtomicScope::WAVEFRONT:
1059 case SIAtomicScope::SINGLETHREAD:
1060 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1061 // to indicate system or agent scope. The SC0 bit is used to indicate if
1062 // they are return or no-return. Leave SC1 bit unset to indicate agent
1063 // scope.
1064 break;
1065 default:
1066 llvm_unreachable("Unsupported synchronization scope");
1067 }
1068 }
1069
1070 return Changed;
1071}
1072
1073bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1074 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1075 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1076 // Only handle load and store, not atomic read-modify-write instructions. The
1077 // latter use glc to indicate if the atomic returns a result and so must not
1078 // be used for cache control.
1079 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
1080
1081 // Only update load and store, not LLVM IR atomic read-modify-write
1082 // instructions. The latter are always marked as volatile, so handling them
1083 // here would pessimize all atomics; they also do not support the
1084 // nontemporal attribute.
1085 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1086
1087 bool Changed = false;
1088
1089 if (IsVolatile) {
1090 if (ST.hasGFX940Insts()) {
1091 // Set SC bits to indicate system scope.
1092 Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
1093 } else if (Op == SIMemOp::LOAD) {
1094 // Set L1 cache policy to be MISS_EVICT for load instructions
1095 // and MISS_LRU for store instructions.
1096 // Note: there is no L2 cache bypass policy at the ISA level.
1097 Changed |= enableCPolBits(MI, CPol::GLC);
1098 }
1099
1100 // Ensure operation has completed at system scope to cause all volatile
1101 // operations to be visible outside the program in a global order. Do not
1102 // request cross address space as only the global address space can be
1103 // observable outside the program, so no need to cause a waitcnt for LDS
1104 // address space operations.
1105 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1106 Position::AFTER, AtomicOrdering::Unordered,
1107 /*AtomicsOnly=*/false);
1108
1109 return Changed;
1110 }
1111
1112 if (IsNonTemporal) {
1113 if (ST.hasGFX940Insts()) {
1114 Changed |= enableCPolBits(MI, CPol::NT);
1115 } else {
1116 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1117 // for both loads and stores, and the L2 cache policy to STREAM.
1118 Changed |= enableCPolBits(MI, CPol::SLC | CPol::GLC);
1119 }
1120 return Changed;
1121 }
1122
1123 return Changed;
1124}
1125
1126bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1127 SIAtomicScope Scope,
1128 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1129 bool IsCrossAddrSpaceOrdering, Position Pos,
1130 AtomicOrdering Order,
1131 bool AtomicsOnly) const {
1132 bool Changed = false;
1133
1134 MachineBasicBlock &MBB = *MI->getParent();
1135 DebugLoc DL = MI->getDebugLoc();
1136
1137 if (Pos == Position::AFTER)
1138 ++MI;
1139
1140 // GFX90A+
1141 if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) {
1142 // In threadgroup split mode the waves of a work-group can be executing on
1143 // different CUs. Therefore need to wait for global or GDS memory operations
1144 // to complete to ensure they are visible to waves in the other CUs.
1145 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1146 // the same CU, so no need to wait for global memory as all waves in the
1147 // work-group access the same L1, nor wait for GDS as accesses are ordered
1148 // on a CU.
1149 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1150 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1151 (Scope == SIAtomicScope::WORKGROUP)) {
1152 // Same as on targets below GFX90A at AGENT scope.
1153 Scope = SIAtomicScope::AGENT;
1154 }
1155 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1156 // LDS memory operations.
1157 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1158 }
1159
1160 bool VMCnt = false;
1161 bool LGKMCnt = false;
1162
1163 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1164 SIAtomicAddrSpace::NONE) {
1165 switch (Scope) {
1166 case SIAtomicScope::SYSTEM:
1167 case SIAtomicScope::AGENT:
1168 VMCnt |= true;
1169 break;
1170 case SIAtomicScope::WORKGROUP:
1171 case SIAtomicScope::WAVEFRONT:
1172 case SIAtomicScope::SINGLETHREAD:
1173 // The L1 cache keeps all memory operations in order for
1174 // wavefronts in the same work-group.
1175 break;
1176 default:
1177 llvm_unreachable("Unsupported synchronization scope");
1178 }
1179 }
1180
1181 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1182 switch (Scope) {
1183 case SIAtomicScope::SYSTEM:
1184 case SIAtomicScope::AGENT:
1185 case SIAtomicScope::WORKGROUP:
1186 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1187 // not needed as LDS operations for all waves are executed in a total
1188 // global ordering as observed by all waves. Required if also
1189 // synchronizing with global/GDS memory as LDS operations could be
1190 // reordered with respect to later global/GDS memory operations of the
1191 // same wave.
1192 LGKMCnt |= IsCrossAddrSpaceOrdering;
1193 break;
1194 case SIAtomicScope::WAVEFRONT:
1195 case SIAtomicScope::SINGLETHREAD:
1196 // The LDS keeps all memory operations in order for
1197 // the same wavefront.
1198 break;
1199 default:
1200 llvm_unreachable("Unsupported synchronization scope");
1201 }
1202 }
1203
1204 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1205 switch (Scope) {
1206 case SIAtomicScope::SYSTEM:
1207 case SIAtomicScope::AGENT:
1208 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1209 // is not needed as GDS operations for all waves are executed in a total
1210 // global ordering as observed by all waves. Required if also
1211 // synchronizing with global/LDS memory as GDS operations could be
1212 // reordered with respect to later global/LDS memory operations of the
1213 // same wave.
1214 LGKMCnt |= IsCrossAddrSpaceOrdering;
1215 break;
1216 case SIAtomicScope::WORKGROUP:
1217 case SIAtomicScope::WAVEFRONT:
1218 case SIAtomicScope::SINGLETHREAD:
1219 // The GDS keeps all memory operations in order for
1220 // the same work-group.
1221 break;
1222 default:
1223 llvm_unreachable("Unsupported synchronization scope");
1224 }
1225 }
1226
1227 if (VMCnt || LGKMCnt) {
1228 unsigned WaitCntImmediate =
1229 AMDGPU::encodeWaitcnt(IV,
1230 VMCnt ? 0 : getVmcntBitMask(IV),
1231 getExpcntBitMask(IV),
1232 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1233 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1234 .addImm(WaitCntImmediate);
1235 Changed = true;
1236 }
1237
1238 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1239 // at workgroup-scoped release operations that specify the LDS address space.
1240 // SIInsertWaitcnts will later replace this with a vmcnt().
1241 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1242 Scope == SIAtomicScope::WORKGROUP &&
1243 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1244 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1245 Changed = true;
1246 }
1247
1248 if (Pos == Position::AFTER)
1249 --MI;
1250
1251 return Changed;
1252}
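// Illustrative result of the wait insertion above (an assumed example, not
// from this file): for an agent-scope release ordering GLOBAL and LDS with
// cross-address-space ordering, the immediate built here corresponds to
//   s_waitcnt vmcnt(0) lgkmcnt(0)
// emitted via S_WAITCNT_soft so that SIInsertWaitcnts may later merge or
// relax it.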
1253
1254static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST) {
1255 if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
1256 return false;
1257 return !ST.isAmdPalOS() && !ST.isMesa3DOS();
1258}
1259
1260bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1261 SIAtomicScope Scope,
1262 SIAtomicAddrSpace AddrSpace,
1263 Position Pos) const {
1264 if (!InsertCacheInv)
1265 return false;
1266
1267 bool Changed = false;
1268
1269 MachineBasicBlock &MBB = *MI->getParent();
1270 DebugLoc DL = MI->getDebugLoc();
1271
1272 if (Pos == Position::AFTER)
1273 ++MI;
1274
1275 const unsigned InvalidateL1 = canUseBUFFER_WBINVL1_VOL(ST)
1276 ? AMDGPU::BUFFER_WBINVL1_VOL
1277 : AMDGPU::BUFFER_WBINVL1;
1278
1279 if (canAffectGlobalAddrSpace(AddrSpace)) {
1280 switch (Scope) {
1281 case SIAtomicScope::SYSTEM:
1282 if (ST.hasGFX940Insts()) {
1283 // Ensures that following loads will not see stale remote VMEM data or
1284 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
1285 // and CC will never be stale due to the local memory probes.
1286 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1287 // Set SC bits to indicate system scope.
1288 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1289 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1290 // hardware does not reorder memory operations by the same wave with
1291 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1292 // remove any cache lines of earlier writes by the same wave and ensures
1293 // later reads by the same wave will refetch the cache lines.
1294 Changed = true;
1295 break;
1296 }
1297
1298 if (ST.hasGFX90AInsts()) {
1299 // Ensures that following loads will not see stale remote VMEM data or
1300 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
1301 // and CC will never be stale due to the local memory probes.
1302 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1303 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1304 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1305 // hardware does not reorder memory operations by the same wave with
1306 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed
1307 // to remove any cache lines of earlier writes by the same wave and
1308 // ensures later reads by the same wave will refetch the cache lines.
1309 Changed = true;
1310 break;
1311 }
1312 [[fallthrough]];
1313 case SIAtomicScope::AGENT:
1314 if (ST.hasGFX940Insts()) {
1315 // Ensures that following loads will not see stale remote data or local
1316 // MTYPE NC global data. Local MTYPE RW and CC memory will never be
1317 // stale due to the memory probes.
1318 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1319 // Set SC bits to indicate agent scope.
1320 .addImm(AMDGPU::CPol::SC1);
1321 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1322 // does not reorder memory operations with respect to preceding buffer
1323 // invalidate. The invalidate is guaranteed to remove any cache lines of
1324 // earlier writes and ensures later writes will refetch the cache lines.
1325 } else
1326 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1327 Changed = true;
1328 break;
1329 case SIAtomicScope::WORKGROUP:
1330 if (ST.isTgSplitEnabled()) {
1331 if (ST.hasGFX940Insts()) {
1332 // In threadgroup split mode the waves of a work-group can be
1333 // executing on different CUs. Therefore need to invalidate the L1
1334 // which is per CU. Otherwise in non-threadgroup split mode all waves
1335 // of a work-group are on the same CU, and so the L1 does not need to
1336 // be invalidated.
1337
1338 // Ensures L1 is invalidated if in threadgroup split mode. In
1339 // non-threadgroup split mode it is a NOP, but there is no point
1340 // generating it in that case if we know we are not in that mode.
1341 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1342 // Set SC bits to indicate work-group scope.
1343 .addImm(AMDGPU::CPol::SC0);
1344 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1345 // does not reorder memory operations with respect to preceding
1346 // buffer invalidate. The invalidate is guaranteed to remove any cache
1347 // lines of earlier writes and ensures later writes will refetch the
1348 // cache lines.
1349 Changed = true;
1350 } else if (ST.hasGFX90AInsts()) {
1351 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1352 Changed = true;
1353 }
1354 }
1355 break;
1356 case SIAtomicScope::WAVEFRONT:
1357 case SIAtomicScope::SINGLETHREAD:
1358 // For GFX940, we could generate "BUFFER_INV" but it would do nothing as
1359 // there are no caches to invalidate. All other targets have no cache to
1360 // invalidate.
1361 break;
1362 default:
1363 llvm_unreachable("Unsupported synchronization scope");
1364 }
1365 }
1366
1367 /// The scratch address space does not need the global memory cache
1368 /// to be flushed as all memory operations by the same thread are
1369 /// sequentially consistent, and no other thread can access scratch
1370 /// memory.
1371
1372 /// Other address spaces do not have a cache.
1373
1374 if (Pos == Position::AFTER)
1375 --MI;
1376
1377 return Changed;
1378}
1379
1380bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1381 SIAtomicScope Scope,
1382 SIAtomicAddrSpace AddrSpace,
1383 bool IsCrossAddrSpaceOrdering,
1384 Position Pos) const {
1385 bool Changed = false;
1386
1387 if (ST.hasGFX90AInsts()) {
1388 MachineBasicBlock &MBB = *MI->getParent();
1389 const DebugLoc &DL = MI->getDebugLoc();
1390
1391 if (Pos == Position::AFTER)
1392 ++MI;
1393
1394 if (canAffectGlobalAddrSpace(AddrSpace)) {
1395 switch (Scope) {
1396 case SIAtomicScope::SYSTEM:
1397 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1398 // hardware does not reorder memory operations by the same wave with
1399 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1400 // to initiate writeback of any dirty cache lines of earlier writes by
1401 // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1402 // writeback has completed.
1403 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1404 // Set SC bits to indicate system scope.
1405 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1406 Changed = true;
1407 break;
1408 case SIAtomicScope::AGENT:
1409 if (ST.hasGFX940Insts()) {
1410 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1411 // Set SC bits to indicate agent scope.
1412 .addImm(AMDGPU::CPol::SC1);
1413
1414 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1415 // SIAtomicScope::AGENT, the following insertWait will generate the
1416 // required "S_WAITCNT vmcnt(0)".
1417 Changed = true;
1418 }
1419 break;
1420 case SIAtomicScope::WORKGROUP:
1421 case SIAtomicScope::WAVEFRONT:
1422 case SIAtomicScope::SINGLETHREAD:
1423 // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it
1424 // would writeback, and would require an otherwise unnecessary
1425 // "S_WAITCNT vmcnt(0)".
1426 break;
1427 default:
1428 llvm_unreachable("Unsupported synchronization scope");
1429 }
1430 }
1431
1432 if (Pos == Position::AFTER)
1433 --MI;
1434 }
1435
1436 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1437 // S_WAITCNT needed.
1438 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1439 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
1440 /*AtomicsOnly=*/false);
1441
1442 return Changed;
1443}
1444
1445bool SIGfx10CacheControl::enableLoadCacheBypass(
1446 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1447 SIAtomicAddrSpace AddrSpace) const {
1448 assert(MI->mayLoad() && !MI->mayStore());
1449 bool Changed = false;
1450
1451 if (canAffectGlobalAddrSpace(AddrSpace)) {
1452 switch (Scope) {
1453 case SIAtomicScope::SYSTEM:
1454 case SIAtomicScope::AGENT:
1455 // Set the L0 and L1 cache policies to MISS_EVICT.
1456 // Note: there is no L2 cache coherent bypass control at the ISA level.
1457 // For GFX10, set GLC+DLC, for GFX11, only set GLC.
1458 Changed |=
1459 enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0));
1460 break;
1461 case SIAtomicScope::WORKGROUP:
1462 // In WGP mode the waves of a work-group can be executing on either CU of
1463 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1464 // CU mode all waves of a work-group are on the same CU, and so the L0
1465 // does not need to be bypassed.
1466 if (!ST.isCuModeEnabled())
1467 Changed |= enableCPolBits(MI, CPol::GLC);
1468 break;
1469 case SIAtomicScope::WAVEFRONT:
1470 case SIAtomicScope::SINGLETHREAD:
1471 // No cache to bypass.
1472 break;
1473 default:
1474 llvm_unreachable("Unsupported synchronization scope");
1475 }
1476 }
1477
1478 /// The scratch address space does not need the global memory caches
1479 /// to be bypassed as all memory operations by the same thread are
1480 /// sequentially consistent, and no other thread can access scratch
1481 /// memory.
1482
1483 /// Other address spaces do not have a cache.
1484
1485 return Changed;
1486}
1487
1488bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1489 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1490 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1491
1492 // Only handle load and store, not atomic read-modify-write instructions. The
1493 // latter use glc to indicate if the atomic returns a result and so must not
1494 // be used for cache control.
1495 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
1496
1497 // Only update load and store, not LLVM IR atomic read-modify-write
1498 // instructions. The latter are always marked as volatile, so handling them
1499 // here would pessimize all atomics; they also do not support the
1500 // nontemporal attribute.
1501 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1502
1503 bool Changed = false;
1504
1505 if (IsVolatile) {
1506 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1507 // and MISS_LRU for store instructions.
1508 // Note: there is no L2 cache coherent bypass control at the ISA level.
1509 if (Op == SIMemOp::LOAD) {
1510 Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
1511 }
1512
1513 // GFX11: Set MALL NOALLOC for both load and store instructions.
1514 if (AMDGPU::isGFX11(ST))
1515 Changed |= enableCPolBits(MI, CPol::DLC);
1516
1517 // Ensure operation has completed at system scope to cause all volatile
1518 // operations to be visible outside the program in a global order. Do not
1519 // request cross address space as only the global address space can be
1520 // observable outside the program, so no need to cause a waitcnt for LDS
1521 // address space operations.
1522 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1523 Position::AFTER, AtomicOrdering::Unordered,
1524 /*AtomicsOnly=*/false);
1525 return Changed;
1526 }
1527
1528 if (IsNonTemporal) {
1529 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1530 // and L2 cache policy to STREAM.
1531 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1532 // to MISS_EVICT and the L2 cache policy to STREAM.
1533 if (Op == SIMemOp::STORE)
1534 Changed |= enableCPolBits(MI, CPol::GLC);
1535 Changed |= enableCPolBits(MI, CPol::SLC);
1536
1537 // GFX11: Set MALL NOALLOC for both load and store instructions.
1538 if (AMDGPU::isGFX11(ST))
1539 Changed |= enableCPolBits(MI, CPol::DLC);
1540
1541 return Changed;
1542 }
1543
1544 return Changed;
1545}
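The volatile / non-temporal handling above reduces to a small policy table. The sketch below restates it as a standalone function under assumed bit encodings (the GLC/SLC/DLC values here are placeholders, not the real CPol layout); it is illustrative only.

#include <cstdint>

namespace sketch {
enum CPolBit : uint32_t { GLC = 1u << 0, SLC = 1u << 1, DLC = 1u << 2 }; // assumed
enum class MemOp { Load, Store };

// Cache-policy hint bits for a plain (non-atomic) load or store, mirroring the
// volatile / non-temporal handling above. Volatile takes priority.
uint32_t hintBits(MemOp Op, bool IsVolatile, bool IsNonTemporal, bool IsGfx11) {
  uint32_t Bits = 0;
  if (IsVolatile) {
    if (Op == MemOp::Load)
      Bits |= GLC | DLC;  // L0/L1 MISS_EVICT for volatile loads
    if (IsGfx11)
      Bits |= DLC;        // GFX11: MALL NOALLOC for loads and stores
    return Bits;
  }
  if (IsNonTemporal) {
    if (Op == MemOp::Store)
      Bits |= GLC;        // stores: L0/L1 MISS_EVICT
    Bits |= SLC;          // both: L2 STREAM policy
    if (IsGfx11)
      Bits |= DLC;
  }
  return Bits;
}
} // namespace sketch

int main() {
  return sketch::hintBits(sketch::MemOp::Store, false, true, true) != 0 ? 0 : 1;
}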
1546
1547bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1548 SIAtomicScope Scope,
1549 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1550 bool IsCrossAddrSpaceOrdering,
1551 Position Pos, AtomicOrdering Order,
1552 bool AtomicsOnly) const {
1553 bool Changed = false;
1554
1555 MachineBasicBlock &MBB = *MI->getParent();
1556 DebugLoc DL = MI->getDebugLoc();
1557
1558 if (Pos == Position::AFTER)
1559 ++MI;
1560
1561 bool VMCnt = false;
1562 bool VSCnt = false;
1563 bool LGKMCnt = false;
1564
1565 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1566 SIAtomicAddrSpace::NONE) {
1567 switch (Scope) {
1568 case SIAtomicScope::SYSTEM:
1569 case SIAtomicScope::AGENT:
1570 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1571 VMCnt |= true;
1572 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1573 VSCnt |= true;
1574 break;
1575 case SIAtomicScope::WORKGROUP:
1576 // In WGP mode the waves of a work-group can be executing on either CU of
1577 // the WGP. Therefore need to wait for operations to complete to ensure
1578 // they are visible to waves in the other CU as the L0 is per CU.
1579 // Otherwise, in CU mode, all waves of a work-group are on the same CU,
1580 // which shares the same L0. Note that we still need to wait when
1581 // performing a release in this mode to respect the transitivity of
1582 // happens-before, e.g. other waves of the workgroup must be able to
1583 // release the memory from another wave at a wider scope.
1584 if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) {
1585 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1586 VMCnt |= true;
1587 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1588 VSCnt |= true;
1589 }
1590 break;
1591 case SIAtomicScope::WAVEFRONT:
1592 case SIAtomicScope::SINGLETHREAD:
1593 // The L0 cache keeps all memory operations in order for
1594 // work-items in the same wavefront.
1595 break;
1596 default:
1597 llvm_unreachable("Unsupported synchronization scope");
1598 }
1599 }
1600
1601 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1602 switch (Scope) {
1603 case SIAtomicScope::SYSTEM:
1604 case SIAtomicScope::AGENT:
1605 case SIAtomicScope::WORKGROUP:
1606 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1607 // not needed as LDS operations for all waves are executed in a total
1608 // global ordering as observed by all waves. Required if also
1609 // synchronizing with global/GDS memory as LDS operations could be
1610 // reordered with respect to later global/GDS memory operations of the
1611 // same wave.
1612 LGKMCnt |= IsCrossAddrSpaceOrdering;
1613 break;
1614 case SIAtomicScope::WAVEFRONT:
1615 case SIAtomicScope::SINGLETHREAD:
1616 // The LDS keeps all memory operations in order for
1617 // the same wavefront.
1618 break;
1619 default:
1620 llvm_unreachable("Unsupported synchronization scope");
1621 }
1622 }
1623
1624 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1625 switch (Scope) {
1626 case SIAtomicScope::SYSTEM:
1627 case SIAtomicScope::AGENT:
1628 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1629 // is not needed as GDS operations for all waves are executed in a total
1630 // global ordering as observed by all waves. Required if also
1631 // synchronizing with global/LDS memory as GDS operations could be
1632 // reordered with respect to later global/LDS memory operations of the
1633 // same wave.
1634 LGKMCnt |= IsCrossAddrSpaceOrdering;
1635 break;
1636 case SIAtomicScope::WORKGROUP:
1637 case SIAtomicScope::WAVEFRONT:
1638 case SIAtomicScope::SINGLETHREAD:
1639 // The GDS keeps all memory operations in order for
1640 // the same work-group.
1641 break;
1642 default:
1643 llvm_unreachable("Unsupported synchronization scope");
1644 }
1645 }
1646
1647 if (VMCnt || LGKMCnt) {
1648 unsigned WaitCntImmediate =
1649 AMDGPU::encodeWaitcnt(IV,
1650 VMCnt ? 0 : getVmcntBitMask(IV),
1651 getExpcntBitMask(IV),
1652 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1653 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1654 .addImm(WaitCntImmediate);
1655 Changed = true;
1656 }
1657
1658 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1659 // at workgroup-scoped release operations that specify the LDS address space.
1660 // SIInsertWaitcnts will later replace this with a vmcnt().
1661 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1662 Scope == SIAtomicScope::WORKGROUP &&
1663 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1664 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1665 Changed = true;
1666 }
1667
1668 if (VSCnt) {
1669 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
1670 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1671 .addImm(0);
1672 Changed = true;
1673 }
1674
1675 if (Pos == Position::AFTER)
1676 --MI;
1677
1678 return Changed;
1679}
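The S_WAITCNT emission above packs independent counters into one immediate, leaving a field at its all-ones mask when no wait is required on that counter. A self-contained sketch of that packing follows; the field widths and shifts are assumptions of the sketch, whereas the real code queries the subtarget's IsaVersion through AMDGPU::encodeWaitcnt and the get*BitMask helpers.

#include <cassert>
#include <cstdint>

namespace sketch {
// Assumed field layout; the real layout comes from the subtarget's IsaVersion.
constexpr unsigned VmcntWidth = 6, LgkmcntWidth = 6;
constexpr unsigned VmcntShift = 0, LgkmcntShift = 8;
constexpr uint32_t vmcntMask() { return (1u << VmcntWidth) - 1; }
constexpr uint32_t lgkmcntMask() { return (1u << LgkmcntWidth) - 1; }

// WaitVM / WaitLGKM mirror the VMCnt / LGKMCnt booleans above: true means
// "wait until this counter reaches 0", false leaves the field at its all-ones
// mask, i.e. "do not wait on this counter".
uint32_t encodeWait(bool WaitVM, bool WaitLGKM) {
  const uint32_t Vm = WaitVM ? 0 : vmcntMask();
  const uint32_t Lgkm = WaitLGKM ? 0 : lgkmcntMask();
  return (Vm << VmcntShift) | (Lgkm << LgkmcntShift);
}
} // namespace sketch

int main() {
  assert(sketch::encodeWait(true, true) == 0);   // wait on everything
  assert(sketch::encodeWait(false, false) != 0); // wait on nothing
  return 0;
}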
1680
1681bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1682 SIAtomicScope Scope,
1683 SIAtomicAddrSpace AddrSpace,
1684 Position Pos) const {
1685 if (!InsertCacheInv)
1686 return false;
1687
1688 bool Changed = false;
1689
1690 MachineBasicBlock &MBB = *MI->getParent();
1691 DebugLoc DL = MI->getDebugLoc();
1692
1693 if (Pos == Position::AFTER)
1694 ++MI;
1695
1696 if (canAffectGlobalAddrSpace(AddrSpace)) {
1697 switch (Scope) {
1698 case SIAtomicScope::SYSTEM:
1699 case SIAtomicScope::AGENT:
1700 // The order of invalidates matters here. We must invalidate "outer in"
1701 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
1702 // invalidated.
1703 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1704 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1705 Changed = true;
1706 break;
1707 case SIAtomicScope::WORKGROUP:
1708 // In WGP mode the waves of a work-group can be executing on either CU of
1709 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1710 // in CU mode all waves of a work-group are on the same CU, and so the
1711 // L0 does not need to be invalidated.
1712 if (!ST.isCuModeEnabled()) {
1713 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1714 Changed = true;
1715 }
1716 break;
1717 case SIAtomicScope::WAVEFRONT:
1718 case SIAtomicScope::SINGLETHREAD:
1719 // No cache to invalidate.
1720 break;
1721 default:
1722 llvm_unreachable("Unsupported synchronization scope");
1723 }
1724 }
1725
1726 /// The scratch address space does not need the global memory cache
1727 /// to be flushed as all memory operations by the same thread are
1728 /// sequentially consistent, and no other thread can access scratch
1729 /// memory.
1730
1731 /// Other address spaces do not have a cache.
1732
1733 if (Pos == Position::AFTER)
1734 --MI;
1735
1736 return Changed;
1737}
1738
1739bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
1740 AMDGPU::CPol::CPol Value) const {
1741 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
1742 if (!CPol)
1743 return false;
1744
1745 uint64_t NewTH = Value & AMDGPU::CPol::TH;
1746 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
1747 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
1748 return true;
1749 }
1750
1751 return false;
1752}
1753
1754bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
1755 AMDGPU::CPol::CPol Value) const {
1756 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
1757 if (!CPol)
1758 return false;
1759
1760 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
1761 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
1762 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
1763 return true;
1764 }
1765
1766 return false;
1767}
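setTH and setScope above share one read-modify-write pattern on the cpol immediate: mask out a field, OR in the new value, and report whether anything changed. A minimal sketch of that pattern follows, with made-up field masks standing in for AMDGPU::CPol::TH and AMDGPU::CPol::SCOPE.

#include <cstdint>

namespace sketch {
// Assumed field masks; the real ones are AMDGPU::CPol::TH and AMDGPU::CPol::SCOPE.
constexpr uint64_t THMask = 0x7u;
constexpr uint64_t ScopeMask = 0x3u << 3;

// Clear one bitfield of the cpol immediate, OR in the new value, and report
// whether anything actually changed (the same shape as setTH()/setScope()).
bool setField(uint64_t &CPol, uint64_t FieldMask, uint64_t Value) {
  const uint64_t New = Value & FieldMask;
  if ((CPol & FieldMask) == New)
    return false;
  CPol = (CPol & ~FieldMask) | New;
  return true;
}
} // namespace sketch

int main() {
  uint64_t CPol = 0;
  bool Changed = sketch::setField(CPol, sketch::ScopeMask, 0x2u << 3);
  Changed |= sketch::setField(CPol, sketch::THMask, 0x3u);
  return (Changed && CPol == ((0x2u << 3) | 0x3u)) ? 0 : 1;
}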
1768
1769bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
1770 const MachineBasicBlock::iterator MI) const {
1771 // TODO: implement flag for frontend to give us a hint not to insert waits.
1772
1773 MachineBasicBlock &MBB = *MI->getParent();
1774 const DebugLoc &DL = MI->getDebugLoc();
1775
1776 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
1777 if (ST.hasImageInsts()) {
1778 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
1779 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
1780 }
1781 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
1782 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
1783
1784 return true;
1785}
1786
1787bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1788 SIAtomicScope Scope,
1789 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1790 bool IsCrossAddrSpaceOrdering,
1791 Position Pos, AtomicOrdering Order,
1792 bool AtomicsOnly) const {
1793 bool Changed = false;
1794
1795 MachineBasicBlock &MBB = *MI->getParent();
1796 DebugLoc DL = MI->getDebugLoc();
1797
1798 bool LOADCnt = false;
1799 bool DSCnt = false;
1800 bool STORECnt = false;
1801
1802 if (Pos == Position::AFTER)
1803 ++MI;
1804
1805 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1806 SIAtomicAddrSpace::NONE) {
1807 switch (Scope) {
1808 case SIAtomicScope::SYSTEM:
1809 case SIAtomicScope::AGENT:
1810 case SIAtomicScope::CLUSTER:
1811 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1812 LOADCnt |= true;
1813 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1814 STORECnt |= true;
1815 break;
1816 case SIAtomicScope::WORKGROUP:
1817 // GFX12.0:
1818 // In WGP mode the waves of a work-group can be executing on either CU
1819 // of the WGP. Therefore need to wait for operations to complete to
1820 // ensure they are visible to waves in the other CU as the L0 is per CU.
1821 //
1822 // Otherwise, in CU mode, all waves of a work-group are on the same CU,
1823 // which shares the same L0. Note that we still need to wait when
1824 // performing a release in this mode to respect the transitivity of
1825 // happens-before, e.g. other waves of the workgroup must be able to
1826 // release the memory from another wave at a wider scope.
1827 //
1828 // GFX12.5:
1829 // CU$ has two ports. To ensure operations are visible at the workgroup
1830 // level, we need to ensure all operations in this port have completed
1831 // so the other SIMDs in the WG can see them. There is no ordering
1832 // guarantee between the ports.
1833 if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() ||
1834 isReleaseOrStronger(Order)) {
1835 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1836 LOADCnt |= true;
1837 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1838 STORECnt |= true;
1839 }
1840 break;
1841 case SIAtomicScope::WAVEFRONT:
1842 case SIAtomicScope::SINGLETHREAD:
1843 // The L0 cache keeps all memory operations in order for
1844 // work-items in the same wavefront.
1845 break;
1846 default:
1847 llvm_unreachable("Unsupported synchronization scope");
1848 }
1849 }
1850
1851 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1852 switch (Scope) {
1853 case SIAtomicScope::SYSTEM:
1854 case SIAtomicScope::AGENT:
1855 case SIAtomicScope::CLUSTER:
1856 case SIAtomicScope::WORKGROUP:
1857 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1858 // not needed as LDS operations for all waves are executed in a total
1859 // global ordering as observed by all waves. Required if also
1860 // synchronizing with global/GDS memory as LDS operations could be
1861 // reordered with respect to later global/GDS memory operations of the
1862 // same wave.
1863 DSCnt |= IsCrossAddrSpaceOrdering;
1864 break;
1865 case SIAtomicScope::WAVEFRONT:
1866 case SIAtomicScope::SINGLETHREAD:
1867 // The LDS keeps all memory operations in order for
1868 // the same wavefront.
1869 break;
1870 default:
1871 llvm_unreachable("Unsupported synchronization scope");
1872 }
1873 }
1874
1875 if (LOADCnt) {
1876 // Acquire sequences only need to wait on the previous atomic operation.
1877 // e.g. a typical sequence looks like
1878 // atomic load
1879 // (wait)
1880 // global_inv
1881 //
1882 // We do not have BVH or SAMPLE atomics, so the atomic load is always going
1883 // to be tracked using loadcnt.
1884 //
1885 // This also applies to fences. Fences cannot pair with an instruction
1886 // tracked with bvh/samplecnt as we don't have any atomics that do that.
1887 if (!AtomicsOnly && ST.hasImageInsts()) {
1888 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
1889 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
1890 }
1891 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
1892 Changed = true;
1893 }
1894
1895 if (STORECnt) {
1896 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
1897 Changed = true;
1898 }
1899
1900 if (DSCnt) {
1901 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
1902 Changed = true;
1903 }
1904
1905 if (Pos == Position::AFTER)
1906 --MI;
1907
1908 return Changed;
1909}
1910
1911bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1912 SIAtomicScope Scope,
1913 SIAtomicAddrSpace AddrSpace,
1914 Position Pos) const {
1915 if (!InsertCacheInv)
1916 return false;
1917
1918 MachineBasicBlock &MBB = *MI->getParent();
1919 DebugLoc DL = MI->getDebugLoc();
1920
1921 /// The scratch address space does not need the global memory cache
1922 /// to be flushed as all memory operations by the same thread are
1923 /// sequentially consistent, and no other thread can access scratch
1924 /// memory.
1925
1926 /// Other address spaces do not have a cache.
1927 if (!canAffectGlobalAddrSpace(AddrSpace))
1928 return false;
1929 
1930 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
1931 switch (Scope) {
1932 case SIAtomicScope::SYSTEM:
1933 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
1934 break;
1935 case SIAtomicScope::AGENT:
1936 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
1937 break;
1938 case SIAtomicScope::CLUSTER:
1939 ScopeImm = AMDGPU::CPol::SCOPE_SE;
1940 break;
1941 case SIAtomicScope::WORKGROUP:
1942 // GFX12.0:
1943 // In WGP mode the waves of a work-group can be executing on either CU of
1944 // the WGP. Therefore we need to invalidate the L0 which is per CU.
1945 // Otherwise in CU mode all waves of a work-group are on the same CU, and
1946 // so the L0 does not need to be invalidated.
1947 //
1948 // GFX12.5 has a shared WGP$, so no invalidates are required.
1949 if (ST.isCuModeEnabled())
1950 return false;
1951
1952 ScopeImm = AMDGPU::CPol::SCOPE_SE;
1953 break;
1954 case SIAtomicScope::WAVEFRONT:
1955 case SIAtomicScope::SINGLETHREAD:
1956 // No cache to invalidate.
1957 return false;
1958 default:
1959 llvm_unreachable("Unsupported synchronization scope");
1960 }
1961
1962 if (Pos == Position::AFTER)
1963 ++MI;
1964
1965 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
1966
1967 if (Pos == Position::AFTER)
1968 --MI;
1969
1970 return true;
1971}
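The acquire path above boils down to choosing a GLOBAL_INV scope (or emitting nothing) from the atomic scope and the CU/WGP mode. The sketch below captures that decision as a standalone function; the enum names are stand-ins for the CPol::SCOPE_* immediates and are assumptions of this sketch.

#include <optional>

namespace sketch {
enum class Scope { SingleThread, Wavefront, Workgroup, Cluster, Agent, System };
enum class InvScope { SE, Dev, Sys }; // stands in for CPol::SCOPE_SE/_DEV/_SYS

// Which GLOBAL_INV scope an acquire at the given atomic scope needs, or
// nullopt when no invalidate has to be emitted (mirrors the switch above).
std::optional<InvScope> acquireInvScope(Scope S, bool CuMode) {
  switch (S) {
  case Scope::System:
    return InvScope::Sys;
  case Scope::Agent:
    return InvScope::Dev;
  case Scope::Cluster:
    return InvScope::SE;
  case Scope::Workgroup:
    // GFX12.0 WGP mode: the per-CU L0 must be invalidated; CU mode needs nothing.
    if (CuMode)
      return std::nullopt;
    return InvScope::SE;
  case Scope::Wavefront:
  case Scope::SingleThread:
    return std::nullopt; // no cache to invalidate
  }
  return std::nullopt;
}
} // namespace sketch

int main() {
  return sketch::acquireInvScope(sketch::Scope::Agent, false).has_value() ? 0 : 1;
}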
1972
1973bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1974 SIAtomicScope Scope,
1975 SIAtomicAddrSpace AddrSpace,
1976 bool IsCrossAddrSpaceOrdering,
1977 Position Pos) const {
1978 bool Changed = false;
1979
1980 MachineBasicBlock &MBB = *MI->getParent();
1981 DebugLoc DL = MI->getDebugLoc();
1982
1983 // The scratch address space does not need the global memory cache
1984 // writeback as all memory operations by the same thread are
1985 // sequentially consistent, and no other thread can access scratch
1986 // memory.
1987 if (canAffectGlobalAddrSpace(AddrSpace)) {
1988 if (Pos == Position::AFTER)
1989 ++MI;
1990
1991 // A global_wb is only necessary at system scope for GFX12.0; it is also
1992 // necessary at device scope for GFX12.5, as stores cannot report
1993 // completion earlier than L2.
1994 //
1995 // Emitting it for lower scopes is a slow no-op, so we omit it
1996 // for performance.
1997 switch (Scope) {
1998 case SIAtomicScope::SYSTEM:
1999 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2000 .addImm(AMDGPU::CPol::SCOPE_SYS);
2001 Changed = true;
2002 break;
2003 case SIAtomicScope::AGENT:
2004 // GFX12.5 may have >1 L2 per device so we must emit a device scope WB.
2005 if (ST.hasGFX1250Insts()) {
2006 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2007 .addImm(AMDGPU::CPol::SCOPE_DEV);
2008 Changed = true;
2009 }
2010 break;
2011 case SIAtomicScope::CLUSTER:
2012 case SIAtomicScope::WORKGROUP:
2013 // No WB necessary, but we still have to wait.
2014 case SIAtomicScope::WAVEFRONT:
2015 case SIAtomicScope::SINGLETHREAD:
2016 // No WB or wait necessary here, but insertWait takes care of that.
2017 break;
2018 default:
2019 llvm_unreachable("Unsupported synchronization scope");
2020 }
2021
2022 if (Pos == Position::AFTER)
2023 --MI;
2024 }
2025
2026 // We always have to wait for previous memory operations (load/store) to
2027 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2028 // we of course need to wait for that as well.
2029 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2030 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
2031 /*AtomicsOnly=*/false);
2032
2033 return Changed;
2034}
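The release path above decides whether a GLOBAL_WB is worthwhile before the mandatory wait. A small sketch of that decision follows; the HasGfx1250 parameter mirrors the ST.hasGFX1250Insts() check and the enum values are assumptions of this sketch.

namespace sketch {
enum class Scope { SingleThread, Wavefront, Workgroup, Cluster, Agent, System };

// Whether insertRelease-style code should emit a GLOBAL_WB for this scope:
// always at system scope, and at agent scope only on GFX12.5-style targets
// where more than one L2 may exist per device. Lower scopes skip the WB
// because it would be a slow no-op; the wait is inserted regardless.
bool needsGlobalWB(Scope S, bool HasGfx1250) {
  switch (S) {
  case Scope::System:
    return true;
  case Scope::Agent:
    return HasGfx1250;
  default:
    return false;
  }
}
} // namespace sketch

int main() { return sketch::needsGlobalWB(sketch::Scope::Agent, true) ? 0 : 1; }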
2035
2036bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2037 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2038 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2039
2040 // Only handle load and store, not atomic read-modify-write instructions.
2041 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
2042
2043 // Only update load and store, not LLVM IR atomic read-modify-write
2044 // instructions. The latter are always marked as volatile, so handling them
2045 // here would pessimize all atomics; they also do not support the
2046 // nontemporal attribute.
2047 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2048
2049 bool Changed = false;
2050
2051 if (IsLastUse) {
2052 // Set last-use hint.
2053 Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2054 } else if (IsNonTemporal) {
2055 // Set non-temporal hint for all cache levels.
2056 Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2057 }
2058
2059 if (IsVolatile) {
2060 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2061
2062 // Ensure operation has completed at system scope to cause all volatile
2063 // operations to be visible outside the program in a global order. Do not
2064 // request cross address space as only the global address space can be
2065 // observable outside the program, so no need to cause a waitcnt for LDS
2066 // address space operations.
2067 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2068 Position::AFTER, AtomicOrdering::Unordered,
2069 /*AtomicsOnly=*/false);
2070 }
2071
2072 return Changed;
2073}
2074
2075bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2076 assert(MI.mayStore() && "Not a Store inst");
2077 const bool IsRMW = (MI.mayLoad() && MI.mayStore());
2078 bool Changed = false;
2079
2080 // GFX12.5 only: an xcnt wait is needed before flat and global atomic
2081 // stores/RMWs.
2082 if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
2083 MachineBasicBlock &MBB = *MI.getParent();
2084 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2085 Changed = true;
2086 }
2087
2088 // Remaining fixes do not apply to RMWs.
2089 if (IsRMW)
2090 return Changed;
2091
2092 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2093 if (!CPol) // Some vmem operations do not have a scope and are not concerned.
2094 return Changed;
2095 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2096
2097 // GFX12.0 only: Extra waits needed before system scope stores.
2098 if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic &&
2099 Scope == CPol::SCOPE_SYS)
2100 Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
2101
2102 return Changed;
2103}
2104
2105bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
2106 if (!ST.hasGFX1250Insts())
2107 return false;
2108
2109 // Cooperative atomics need to be SCOPE_DEV or higher.
2110 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2111 assert(CPol && "No CPol operand?");
2112 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2113 if (Scope < CPol::SCOPE_DEV)
2114 return setScope(MI, CPol::SCOPE_DEV);
2115 return false;
2116}
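handleCooperativeAtomic widens a too-narrow scope field up to device scope. A tiny standalone sketch of that promotion is below; the numeric scope values are assumed for illustration, though they are ordered the same way as the CPol::SCOPE_* immediates.

#include <cstdint>

namespace sketch {
// Assumed numeric ordering; the real values are the CPol::SCOPE_* immediates,
// which are ordered the same way (CU < SE < DEV < SYS).
enum ScopeImm : uint32_t { SCOPE_CU = 0, SCOPE_SE = 1, SCOPE_DEV = 2, SCOPE_SYS = 3 };

// Widen the instruction's scope field to at least device scope, as
// handleCooperativeAtomic() does; returns true when a change was needed.
bool promoteToAtLeastDevice(uint32_t &ScopeField) {
  if (ScopeField >= SCOPE_DEV)
    return false;
  ScopeField = SCOPE_DEV;
  return true;
}
} // namespace sketch

int main() {
  uint32_t S = sketch::SCOPE_SE;
  return (sketch::promoteToAtLeastDevice(S) && S == sketch::SCOPE_DEV) ? 0 : 1;
}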
2117
2118bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2119 SIAtomicScope Scope,
2120 SIAtomicAddrSpace AddrSpace) const {
2121 bool Changed = false;
2122
2123 if (canAffectGlobalAddrSpace(AddrSpace)) {
2124 switch (Scope) {
2125 case SIAtomicScope::SYSTEM:
2126 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2127 break;
2128 case SIAtomicScope::AGENT:
2129 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2130 break;
2131 case SIAtomicScope::CLUSTER:
2132 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2133 break;
2134 case SIAtomicScope::WORKGROUP:
2135 // In workgroup mode, SCOPE_SE is needed as waves can execute on
2136 // different CUs that access different L0s.
2137 if (!ST.isCuModeEnabled())
2138 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2139 break;
2140 case SIAtomicScope::WAVEFRONT:
2141 case SIAtomicScope::SINGLETHREAD:
2142 // No cache to bypass.
2143 break;
2144 default:
2145 llvm_unreachable("Unsupported synchronization scope");
2146 }
2147 }
2148
2149 // The scratch address space does not need the global memory caches
2150 // to be bypassed as all memory operations by the same thread are
2151 // sequentially consistent, and no other thread can access scratch
2152 // memory.
2153
2154 // Other address spaces do not have a cache.
2155
2156 return Changed;
2157}
2158
2159bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2160 if (AtomicPseudoMIs.empty())
2161 return false;
2162
2163 for (auto &MI : AtomicPseudoMIs)
2164 MI->eraseFromParent();
2165
2166 AtomicPseudoMIs.clear();
2167 return true;
2168}
2169
2170 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2171 MachineBasicBlock::iterator &MI) {
2172 assert(MI->mayLoad() && !MI->mayStore());
2173
2174 bool Changed = false;
2175
2176 if (MOI.isAtomic()) {
2177 const AtomicOrdering Order = MOI.getOrdering();
2178 if (Order == AtomicOrdering::Monotonic ||
2179 Order == AtomicOrdering::Acquire ||
2180 Order == AtomicOrdering::SequentiallyConsistent) {
2181 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2182 MOI.getOrderingAddrSpace());
2183 }
2184
2185 // Handle cooperative atomics after cache bypass step, as it may override
2186 // the scope of the instruction to a greater scope.
2187 if (MOI.isCooperative())
2188 Changed |= CC->handleCooperativeAtomic(*MI);
2189
2190 if (Order == AtomicOrdering::SequentiallyConsistent)
2191 Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2192 SIMemOp::LOAD | SIMemOp::STORE,
2193 MOI.getIsCrossAddressSpaceOrdering(),
2194 Position::BEFORE, Order, /*AtomicsOnly=*/false);
2195
2196 if (Order == AtomicOrdering::Acquire ||
2197 Order == AtomicOrdering::SequentiallyConsistent) {
2198 // The wait below only needs to wait on the prior atomic.
2199 Changed |=
2200 CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2201 SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(),
2202 Position::AFTER, Order, /*AtomicsOnly=*/true);
2203 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2204 MOI.getOrderingAddrSpace(),
2205 Position::AFTER);
2206 }
2207
2208 return Changed;
2209 }
2210
2211 // Atomic instructions already bypass caches to the scope specified by the
2212 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2213 // instructions need additional treatment.
2214 Changed |= CC->enableVolatileAndOrNonTemporal(
2215 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2216 MOI.isNonTemporal(), MOI.isLastUse());
2217
2218 return Changed;
2219}
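expandLoad applies different combinations of cache bypass, waits, and invalidates depending on the atomic ordering. The sketch below restates those branches as a standalone decision function; the step names are labels invented for this sketch, while the real work is done by the SICacheControl callbacks.

#include <cstdio>

namespace sketch {
enum class Ordering { Monotonic, Acquire, SequentiallyConsistent };

struct LoadSteps {
  bool BypassCaches;    // enableLoadCacheBypass: monotonic and stronger
  bool WaitBefore;      // insertWait before: seq_cst only
  bool WaitAndInvAfter; // insertWait + insertAcquire after: acquire, seq_cst
};

// Which legalization steps an atomic load with the given ordering gets,
// following the branches in expandLoad() above.
LoadSteps stepsFor(Ordering O) {
  LoadSteps S{};
  S.BypassCaches = true;
  S.WaitBefore = (O == Ordering::SequentiallyConsistent);
  S.WaitAndInvAfter = (O == Ordering::Acquire ||
                       O == Ordering::SequentiallyConsistent);
  return S;
}
} // namespace sketch

int main() {
  const sketch::LoadSteps S = sketch::stepsFor(sketch::Ordering::Acquire);
  std::printf("acquire: wait-before=%d wait+inv-after=%d\n", S.WaitBefore,
              S.WaitAndInvAfter);
  return 0;
}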
2220
2221 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2222 MachineBasicBlock::iterator &MI) {
2223 assert(!MI->mayLoad() && MI->mayStore());
2224
2225 bool Changed = false;
2226 // FIXME: Necessary hack because iterator can lose track of the store.
2227 MachineInstr &StoreMI = *MI;
2228
2229 if (MOI.isAtomic()) {
2230 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2231 MOI.getOrdering() == AtomicOrdering::Release ||
2232 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2233 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2234 MOI.getOrderingAddrSpace());
2235 }
2236
2237 // Handle cooperative atomics after cache bypass step, as it may override
2238 // the scope of the instruction to a greater scope.
2239 if (MOI.isCooperative())
2240 Changed |= CC->handleCooperativeAtomic(*MI);
2241
2242 if (MOI.getOrdering() == AtomicOrdering::Release ||
2243 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2244 Changed |= CC->insertRelease(MI, MOI.getScope(),
2245 MOI.getOrderingAddrSpace(),
2246 MOI.getIsCrossAddressSpaceOrdering(),
2247 Position::BEFORE);
2248
2249 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
2250 return Changed;
2251 }
2252
2253 // Atomic instructions already bypass caches to the scope specified by the
2254 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2255 // need additional treatment.
2256 Changed |= CC->enableVolatileAndOrNonTemporal(
2257 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2258 MOI.isNonTemporal());
2259
2260 // GFX12 specific: scope (the desired coherence domain in the cache hierarchy)
2261 // is an instruction field; do not confuse it with the atomic scope.
2262 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
2263 return Changed;
2264}
2265
2266 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2267 MachineBasicBlock::iterator &MI) {
2268 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2269
2270 AtomicPseudoMIs.push_back(MI);
2271 bool Changed = false;
2272
2273 const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
2274
2275 if (MOI.isAtomic()) {
2276 const AtomicOrdering Order = MOI.getOrdering();
2277 if (Order == AtomicOrdering::Acquire) {
2278 // Acquire fences only need to wait on the previous atomic they pair with.
2279 Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
2280 SIMemOp::LOAD | SIMemOp::STORE,
2281 MOI.getIsCrossAddressSpaceOrdering(),
2282 Position::BEFORE, Order, /*AtomicsOnly=*/true);
2283 }
2284
2285 if (Order == AtomicOrdering::Release ||
2286 Order == AtomicOrdering::AcquireRelease ||
2287 Order == AtomicOrdering::SequentiallyConsistent)
2288 /// TODO: This relies on a barrier always generating a waitcnt
2289 /// for LDS to ensure it is not reordered with the completion of
2290 /// the proceeding LDS operations. If barrier had a memory
2291 /// ordering and memory scope, then library does not need to
2292 /// generate a fence. Could add support in this file for
2293 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2294 /// adding S_WAITCNT before a S_BARRIER.
2295 Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2296 MOI.getIsCrossAddressSpaceOrdering(),
2297 Position::BEFORE);
2298
2299 // TODO: If both release and invalidate are happening they could be combined
2300 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2301 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2302 // track cache invalidate and write back instructions.
2303
2304 if (Order == AtomicOrdering::Acquire ||
2305 Order == AtomicOrdering::AcquireRelease ||
2306 Order == AtomicOrdering::SequentiallyConsistent)
2307 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2308 Position::BEFORE);
2309
2310 return Changed;
2311 }
2312
2313 return Changed;
2314}
2315
2316 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2317 MachineBasicBlock::iterator &MI) {
2318 assert(MI->mayLoad() && MI->mayStore());
2319
2320 bool Changed = false;
2321 MachineInstr &RMWMI = *MI;
2322
2323 if (MOI.isAtomic()) {
2324 const AtomicOrdering Order = MOI.getOrdering();
2325 if (Order == AtomicOrdering::Monotonic ||
2326 Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
2327 Order == AtomicOrdering::AcquireRelease ||
2328 Order == AtomicOrdering::SequentiallyConsistent) {
2329 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2330 MOI.getInstrAddrSpace());
2331 }
2332
2333 if (Order == AtomicOrdering::Release ||
2334 Order == AtomicOrdering::AcquireRelease ||
2335 Order == AtomicOrdering::SequentiallyConsistent ||
2336 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2337 Changed |= CC->insertRelease(MI, MOI.getScope(),
2338 MOI.getOrderingAddrSpace(),
2339 MOI.getIsCrossAddressSpaceOrdering(),
2340 Position::BEFORE);
2341
2342 if (Order == AtomicOrdering::Acquire ||
2343 Order == AtomicOrdering::AcquireRelease ||
2344 Order == AtomicOrdering::SequentiallyConsistent ||
2345 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2346 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2347 // Only wait on the previous atomic.
2348 Changed |=
2349 CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2350 isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2351 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER,
2352 Order, /*AtomicsOnly=*/true);
2353 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2354 MOI.getOrderingAddrSpace(),
2355 Position::AFTER);
2356 }
2357
2358 Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
2359 return Changed;
2360 }
2361
2362 return Changed;
2363}
2364
2365 bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
2366 MachineBasicBlock::iterator &MI) {
2367 assert(MI->mayLoad() && MI->mayStore());
2368
2369 // The volatility or nontemporal-ness of the operation is a
2370 // function of the global memory, not the LDS.
2371 SIMemOp OpKind =
2372 SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE;
2373
2374 // Handle volatile and/or nontemporal markers on direct-to-LDS loads and
2375 // stores. The operation is treated as a volatile/nontemporal store
2376 // to its second argument.
2377 return CC->enableVolatileAndOrNonTemporal(
2378 MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(),
2379 MOI.isNonTemporal(), MOI.isLastUse());
2380}
2381
2382bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2383 const MachineModuleInfo &MMI =
2384 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2385 return SIMemoryLegalizer(MMI).run(MF);
2386}
2387
2388 PreservedAnalyses
2389 SIMemoryLegalizerPass::run(MachineFunction &MF,
2390 MachineFunctionAnalysisManager &MFAM) {
2391 auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
2392 .getCachedResult<MachineModuleAnalysis>(
2393 *MF.getFunction().getParent());
2394 assert(MMI && "MachineModuleAnalysis must be available");
2395 if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
2396 return PreservedAnalyses::all();
2397 return getMachineFunctionPassPreservedAnalyses();
2398}
2399
2400bool SIMemoryLegalizer::run(MachineFunction &MF) {
2401 bool Changed = false;
2402
2403 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2404 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
2405 CC = SICacheControl::create(ST);
2406
2407 for (auto &MBB : MF) {
2408 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2409
2410 // Unbundle instructions after the post-RA scheduler.
2411 if (MI->isBundle() && MI->mayLoadOrStore()) {
2412 MachineBasicBlock::instr_iterator II(MI->getIterator());
2413 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2414 I != E && I->isBundledWithPred(); ++I) {
2415 I->unbundleFromPred();
2416 for (MachineOperand &MO : I->operands())
2417 if (MO.isReg())
2418 MO.setIsInternalRead(false);
2419 }
2420
2421 MI->eraseFromParent();
2422 MI = II->getIterator();
2423 }
2424
2425 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2426 continue;
2427
2428 if (const auto &MOI = MOA.getLoadInfo(MI)) {
2429 Changed |= expandLoad(*MOI, MI);
2430 } else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2431 Changed |= expandStore(*MOI, MI);
2432 } else if (const auto &MOI = MOA.getLDSDMAInfo(MI)) {
2433 Changed |= expandLDSDMA(*MOI, MI);
2434 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) {
2435 Changed |= expandAtomicFence(*MOI, MI);
2436 } else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) {
2437 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2438 }
2439 }
2440 }
2441
2442 Changed |= removeAtomicPseudoMIs();
2443 return Changed;
2444}
2445
2446INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)
2447
2448char SIMemoryLegalizerLegacy::ID = 0;
2449char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
2450 
2451 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2452 return new SIMemoryLegalizerLegacy();
2453}