LLVM 20.0.0git
SIMemoryLegalizer.cpp
Go to the documentation of this file.
1//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Memory legalizer - implements memory model. More information can be
11/// found here:
12/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13//
14//===----------------------------------------------------------------------===//
15
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/raw_ostream.h"
28
29using namespace llvm;
30using namespace llvm::AMDGPU;
31
32#define DEBUG_TYPE "si-memory-legalizer"
33#define PASS_NAME "SI Memory Legalizer"
34
36 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
37 cl::desc("Use this to skip inserting cache invalidating instructions."));
38
39namespace {
40
42
43/// Memory operation flags. Can be ORed together.
44enum class SIMemOp {
45 NONE = 0u,
46 LOAD = 1u << 0,
47 STORE = 1u << 1,
48 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
49};
50
/// Where a newly created instruction is placed with respect to an existing
/// instruction: immediately in front of it (BEFORE) or behind it (AFTER).
enum class Position { BEFORE, AFTER };
57
/// The atomic synchronization scopes supported by the AMDGPU target.
///
/// Enumerators are declared from the narrowest scope to the widest one. The
/// declaration order is significant: scopes are compared and clamped with
/// std::min, so a narrower scope must have a smaller value than a wider one.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};
67
68/// The distinct address spaces supported by the AMDGPU target for
69/// atomic memory operation. Can be ORed together.
70enum class SIAtomicAddrSpace {
71 NONE = 0u,
72 GLOBAL = 1u << 0,
73 LDS = 1u << 1,
74 SCRATCH = 1u << 2,
75 GDS = 1u << 3,
76 OTHER = 1u << 4,
77
78 /// The address spaces that can be accessed by a FLAT instruction.
79 FLAT = GLOBAL | LDS | SCRATCH,
80
81 /// The address spaces that support atomic instructions.
82 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
83
84 /// All address spaces.
85 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
86
87 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
88};
89
90class SIMemOpInfo final {
91private:
92
93 friend class SIMemOpAccess;
94
95 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
96 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
97 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
98 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
99 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
100 bool IsCrossAddressSpaceOrdering = false;
101 bool IsVolatile = false;
102 bool IsNonTemporal = false;
103 bool IsLastUse = false;
104
105 SIMemOpInfo(
106 AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
107 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
108 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
109 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
110 bool IsCrossAddressSpaceOrdering = true,
111 AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
112 bool IsVolatile = false, bool IsNonTemporal = false,
113 bool IsLastUse = false)
114 : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
115 OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
116 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
117 IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
118 IsLastUse(IsLastUse) {
119
120 if (Ordering == AtomicOrdering::NotAtomic) {
121 assert(Scope == SIAtomicScope::NONE &&
122 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
123 !IsCrossAddressSpaceOrdering &&
124 FailureOrdering == AtomicOrdering::NotAtomic);
125 return;
126 }
127
128 assert(Scope != SIAtomicScope::NONE &&
129 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130 SIAtomicAddrSpace::NONE &&
131 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
132 SIAtomicAddrSpace::NONE);
133
134 // There is also no cross address space ordering if the ordering
135 // address space is the same as the instruction address space and
136 // only contains a single address space.
137 if ((OrderingAddrSpace == InstrAddrSpace) &&
138 isPowerOf2_32(uint32_t(InstrAddrSpace)))
139 this->IsCrossAddressSpaceOrdering = false;
140
141 // Limit the scope to the maximum supported by the instruction's address
142 // spaces.
143 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
144 SIAtomicAddrSpace::NONE) {
145 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
146 } else if ((InstrAddrSpace &
147 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
148 SIAtomicAddrSpace::NONE) {
149 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
150 } else if ((InstrAddrSpace &
151 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
152 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
153 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
154 }
155 }
156
157public:
158 /// \returns Atomic synchronization scope of the machine instruction used to
159 /// create this SIMemOpInfo.
160 SIAtomicScope getScope() const {
161 return Scope;
162 }
163
164 /// \returns Ordering constraint of the machine instruction used to
165 /// create this SIMemOpInfo.
166 AtomicOrdering getOrdering() const {
167 return Ordering;
168 }
169
170 /// \returns Failure ordering constraint of the machine instruction used to
171 /// create this SIMemOpInfo.
172 AtomicOrdering getFailureOrdering() const {
173 return FailureOrdering;
174 }
175
176 /// \returns The address spaces be accessed by the machine
177 /// instruction used to create this SIMemOpInfo.
178 SIAtomicAddrSpace getInstrAddrSpace() const {
179 return InstrAddrSpace;
180 }
181
182 /// \returns The address spaces that must be ordered by the machine
183 /// instruction used to create this SIMemOpInfo.
184 SIAtomicAddrSpace getOrderingAddrSpace() const {
185 return OrderingAddrSpace;
186 }
187
188 /// \returns Return true iff memory ordering of operations on
189 /// different address spaces is required.
190 bool getIsCrossAddressSpaceOrdering() const {
191 return IsCrossAddressSpaceOrdering;
192 }
193
194 /// \returns True if memory access of the machine instruction used to
195 /// create this SIMemOpInfo is volatile, false otherwise.
196 bool isVolatile() const {
197 return IsVolatile;
198 }
199
200 /// \returns True if memory access of the machine instruction used to
201 /// create this SIMemOpInfo is nontemporal, false otherwise.
202 bool isNonTemporal() const {
203 return IsNonTemporal;
204 }
205
206 /// \returns True if memory access of the machine instruction used to
207 /// create this SIMemOpInfo is last use, false otherwise.
208 bool isLastUse() const { return IsLastUse; }
209
210 /// \returns True if ordering constraint of the machine instruction used to
211 /// create this SIMemOpInfo is unordered or higher, false otherwise.
212 bool isAtomic() const {
213 return Ordering != AtomicOrdering::NotAtomic;
214 }
215
216};
217
218class SIMemOpAccess final {
219private:
220 const AMDGPUMachineModuleInfo *MMI = nullptr;
221
222 /// Reports unsupported message \p Msg for \p MI to LLVM context.
223 void reportUnsupported(const MachineBasicBlock::iterator &MI,
224 const char *Msg) const;
225
226 /// Inspects the target synchronization scope \p SSID and determines
227 /// the SI atomic scope it corresponds to, the address spaces it
228 /// covers, and whether the memory ordering applies between address
229 /// spaces.
230 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
231 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
232
233 /// \return Return a bit set of the address spaces accessed by \p AS.
234 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
235
236 /// \returns Info constructed from \p MI, which has at least machine memory
237 /// operand.
238 std::optional<SIMemOpInfo>
239 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
240
241public:
242 /// Construct class to support accessing the machine memory operands
243 /// of instructions in the machine function \p MF.
244 SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);
245
246 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
247 std::optional<SIMemOpInfo>
249
250 /// \returns Store info if \p MI is a store operation, "std::nullopt"
251 /// otherwise.
252 std::optional<SIMemOpInfo>
253 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
254
255 /// \returns Atomic fence info if \p MI is an atomic fence operation,
256 /// "std::nullopt" otherwise.
257 std::optional<SIMemOpInfo>
258 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
259
260 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
261 /// rmw operation, "std::nullopt" otherwise.
262 std::optional<SIMemOpInfo>
263 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
264};
265
266class SICacheControl {
267protected:
268
269 /// AMDGPU subtarget info.
270 const GCNSubtarget &ST;
271
272 /// Instruction info.
273 const SIInstrInfo *TII = nullptr;
274
275 IsaVersion IV;
276
277 /// Whether to insert cache invalidating instructions.
278 bool InsertCacheInv;
279
280 SICacheControl(const GCNSubtarget &ST);
281
282 /// Sets named bit \p BitName to "true" if present in instruction \p MI.
283 /// \returns Returns true if \p MI is modified, false otherwise.
284 bool enableNamedBit(const MachineBasicBlock::iterator MI,
285 AMDGPU::CPol::CPol Bit) const;
286
287public:
288
289 /// Create a cache control for the subtarget \p ST.
290 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
291
292 /// Update \p MI memory load instruction to bypass any caches up to
293 /// the \p Scope memory scope for address spaces \p
294 /// AddrSpace. Return true iff the instruction was modified.
295 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
296 SIAtomicScope Scope,
297 SIAtomicAddrSpace AddrSpace) const = 0;
298
299 /// Update \p MI memory store instruction to bypass any caches up to
300 /// the \p Scope memory scope for address spaces \p
301 /// AddrSpace. Return true iff the instruction was modified.
302 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
303 SIAtomicScope Scope,
304 SIAtomicAddrSpace AddrSpace) const = 0;
305
306 /// Update \p MI memory read-modify-write instruction to bypass any caches up
307 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
308 /// iff the instruction was modified.
309 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
310 SIAtomicScope Scope,
311 SIAtomicAddrSpace AddrSpace) const = 0;
312
313 /// Update \p MI memory instruction of kind \p Op associated with address
314 /// spaces \p AddrSpace to indicate it is volatile and/or
315 /// nontemporal/last-use. Return true iff the instruction was modified.
316 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
317 SIAtomicAddrSpace AddrSpace,
318 SIMemOp Op, bool IsVolatile,
319 bool IsNonTemporal,
320 bool IsLastUse = false) const = 0;
321
322 virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
323 return false;
324 };
325
326 /// Inserts any necessary instructions at position \p Pos relative
327 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
328 /// \p Op associated with address spaces \p AddrSpace have completed. Used
329 /// between memory instructions to enforce the order they become visible as
330 /// observed by other memory instructions executing in memory scope \p Scope.
331 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
332 /// address spaces. Returns true iff any instructions inserted.
333 virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
334 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
335 bool IsCrossAddrSpaceOrdering, Position Pos,
336 AtomicOrdering Order) const = 0;
337
338 /// Inserts any necessary instructions at position \p Pos relative to
339 /// instruction \p MI to ensure any subsequent memory instructions of this
340 /// thread with address spaces \p AddrSpace will observe the previous memory
341 /// operations by any thread for memory scopes up to memory scope \p Scope .
342 /// Returns true iff any instructions inserted.
343 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
344 SIAtomicScope Scope,
345 SIAtomicAddrSpace AddrSpace,
346 Position Pos) const = 0;
347
348 /// Inserts any necessary instructions at position \p Pos relative to
349 /// instruction \p MI to ensure previous memory instructions by this thread
350 /// with address spaces \p AddrSpace have completed and can be observed by
351 /// subsequent memory instructions by any thread executing in memory scope \p
352 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
353 /// between address spaces. Returns true iff any instructions inserted.
354 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
355 SIAtomicScope Scope,
356 SIAtomicAddrSpace AddrSpace,
357 bool IsCrossAddrSpaceOrdering,
358 Position Pos) const = 0;
359
360 /// Virtual destructor to allow derivations to be deleted.
361 virtual ~SICacheControl() = default;
362
363 virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
365 return false;
366 }
367};
368
369class SIGfx6CacheControl : public SICacheControl {
370protected:
371
372 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
373 /// is modified, false otherwise.
374 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
375 return enableNamedBit(MI, AMDGPU::CPol::GLC);
376 }
377
378 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
379 /// is modified, false otherwise.
380 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
381 return enableNamedBit(MI, AMDGPU::CPol::SLC);
382 }
383
384public:
385
386 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
387
388 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
389 SIAtomicScope Scope,
390 SIAtomicAddrSpace AddrSpace) const override;
391
392 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
393 SIAtomicScope Scope,
394 SIAtomicAddrSpace AddrSpace) const override;
395
396 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
397 SIAtomicScope Scope,
398 SIAtomicAddrSpace AddrSpace) const override;
399
400 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
401 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
402 bool IsVolatile, bool IsNonTemporal,
403 bool IsLastUse) const override;
404
405 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
406 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
407 bool IsCrossAddrSpaceOrdering, Position Pos,
408 AtomicOrdering Order) const override;
409
410 bool insertAcquire(MachineBasicBlock::iterator &MI,
411 SIAtomicScope Scope,
412 SIAtomicAddrSpace AddrSpace,
413 Position Pos) const override;
414
415 bool insertRelease(MachineBasicBlock::iterator &MI,
416 SIAtomicScope Scope,
417 SIAtomicAddrSpace AddrSpace,
418 bool IsCrossAddrSpaceOrdering,
419 Position Pos) const override;
420};
421
422class SIGfx7CacheControl : public SIGfx6CacheControl {
423public:
424
425 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
426
427 bool insertAcquire(MachineBasicBlock::iterator &MI,
428 SIAtomicScope Scope,
429 SIAtomicAddrSpace AddrSpace,
430 Position Pos) const override;
431
432};
433
434class SIGfx90ACacheControl : public SIGfx7CacheControl {
435public:
436
437 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
438
439 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
440 SIAtomicScope Scope,
441 SIAtomicAddrSpace AddrSpace) const override;
442
443 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
444 SIAtomicScope Scope,
445 SIAtomicAddrSpace AddrSpace) const override;
446
447 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
448 SIAtomicScope Scope,
449 SIAtomicAddrSpace AddrSpace) const override;
450
451 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
452 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
453 bool IsVolatile, bool IsNonTemporal,
454 bool IsLastUse) const override;
455
456 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
457 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
458 bool IsCrossAddrSpaceOrdering, Position Pos,
459 AtomicOrdering Order) const override;
460
461 bool insertAcquire(MachineBasicBlock::iterator &MI,
462 SIAtomicScope Scope,
463 SIAtomicAddrSpace AddrSpace,
464 Position Pos) const override;
465
466 bool insertRelease(MachineBasicBlock::iterator &MI,
467 SIAtomicScope Scope,
468 SIAtomicAddrSpace AddrSpace,
469 bool IsCrossAddrSpaceOrdering,
470 Position Pos) const override;
471};
472
473class SIGfx940CacheControl : public SIGfx90ACacheControl {
474protected:
475
476 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
477 /// is modified, false otherwise.
478 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
479 return enableNamedBit(MI, AMDGPU::CPol::SC0);
480 }
481
482 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
483 /// is modified, false otherwise.
484 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
485 return enableNamedBit(MI, AMDGPU::CPol::SC1);
486 }
487
488 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
489 /// is modified, false otherwise.
490 bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
491 return enableNamedBit(MI, AMDGPU::CPol::NT);
492 }
493
494public:
495
496 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
497
498 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
499 SIAtomicScope Scope,
500 SIAtomicAddrSpace AddrSpace) const override;
501
502 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
503 SIAtomicScope Scope,
504 SIAtomicAddrSpace AddrSpace) const override;
505
506 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
507 SIAtomicScope Scope,
508 SIAtomicAddrSpace AddrSpace) const override;
509
510 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
511 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
512 bool IsVolatile, bool IsNonTemporal,
513 bool IsLastUse) const override;
514
515 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
516 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
517
518 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
519 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
520 Position Pos) const override;
521
522 bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
523 MachineBasicBlock::iterator &MI) const override {
524 bool Changed = false;
525 if (ST.hasForceStoreSC0SC1() &&
526 (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
527 SIAtomicAddrSpace::GLOBAL |
528 SIAtomicAddrSpace::OTHER)) !=
529 SIAtomicAddrSpace::NONE) {
530 Changed |= enableSC0Bit(MI);
531 Changed |= enableSC1Bit(MI);
532 }
533 return Changed;
534 }
535};
536
537class SIGfx10CacheControl : public SIGfx7CacheControl {
538protected:
539
540 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
541 /// is modified, false otherwise.
542 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
543 return enableNamedBit(MI, AMDGPU::CPol::DLC);
544 }
545
546public:
547
548 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
549
550 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
551 SIAtomicScope Scope,
552 SIAtomicAddrSpace AddrSpace) const override;
553
554 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
555 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
556 bool IsVolatile, bool IsNonTemporal,
557 bool IsLastUse) const override;
558
559 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
560 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
561 bool IsCrossAddrSpaceOrdering, Position Pos,
562 AtomicOrdering Order) const override;
563
564 bool insertAcquire(MachineBasicBlock::iterator &MI,
565 SIAtomicScope Scope,
566 SIAtomicAddrSpace AddrSpace,
567 Position Pos) const override;
568};
569
570class SIGfx11CacheControl : public SIGfx10CacheControl {
571public:
572 SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
573
574 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
575 SIAtomicScope Scope,
576 SIAtomicAddrSpace AddrSpace) const override;
577
578 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
579 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
580 bool IsVolatile, bool IsNonTemporal,
581 bool IsLastUse) const override;
582};
583
584class SIGfx12CacheControl : public SIGfx11CacheControl {
585protected:
586 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
587 // \returns Returns true if \p MI is modified, false otherwise.
588 bool setTH(const MachineBasicBlock::iterator MI,
590 // Sets Scope policy to \p Value if CPol operand is present in instruction \p
591 // MI. \returns Returns true if \p MI is modified, false otherwise.
592 bool setScope(const MachineBasicBlock::iterator MI,
594
595 // Stores with system scope (SCOPE_SYS) need to wait for:
596 // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
597 // - non-returning-atomics - wait for STORECNT==0
598 // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
599 // since it does not distinguish atomics-with-return from regular stores.
600 // There is no need to wait if memory is cached (mtype != UC).
601 bool
602 insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
603
604 bool setAtomicScope(const MachineBasicBlock::iterator &MI,
605 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
606
607public:
608 SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
609
610 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
611 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
612 bool IsCrossAddrSpaceOrdering, Position Pos,
613 AtomicOrdering Order) const override;
614
615 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
616 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
617
618 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
619 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
620 bool IsVolatile, bool IsNonTemporal,
621 bool IsLastUse) const override;
622
623 bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
624
625 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
626 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
627 Position Pos) const override;
628
629 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
630 SIAtomicScope Scope,
631 SIAtomicAddrSpace AddrSpace) const override {
632 return setAtomicScope(MI, Scope, AddrSpace);
633 }
634
635 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
636 SIAtomicScope Scope,
637 SIAtomicAddrSpace AddrSpace) const override {
638 return setAtomicScope(MI, Scope, AddrSpace);
639 }
640
641 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
642 SIAtomicScope Scope,
643 SIAtomicAddrSpace AddrSpace) const override {
644 return setAtomicScope(MI, Scope, AddrSpace);
645 }
646};
647
648class SIMemoryLegalizer final : public MachineFunctionPass {
649private:
650
651 /// Cache Control.
652 std::unique_ptr<SICacheControl> CC = nullptr;
653
654 /// List of atomic pseudo instructions.
655 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
656
657 /// Return true iff instruction \p MI is a atomic instruction that
658 /// returns a result.
659 bool isAtomicRet(const MachineInstr &MI) const {
661 }
662
663 /// Removes all processed atomic pseudo instructions from the current
664 /// function. Returns true if current function is modified, false otherwise.
665 bool removeAtomicPseudoMIs();
666
667 /// Expands load operation \p MI. Returns true if instructions are
668 /// added/deleted or \p MI is modified, false otherwise.
669 bool expandLoad(const SIMemOpInfo &MOI,
671 /// Expands store operation \p MI. Returns true if instructions are
672 /// added/deleted or \p MI is modified, false otherwise.
673 bool expandStore(const SIMemOpInfo &MOI,
675 /// Expands atomic fence operation \p MI. Returns true if
676 /// instructions are added/deleted or \p MI is modified, false otherwise.
677 bool expandAtomicFence(const SIMemOpInfo &MOI,
679 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
680 /// instructions are added/deleted or \p MI is modified, false otherwise.
681 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
683
684public:
685 static char ID;
686
687 SIMemoryLegalizer() : MachineFunctionPass(ID) {}
688
689 void getAnalysisUsage(AnalysisUsage &AU) const override {
690 AU.setPreservesCFG();
692 }
693
694 StringRef getPassName() const override {
695 return PASS_NAME;
696 }
697
698 bool runOnMachineFunction(MachineFunction &MF) override;
699};
700
701static const StringMap<SIAtomicAddrSpace> ASNames = {{
702 {"global", SIAtomicAddrSpace::GLOBAL},
703 {"local", SIAtomicAddrSpace::LDS},
704}};
705
706void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
707 const MachineFunction *MF = MI.getMF();
708 const Function &Fn = MF->getFunction();
711 OS << "unknown address space '" << AS << "'; expected one of ";
712 ListSeparator LS;
713 for (const auto &[Name, Val] : ASNames)
714 OS << LS << '\'' << Name << '\'';
715 DiagnosticInfoUnsupported BadTag(Fn, Str.str(), MI.getDebugLoc(), DS_Warning);
716 Fn.getContext().diagnose(BadTag);
717}
718
719/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
720/// If this tag isn't present, or if it has no meaningful values, returns \p
721/// Default. Otherwise returns all the address spaces concerned by the MMRA.
722static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
723 SIAtomicAddrSpace Default) {
724 static constexpr StringLiteral FenceASPrefix = "amdgpu-as";
725
726 auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
727 if (!MMRA)
728 return Default;
729
730 SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
731 for (const auto &[Prefix, Suffix] : MMRA) {
732 if (Prefix != FenceASPrefix)
733 continue;
734
735 if (auto It = ASNames.find(Suffix); It != ASNames.end())
736 Result |= It->second;
737 else
738 diagnoseUnknownMMRAASName(MI, Suffix);
739 }
740
741 return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
742}
743
744} // end anonymous namespace
745
746void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
747 const char *Msg) const {
748 const Function &Func = MI->getParent()->getParent()->getFunction();
749 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
750 Func.getContext().diagnose(Diag);
751}
752
753std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
754SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
755 SIAtomicAddrSpace InstrAddrSpace) const {
756 if (SSID == SyncScope::System)
757 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
758 if (SSID == MMI->getAgentSSID())
759 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
760 if (SSID == MMI->getWorkgroupSSID())
761 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
762 true);
763 if (SSID == MMI->getWavefrontSSID())
764 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
765 true);
766 if (SSID == SyncScope::SingleThread)
767 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
768 true);
769 if (SSID == MMI->getSystemOneAddressSpaceSSID())
770 return std::tuple(SIAtomicScope::SYSTEM,
771 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
772 if (SSID == MMI->getAgentOneAddressSpaceSSID())
773 return std::tuple(SIAtomicScope::AGENT,
774 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
775 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
776 return std::tuple(SIAtomicScope::WORKGROUP,
777 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
778 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
779 return std::tuple(SIAtomicScope::WAVEFRONT,
780 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
781 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
782 return std::tuple(SIAtomicScope::SINGLETHREAD,
783 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
784 return std::nullopt;
785}
786
787SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
788 if (AS == AMDGPUAS::FLAT_ADDRESS)
789 return SIAtomicAddrSpace::FLAT;
790 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
791 return SIAtomicAddrSpace::GLOBAL;
792 if (AS == AMDGPUAS::LOCAL_ADDRESS)
793 return SIAtomicAddrSpace::LDS;
795 return SIAtomicAddrSpace::SCRATCH;
796 if (AS == AMDGPUAS::REGION_ADDRESS)
797 return SIAtomicAddrSpace::GDS;
798
799 return SIAtomicAddrSpace::OTHER;
800}
801
802SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
803 : MMI(&MMI_) {}
804
805std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
806 const MachineBasicBlock::iterator &MI) const {
807 assert(MI->getNumMemOperands() > 0);
808
810 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
811 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
812 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
813 bool IsNonTemporal = true;
814 bool IsVolatile = false;
815 bool IsLastUse = false;
816
817 // Validator should check whether or not MMOs cover the entire set of
818 // locations accessed by the memory instruction.
819 for (const auto &MMO : MI->memoperands()) {
820 IsNonTemporal &= MMO->isNonTemporal();
821 IsVolatile |= MMO->isVolatile();
822 IsLastUse |= MMO->getFlags() & MOLastUse;
823 InstrAddrSpace |=
824 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
825 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
826 if (OpOrdering != AtomicOrdering::NotAtomic) {
827 const auto &IsSyncScopeInclusion =
828 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
829 if (!IsSyncScopeInclusion) {
830 reportUnsupported(MI,
831 "Unsupported non-inclusive atomic synchronization scope");
832 return std::nullopt;
833 }
834
835 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
836 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
837 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
838 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
839 FailureOrdering =
840 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
841 }
842 }
843
844 SIAtomicScope Scope = SIAtomicScope::NONE;
845 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
846 bool IsCrossAddressSpaceOrdering = false;
847 if (Ordering != AtomicOrdering::NotAtomic) {
848 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
849 if (!ScopeOrNone) {
850 reportUnsupported(MI, "Unsupported atomic synchronization scope");
851 return std::nullopt;
852 }
853 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
854 *ScopeOrNone;
855 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
856 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
857 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
858 reportUnsupported(MI, "Unsupported atomic address space");
859 return std::nullopt;
860 }
861 }
862 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
863 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
864 IsNonTemporal, IsLastUse);
865}
866
867std::optional<SIMemOpInfo>
868SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
869 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
870
871 if (!(MI->mayLoad() && !MI->mayStore()))
872 return std::nullopt;
873
874 // Be conservative if there are no memory operands.
875 if (MI->getNumMemOperands() == 0)
876 return SIMemOpInfo();
877
878 return constructFromMIWithMMO(MI);
879}
880
881std::optional<SIMemOpInfo>
882SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
883 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
884
885 if (!(!MI->mayLoad() && MI->mayStore()))
886 return std::nullopt;
887
888 // Be conservative if there are no memory operands.
889 if (MI->getNumMemOperands() == 0)
890 return SIMemOpInfo();
891
892 return constructFromMIWithMMO(MI);
893}
894
895std::optional<SIMemOpInfo>
896SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
897 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
898
899 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
900 return std::nullopt;
901
903 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
904
905 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
906 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
907 if (!ScopeOrNone) {
908 reportUnsupported(MI, "Unsupported atomic synchronization scope");
909 return std::nullopt;
910 }
911
912 SIAtomicScope Scope = SIAtomicScope::NONE;
913 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
914 bool IsCrossAddressSpaceOrdering = false;
915 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
916 *ScopeOrNone;
917
918 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
919 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
920 reportUnsupported(MI, "Unsupported atomic address space");
921 return std::nullopt;
922 }
923
924 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
925 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
926}
927
928std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
929 const MachineBasicBlock::iterator &MI) const {
930 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
931
932 if (!(MI->mayLoad() && MI->mayStore()))
933 return std::nullopt;
934
935 // Be conservative if there are no memory operands.
936 if (MI->getNumMemOperands() == 0)
937 return SIMemOpInfo();
938
939 return constructFromMIWithMMO(MI);
940}
941
942SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
943 TII = ST.getInstrInfo();
944 IV = getIsaVersion(ST.getCPU());
945 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
946}
947
948bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
949 AMDGPU::CPol::CPol Bit) const {
950 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
951 if (!CPol)
952 return false;
953
954 CPol->setImm(CPol->getImm() | Bit);
955 return true;
956}
957
958/* static */
959std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
960 GCNSubtarget::Generation Generation = ST.getGeneration();
961 if (ST.hasGFX940Insts())
962 return std::make_unique<SIGfx940CacheControl>(ST);
963 if (ST.hasGFX90AInsts())
964 return std::make_unique<SIGfx90ACacheControl>(ST);
965 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
966 return std::make_unique<SIGfx6CacheControl>(ST);
967 if (Generation < AMDGPUSubtarget::GFX10)
968 return std::make_unique<SIGfx7CacheControl>(ST);
969 if (Generation < AMDGPUSubtarget::GFX11)
970 return std::make_unique<SIGfx10CacheControl>(ST);
971 if (Generation < AMDGPUSubtarget::GFX12)
972 return std::make_unique<SIGfx11CacheControl>(ST);
973 return std::make_unique<SIGfx12CacheControl>(ST);
974}
975
976bool SIGfx6CacheControl::enableLoadCacheBypass(
978 SIAtomicScope Scope,
979 SIAtomicAddrSpace AddrSpace) const {
980 assert(MI->mayLoad() && !MI->mayStore());
981 bool Changed = false;
982
983 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
984 switch (Scope) {
985 case SIAtomicScope::SYSTEM:
986 case SIAtomicScope::AGENT:
987 // Set L1 cache policy to MISS_EVICT.
988 // Note: there is no L2 cache bypass policy at the ISA level.
989 Changed |= enableGLCBit(MI);
990 break;
991 case SIAtomicScope::WORKGROUP:
992 case SIAtomicScope::WAVEFRONT:
993 case SIAtomicScope::SINGLETHREAD:
994 // No cache to bypass.
995 break;
996 default:
997 llvm_unreachable("Unsupported synchronization scope");
998 }
999 }
1000
1001 /// The scratch address space does not need the global memory caches
1002 /// to be bypassed as all memory operations by the same thread are
1003 /// sequentially consistent, and no other thread can access scratch
1004 /// memory.
1005
1006 /// Other address spaces do not have a cache.
1007
1008 return Changed;
1009}
1010
1011bool SIGfx6CacheControl::enableStoreCacheBypass(
1013 SIAtomicScope Scope,
1014 SIAtomicAddrSpace AddrSpace) const {
1015 assert(!MI->mayLoad() && MI->mayStore());
1016 bool Changed = false;
1017
1018 /// The L1 cache is write through so does not need to be bypassed. There is no
1019 /// bypass control for the L2 cache at the isa level.
1020
1021 return Changed;
1022}
1023
1024bool SIGfx6CacheControl::enableRMWCacheBypass(
1026 SIAtomicScope Scope,
1027 SIAtomicAddrSpace AddrSpace) const {
1028 assert(MI->mayLoad() && MI->mayStore());
1029 bool Changed = false;
1030
1031 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
1032 /// bypassed, and the GLC bit is instead used to indicate if they are
1033 /// return or no-return.
1034 /// Note: there is no L2 cache coherent bypass control at the ISA level.
1035
1036 return Changed;
1037}
1038
1039bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1040 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1041 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1042 // Only handle load and store, not atomic read-modify-write insructions. The
1043 // latter use glc to indicate if the atomic returns a result and so must not
1044 // be used for cache control.
1045 assert(MI->mayLoad() ^ MI->mayStore());
1046
1047 // Only update load and store, not LLVM IR atomic read-modify-write
1048 // instructions. The latter are always marked as volatile so cannot sensibly
1049 // handle it as do not want to pessimize all atomics. Also they do not support
1050 // the nontemporal attribute.
1051 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1052
1053 bool Changed = false;
1054
1055 if (IsVolatile) {
1056 // Set L1 cache policy to be MISS_EVICT for load instructions
1057 // and MISS_LRU for store instructions.
1058 // Note: there is no L2 cache bypass policy at the ISA level.
1059 if (Op == SIMemOp::LOAD)
1060 Changed |= enableGLCBit(MI);
1061
1062 // Ensure operation has completed at system scope to cause all volatile
1063 // operations to be visible outside the program in a global order. Do not
1064 // request cross address space as only the global address space can be
1065 // observable outside the program, so no need to cause a waitcnt for LDS
1066 // address space operations.
1067 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1068 Position::AFTER, AtomicOrdering::Unordered);
1069
1070 return Changed;
1071 }
1072
1073 if (IsNonTemporal) {
1074 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1075 // for both loads and stores, and the L2 cache policy to STREAM.
1076 Changed |= enableGLCBit(MI);
1077 Changed |= enableSLCBit(MI);
1078 return Changed;
1079 }
1080
1081 return Changed;
1082}
1083
1084bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1085 SIAtomicScope Scope,
1086 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1087 bool IsCrossAddrSpaceOrdering, Position Pos,
1088 AtomicOrdering Order) const {
1089 bool Changed = false;
1090
1091 MachineBasicBlock &MBB = *MI->getParent();
1092 DebugLoc DL = MI->getDebugLoc();
1093
1094 if (Pos == Position::AFTER)
1095 ++MI;
1096
1097 bool VMCnt = false;
1098 bool LGKMCnt = false;
1099
1100 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1101 SIAtomicAddrSpace::NONE) {
1102 switch (Scope) {
1103 case SIAtomicScope::SYSTEM:
1104 case SIAtomicScope::AGENT:
1105 VMCnt |= true;
1106 break;
1107 case SIAtomicScope::WORKGROUP:
1108 case SIAtomicScope::WAVEFRONT:
1109 case SIAtomicScope::SINGLETHREAD:
1110 // The L1 cache keeps all memory operations in order for
1111 // wavefronts in the same work-group.
1112 break;
1113 default:
1114 llvm_unreachable("Unsupported synchronization scope");
1115 }
1116 }
1117
1118 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1119 switch (Scope) {
1120 case SIAtomicScope::SYSTEM:
1121 case SIAtomicScope::AGENT:
1122 case SIAtomicScope::WORKGROUP:
1123 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1124 // not needed as LDS operations for all waves are executed in a total
1125 // global ordering as observed by all waves. Required if also
1126 // synchronizing with global/GDS memory as LDS operations could be
1127 // reordered with respect to later global/GDS memory operations of the
1128 // same wave.
1129 LGKMCnt |= IsCrossAddrSpaceOrdering;
1130 break;
1131 case SIAtomicScope::WAVEFRONT:
1132 case SIAtomicScope::SINGLETHREAD:
1133 // The LDS keeps all memory operations in order for
1134 // the same wavefront.
1135 break;
1136 default:
1137 llvm_unreachable("Unsupported synchronization scope");
1138 }
1139 }
1140
1141 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1142 switch (Scope) {
1143 case SIAtomicScope::SYSTEM:
1144 case SIAtomicScope::AGENT:
1145 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1146 // is not needed as GDS operations for all waves are executed in a total
1147 // global ordering as observed by all waves. Required if also
1148 // synchronizing with global/LDS memory as GDS operations could be
1149 // reordered with respect to later global/LDS memory operations of the
1150 // same wave.
1151 LGKMCnt |= IsCrossAddrSpaceOrdering;
1152 break;
1153 case SIAtomicScope::WORKGROUP:
1154 case SIAtomicScope::WAVEFRONT:
1155 case SIAtomicScope::SINGLETHREAD:
1156 // The GDS keeps all memory operations in order for
1157 // the same work-group.
1158 break;
1159 default:
1160 llvm_unreachable("Unsupported synchronization scope");
1161 }
1162 }
1163
1164 if (VMCnt || LGKMCnt) {
1165 unsigned WaitCntImmediate =
1167 VMCnt ? 0 : getVmcntBitMask(IV),
1169 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1170 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1171 .addImm(WaitCntImmediate);
1172 Changed = true;
1173 }
1174
1175 if (Pos == Position::AFTER)
1176 --MI;
1177
1178 return Changed;
1179}
1180
1181bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1182 SIAtomicScope Scope,
1183 SIAtomicAddrSpace AddrSpace,
1184 Position Pos) const {
1185 if (!InsertCacheInv)
1186 return false;
1187
1188 bool Changed = false;
1189
1190 MachineBasicBlock &MBB = *MI->getParent();
1191 DebugLoc DL = MI->getDebugLoc();
1192
1193 if (Pos == Position::AFTER)
1194 ++MI;
1195
1196 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1197 switch (Scope) {
1198 case SIAtomicScope::SYSTEM:
1199 case SIAtomicScope::AGENT:
1200 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1201 Changed = true;
1202 break;
1203 case SIAtomicScope::WORKGROUP:
1204 case SIAtomicScope::WAVEFRONT:
1205 case SIAtomicScope::SINGLETHREAD:
1206 // No cache to invalidate.
1207 break;
1208 default:
1209 llvm_unreachable("Unsupported synchronization scope");
1210 }
1211 }
1212
1213 /// The scratch address space does not need the global memory cache
1214 /// to be flushed as all memory operations by the same thread are
1215 /// sequentially consistent, and no other thread can access scratch
1216 /// memory.
1217
1218 /// Other address spaces do not have a cache.
1219
1220 if (Pos == Position::AFTER)
1221 --MI;
1222
1223 return Changed;
1224}
1225
1226bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1227 SIAtomicScope Scope,
1228 SIAtomicAddrSpace AddrSpace,
1229 bool IsCrossAddrSpaceOrdering,
1230 Position Pos) const {
1231 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1232 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1233}
1234
1235bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1236 SIAtomicScope Scope,
1237 SIAtomicAddrSpace AddrSpace,
1238 Position Pos) const {
1239 if (!InsertCacheInv)
1240 return false;
1241
1242 bool Changed = false;
1243
1244 MachineBasicBlock &MBB = *MI->getParent();
1245 DebugLoc DL = MI->getDebugLoc();
1246
1248
1249 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1250 ? AMDGPU::BUFFER_WBINVL1
1251 : AMDGPU::BUFFER_WBINVL1_VOL;
1252
1253 if (Pos == Position::AFTER)
1254 ++MI;
1255
1256 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1257 switch (Scope) {
1258 case SIAtomicScope::SYSTEM:
1259 case SIAtomicScope::AGENT:
1260 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1261 Changed = true;
1262 break;
1263 case SIAtomicScope::WORKGROUP:
1264 case SIAtomicScope::WAVEFRONT:
1265 case SIAtomicScope::SINGLETHREAD:
1266 // No cache to invalidate.
1267 break;
1268 default:
1269 llvm_unreachable("Unsupported synchronization scope");
1270 }
1271 }
1272
1273 /// The scratch address space does not need the global memory cache
1274 /// to be flushed as all memory operations by the same thread are
1275 /// sequentially consistent, and no other thread can access scratch
1276 /// memory.
1277
1278 /// Other address spaces do not have a cache.
1279
1280 if (Pos == Position::AFTER)
1281 --MI;
1282
1283 return Changed;
1284}
1285
1286bool SIGfx90ACacheControl::enableLoadCacheBypass(
1288 SIAtomicScope Scope,
1289 SIAtomicAddrSpace AddrSpace) const {
1290 assert(MI->mayLoad() && !MI->mayStore());
1291 bool Changed = false;
1292
1293 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1294 switch (Scope) {
1295 case SIAtomicScope::SYSTEM:
1296 case SIAtomicScope::AGENT:
1297 // Set the L1 cache policy to MISS_LRU.
1298 // Note: there is no L2 cache bypass policy at the ISA level.
1299 Changed |= enableGLCBit(MI);
1300 break;
1301 case SIAtomicScope::WORKGROUP:
1302 // In threadgroup split mode the waves of a work-group can be executing on
1303 // different CUs. Therefore need to bypass the L1 which is per CU.
1304 // Otherwise in non-threadgroup split mode all waves of a work-group are
1305 // on the same CU, and so the L1 does not need to be bypassed.
1306 if (ST.isTgSplitEnabled())
1307 Changed |= enableGLCBit(MI);
1308 break;
1309 case SIAtomicScope::WAVEFRONT:
1310 case SIAtomicScope::SINGLETHREAD:
1311 // No cache to bypass.
1312 break;
1313 default:
1314 llvm_unreachable("Unsupported synchronization scope");
1315 }
1316 }
1317
1318 /// The scratch address space does not need the global memory caches
1319 /// to be bypassed as all memory operations by the same thread are
1320 /// sequentially consistent, and no other thread can access scratch
1321 /// memory.
1322
1323 /// Other address spaces do not have a cache.
1324
1325 return Changed;
1326}
1327
1328bool SIGfx90ACacheControl::enableStoreCacheBypass(
1330 SIAtomicScope Scope,
1331 SIAtomicAddrSpace AddrSpace) const {
1332 assert(!MI->mayLoad() && MI->mayStore());
1333 bool Changed = false;
1334
1335 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1336 switch (Scope) {
1337 case SIAtomicScope::SYSTEM:
1338 case SIAtomicScope::AGENT:
1339 /// Do not set glc for store atomic operations as they implicitly write
1340 /// through the L1 cache.
1341 break;
1342 case SIAtomicScope::WORKGROUP:
1343 case SIAtomicScope::WAVEFRONT:
1344 case SIAtomicScope::SINGLETHREAD:
1345 // No cache to bypass. Store atomics implicitly write through the L1
1346 // cache.
1347 break;
1348 default:
1349 llvm_unreachable("Unsupported synchronization scope");
1350 }
1351 }
1352
1353 /// The scratch address space does not need the global memory caches
1354 /// to be bypassed as all memory operations by the same thread are
1355 /// sequentially consistent, and no other thread can access scratch
1356 /// memory.
1357
1358 /// Other address spaces do not have a cache.
1359
1360 return Changed;
1361}
1362
1363bool SIGfx90ACacheControl::enableRMWCacheBypass(
1365 SIAtomicScope Scope,
1366 SIAtomicAddrSpace AddrSpace) const {
1367 assert(MI->mayLoad() && MI->mayStore());
1368 bool Changed = false;
1369
1370 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1371 switch (Scope) {
1372 case SIAtomicScope::SYSTEM:
1373 case SIAtomicScope::AGENT:
1374 /// Do not set glc for RMW atomic operations as they implicitly bypass
1375 /// the L1 cache, and the glc bit is instead used to indicate if they are
1376 /// return or no-return.
1377 break;
1378 case SIAtomicScope::WORKGROUP:
1379 case SIAtomicScope::WAVEFRONT:
1380 case SIAtomicScope::SINGLETHREAD:
1381 // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1382 break;
1383 default:
1384 llvm_unreachable("Unsupported synchronization scope");
1385 }
1386 }
1387
1388 return Changed;
1389}
1390
1391bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1392 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1393 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1394 // Only handle load and store, not atomic read-modify-write insructions. The
1395 // latter use glc to indicate if the atomic returns a result and so must not
1396 // be used for cache control.
1397 assert(MI->mayLoad() ^ MI->mayStore());
1398
1399 // Only update load and store, not LLVM IR atomic read-modify-write
1400 // instructions. The latter are always marked as volatile so cannot sensibly
1401 // handle it as do not want to pessimize all atomics. Also they do not support
1402 // the nontemporal attribute.
1403 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1404
1405 bool Changed = false;
1406
1407 if (IsVolatile) {
1408 // Set L1 cache policy to be MISS_EVICT for load instructions
1409 // and MISS_LRU for store instructions.
1410 // Note: there is no L2 cache bypass policy at the ISA level.
1411 if (Op == SIMemOp::LOAD)
1412 Changed |= enableGLCBit(MI);
1413
1414 // Ensure operation has completed at system scope to cause all volatile
1415 // operations to be visible outside the program in a global order. Do not
1416 // request cross address space as only the global address space can be
1417 // observable outside the program, so no need to cause a waitcnt for LDS
1418 // address space operations.
1419 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1420 Position::AFTER, AtomicOrdering::Unordered);
1421
1422 return Changed;
1423 }
1424
1425 if (IsNonTemporal) {
1426 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1427 // for both loads and stores, and the L2 cache policy to STREAM.
1428 Changed |= enableGLCBit(MI);
1429 Changed |= enableSLCBit(MI);
1430 return Changed;
1431 }
1432
1433 return Changed;
1434}
1435
1436bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1437 SIAtomicScope Scope,
1438 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1439 bool IsCrossAddrSpaceOrdering,
1440 Position Pos,
1441 AtomicOrdering Order) const {
1442 if (ST.isTgSplitEnabled()) {
1443 // In threadgroup split mode the waves of a work-group can be executing on
1444 // different CUs. Therefore need to wait for global or GDS memory operations
1445 // to complete to ensure they are visible to waves in the other CUs.
1446 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1447 // the same CU, so no need to wait for global memory as all waves in the
1448 // work-group access the same the L1, nor wait for GDS as access are ordered
1449 // on a CU.
1450 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1451 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1452 (Scope == SIAtomicScope::WORKGROUP)) {
1453 // Same as GFX7 using agent scope.
1454 Scope = SIAtomicScope::AGENT;
1455 }
1456 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1457 // LDS memory operations.
1458 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1459 }
1460 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1461 IsCrossAddrSpaceOrdering, Pos, Order);
1462}
1463
1464bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1465 SIAtomicScope Scope,
1466 SIAtomicAddrSpace AddrSpace,
1467 Position Pos) const {
1468 if (!InsertCacheInv)
1469 return false;
1470
1471 bool Changed = false;
1472
1473 MachineBasicBlock &MBB = *MI->getParent();
1474 DebugLoc DL = MI->getDebugLoc();
1475
1476 if (Pos == Position::AFTER)
1477 ++MI;
1478
1479 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1480 switch (Scope) {
1481 case SIAtomicScope::SYSTEM:
1482 // Ensures that following loads will not see stale remote VMEM data or
1483 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1484 // CC will never be stale due to the local memory probes.
1485 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1486 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1487 // hardware does not reorder memory operations by the same wave with
1488 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1489 // remove any cache lines of earlier writes by the same wave and ensures
1490 // later reads by the same wave will refetch the cache lines.
1491 Changed = true;
1492 break;
1493 case SIAtomicScope::AGENT:
1494 // Same as GFX7.
1495 break;
1496 case SIAtomicScope::WORKGROUP:
1497 // In threadgroup split mode the waves of a work-group can be executing on
1498 // different CUs. Therefore need to invalidate the L1 which is per CU.
1499 // Otherwise in non-threadgroup split mode all waves of a work-group are
1500 // on the same CU, and so the L1 does not need to be invalidated.
1501 if (ST.isTgSplitEnabled()) {
1502 // Same as GFX7 using agent scope.
1503 Scope = SIAtomicScope::AGENT;
1504 }
1505 break;
1506 case SIAtomicScope::WAVEFRONT:
1507 case SIAtomicScope::SINGLETHREAD:
1508 // Same as GFX7.
1509 break;
1510 default:
1511 llvm_unreachable("Unsupported synchronization scope");
1512 }
1513 }
1514
1515 /// The scratch address space does not need the global memory cache
1516 /// to be flushed as all memory operations by the same thread are
1517 /// sequentially consistent, and no other thread can access scratch
1518 /// memory.
1519
1520 /// Other address spaces do not have a cache.
1521
1522 if (Pos == Position::AFTER)
1523 --MI;
1524
1525 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1526
1527 return Changed;
1528}
1529
1530bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1531 SIAtomicScope Scope,
1532 SIAtomicAddrSpace AddrSpace,
1533 bool IsCrossAddrSpaceOrdering,
1534 Position Pos) const {
1535 bool Changed = false;
1536
1537 MachineBasicBlock &MBB = *MI->getParent();
1538 const DebugLoc &DL = MI->getDebugLoc();
1539
1540 if (Pos == Position::AFTER)
1541 ++MI;
1542
1543 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1544 switch (Scope) {
1545 case SIAtomicScope::SYSTEM:
1546 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1547 // hardware does not reorder memory operations by the same wave with
1548 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1549 // to initiate writeback of any dirty cache lines of earlier writes by the
1550 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1551 // writeback has completed.
1552 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1553 // Set SC bits to indicate system scope.
1555 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1556 // vmcnt(0)" needed by the "BUFFER_WBL2".
1557 Changed = true;
1558 break;
1559 case SIAtomicScope::AGENT:
1560 case SIAtomicScope::WORKGROUP:
1561 case SIAtomicScope::WAVEFRONT:
1562 case SIAtomicScope::SINGLETHREAD:
1563 // Same as GFX7.
1564 break;
1565 default:
1566 llvm_unreachable("Unsupported synchronization scope");
1567 }
1568 }
1569
1570 if (Pos == Position::AFTER)
1571 --MI;
1572
1573 Changed |=
1574 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1575 IsCrossAddrSpaceOrdering, Pos);
1576
1577 return Changed;
1578}
1579
1580bool SIGfx940CacheControl::enableLoadCacheBypass(
1581 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1582 SIAtomicAddrSpace AddrSpace) const {
1583 assert(MI->mayLoad() && !MI->mayStore());
1584 bool Changed = false;
1585
1586 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1587 switch (Scope) {
1588 case SIAtomicScope::SYSTEM:
1589 // Set SC bits to indicate system scope.
1590 Changed |= enableSC0Bit(MI);
1591 Changed |= enableSC1Bit(MI);
1592 break;
1593 case SIAtomicScope::AGENT:
1594 // Set SC bits to indicate agent scope.
1595 Changed |= enableSC1Bit(MI);
1596 break;
1597 case SIAtomicScope::WORKGROUP:
1598 // In threadgroup split mode the waves of a work-group can be executing on
1599 // different CUs. Therefore need to bypass the L1 which is per CU.
1600 // Otherwise in non-threadgroup split mode all waves of a work-group are
1601 // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1602 // bits to indicate work-group scope will do this automatically.
1603 Changed |= enableSC0Bit(MI);
1604 break;
1605 case SIAtomicScope::WAVEFRONT:
1606 case SIAtomicScope::SINGLETHREAD:
1607 // Leave SC bits unset to indicate wavefront scope.
1608 break;
1609 default:
1610 llvm_unreachable("Unsupported synchronization scope");
1611 }
1612 }
1613
1614 /// The scratch address space does not need the global memory caches
1615 /// to be bypassed as all memory operations by the same thread are
1616 /// sequentially consistent, and no other thread can access scratch
1617 /// memory.
1618
1619 /// Other address spaces do not have a cache.
1620
1621 return Changed;
1622}
1623
1624bool SIGfx940CacheControl::enableStoreCacheBypass(
1626 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1627 assert(!MI->mayLoad() && MI->mayStore());
1628 bool Changed = false;
1629
1630 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1631 switch (Scope) {
1632 case SIAtomicScope::SYSTEM:
1633 // Set SC bits to indicate system scope.
1634 Changed |= enableSC0Bit(MI);
1635 Changed |= enableSC1Bit(MI);
1636 break;
1637 case SIAtomicScope::AGENT:
1638 // Set SC bits to indicate agent scope.
1639 Changed |= enableSC1Bit(MI);
1640 break;
1641 case SIAtomicScope::WORKGROUP:
1642 // Set SC bits to indicate workgroup scope.
1643 Changed |= enableSC0Bit(MI);
1644 break;
1645 case SIAtomicScope::WAVEFRONT:
1646 case SIAtomicScope::SINGLETHREAD:
1647 // Leave SC bits unset to indicate wavefront scope.
1648 break;
1649 default:
1650 llvm_unreachable("Unsupported synchronization scope");
1651 }
1652 }
1653
1654 /// The scratch address space does not need the global memory caches
1655 /// to be bypassed as all memory operations by the same thread are
1656 /// sequentially consistent, and no other thread can access scratch
1657 /// memory.
1658
1659 /// Other address spaces do not have a cache.
1660
1661 return Changed;
1662}
1663
1664bool SIGfx940CacheControl::enableRMWCacheBypass(
1665 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1666 SIAtomicAddrSpace AddrSpace) const {
1667 assert(MI->mayLoad() && MI->mayStore());
1668 bool Changed = false;
1669
1670 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1671 switch (Scope) {
1672 case SIAtomicScope::SYSTEM:
1673 // Set SC1 bit to indicate system scope.
1674 Changed |= enableSC1Bit(MI);
1675 break;
1676 case SIAtomicScope::AGENT:
1677 case SIAtomicScope::WORKGROUP:
1678 case SIAtomicScope::WAVEFRONT:
1679 case SIAtomicScope::SINGLETHREAD:
1680 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1681 // to indicate system or agent scope. The SC0 bit is used to indicate if
1682 // they are return or no-return. Leave SC1 bit unset to indicate agent
1683 // scope.
1684 break;
1685 default:
1686 llvm_unreachable("Unsupported synchronization scope");
1687 }
1688 }
1689
1690 return Changed;
1691}
1692
1693bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1694 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1695 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1696 // Only handle load and store, not atomic read-modify-write insructions. The
1697 // latter use glc to indicate if the atomic returns a result and so must not
1698 // be used for cache control.
1699 assert(MI->mayLoad() ^ MI->mayStore());
1700
1701 // Only update load and store, not LLVM IR atomic read-modify-write
1702 // instructions. The latter are always marked as volatile so cannot sensibly
1703 // handle it as do not want to pessimize all atomics. Also they do not support
1704 // the nontemporal attribute.
1705 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1706
1707 bool Changed = false;
1708
1709 if (IsVolatile) {
1710 // Set SC bits to indicate system scope.
1711 Changed |= enableSC0Bit(MI);
1712 Changed |= enableSC1Bit(MI);
1713
1714 // Ensure operation has completed at system scope to cause all volatile
1715 // operations to be visible outside the program in a global order. Do not
1716 // request cross address space as only the global address space can be
1717 // observable outside the program, so no need to cause a waitcnt for LDS
1718 // address space operations.
1719 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1720 Position::AFTER, AtomicOrdering::Unordered);
1721
1722 return Changed;
1723 }
1724
1725 if (IsNonTemporal) {
1726 Changed |= enableNTBit(MI);
1727 return Changed;
1728 }
1729
1730 return Changed;
1731}
1732
1733bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1734 SIAtomicScope Scope,
1735 SIAtomicAddrSpace AddrSpace,
1736 Position Pos) const {
1737 if (!InsertCacheInv)
1738 return false;
1739
1740 bool Changed = false;
1741
1742 MachineBasicBlock &MBB = *MI->getParent();
1743 DebugLoc DL = MI->getDebugLoc();
1744
1745 if (Pos == Position::AFTER)
1746 ++MI;
1747
1748 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1749 switch (Scope) {
1750 case SIAtomicScope::SYSTEM:
1751 // Ensures that following loads will not see stale remote VMEM data or
1752 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1753 // CC will never be stale due to the local memory probes.
1754 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1755 // Set SC bits to indicate system scope.
1757 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1758 // hardware does not reorder memory operations by the same wave with
1759 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1760 // remove any cache lines of earlier writes by the same wave and ensures
1761 // later reads by the same wave will refetch the cache lines.
1762 Changed = true;
1763 break;
1764 case SIAtomicScope::AGENT:
1765 // Ensures that following loads will not see stale remote date or local
1766 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1767 // due to the memory probes.
1768 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1769 // Set SC bits to indicate agent scope.
1771 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1772 // does not reorder memory operations with respect to preceeding buffer
1773 // invalidate. The invalidate is guaranteed to remove any cache lines of
1774 // earlier writes and ensures later writes will refetch the cache lines.
1775 Changed = true;
1776 break;
1777 case SIAtomicScope::WORKGROUP:
1778 // In threadgroup split mode the waves of a work-group can be executing on
1779 // different CUs. Therefore need to invalidate the L1 which is per CU.
1780 // Otherwise in non-threadgroup split mode all waves of a work-group are
1781 // on the same CU, and so the L1 does not need to be invalidated.
1782 if (ST.isTgSplitEnabled()) {
1783 // Ensures L1 is invalidated if in threadgroup split mode. In
1784 // non-threadgroup split mode it is a NOP, but no point generating it in
1785 // that case if know not in that mode.
1786 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1787 // Set SC bits to indicate work-group scope.
1789 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1790 // does not reorder memory operations with respect to preceding buffer
1791 // invalidate. The invalidate is guaranteed to remove any cache lines of
1792 // earlier writes and ensures later writes will refetch the cache lines.
1793 Changed = true;
1794 }
1795 break;
1796 case SIAtomicScope::WAVEFRONT:
1797 case SIAtomicScope::SINGLETHREAD:
1798 // Could generate "BUFFER_INV" but it would do nothing as there are no
1799 // caches to invalidate.
1800 break;
1801 default:
1802 llvm_unreachable("Unsupported synchronization scope");
1803 }
1804 }
1805
1806 /// The scratch address space does not need the global memory cache
1807 /// to be flushed as all memory operations by the same thread are
1808 /// sequentially consistent, and no other thread can access scratch
1809 /// memory.
1810
1811 /// Other address spaces do not have a cache.
1812
1813 if (Pos == Position::AFTER)
1814 --MI;
1815
1816 return Changed;
1817}
1818
1819bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1820 SIAtomicScope Scope,
1821 SIAtomicAddrSpace AddrSpace,
1822 bool IsCrossAddrSpaceOrdering,
1823 Position Pos) const {
1824 bool Changed = false;
1825
1826 MachineBasicBlock &MBB = *MI->getParent();
1827 DebugLoc DL = MI->getDebugLoc();
1828
1829 if (Pos == Position::AFTER)
1830 ++MI;
1831
1832 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1833 switch (Scope) {
1834 case SIAtomicScope::SYSTEM:
1835 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1836 // hardware does not reorder memory operations by the same wave with
1837 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1838 // to initiate writeback of any dirty cache lines of earlier writes by the
1839 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1840 // writeback has completed.
1841 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1842 // Set SC bits to indicate system scope.
1844 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1845 // SIAtomicScope::SYSTEM, the following insertWait will generate the
1846 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1847 Changed = true;
1848 break;
1849 case SIAtomicScope::AGENT:
1850 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1851 // Set SC bits to indicate agent scope.
1853
1854 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1855 // SIAtomicScope::AGENT, the following insertWait will generate the
1856 // required "S_WAITCNT vmcnt(0)".
1857 Changed = true;
1858 break;
1859 case SIAtomicScope::WORKGROUP:
1860 case SIAtomicScope::WAVEFRONT:
1861 case SIAtomicScope::SINGLETHREAD:
1862 // Do not generate "BUFFER_WBL2" as there are no caches it would
1863 // writeback, and would require an otherwise unnecessary
1864 // "S_WAITCNT vmcnt(0)".
1865 break;
1866 default:
1867 llvm_unreachable("Unsupported synchronization scope");
1868 }
1869 }
1870
1871 if (Pos == Position::AFTER)
1872 --MI;
1873
1874 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1875 // S_WAITCNT needed.
1876 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1877 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1878
1879 return Changed;
1880}
1881
1882bool SIGfx10CacheControl::enableLoadCacheBypass(
1884 SIAtomicScope Scope,
1885 SIAtomicAddrSpace AddrSpace) const {
1886 assert(MI->mayLoad() && !MI->mayStore());
1887 bool Changed = false;
1888
1889 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1890 switch (Scope) {
1891 case SIAtomicScope::SYSTEM:
1892 case SIAtomicScope::AGENT:
1893 // Set the L0 and L1 cache policies to MISS_EVICT.
1894 // Note: there is no L2 cache coherent bypass control at the ISA level.
1895 Changed |= enableGLCBit(MI);
1896 Changed |= enableDLCBit(MI);
1897 break;
1898 case SIAtomicScope::WORKGROUP:
1899 // In WGP mode the waves of a work-group can be executing on either CU of
1900 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1901 // CU mode all waves of a work-group are on the same CU, and so the L0
1902 // does not need to be bypassed.
1903 if (!ST.isCuModeEnabled())
1904 Changed |= enableGLCBit(MI);
1905 break;
1906 case SIAtomicScope::WAVEFRONT:
1907 case SIAtomicScope::SINGLETHREAD:
1908 // No cache to bypass.
1909 break;
1910 default:
1911 llvm_unreachable("Unsupported synchronization scope");
1912 }
1913 }
1914
1915 /// The scratch address space does not need the global memory caches
1916 /// to be bypassed as all memory operations by the same thread are
1917 /// sequentially consistent, and no other thread can access scratch
1918 /// memory.
1919
1920 /// Other address spaces do not have a cache.
1921
1922 return Changed;
1923}
1924
1925bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1926 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1927 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1928
1929 // Only handle load and store, not atomic read-modify-write insructions. The
1930 // latter use glc to indicate if the atomic returns a result and so must not
1931 // be used for cache control.
1932 assert(MI->mayLoad() ^ MI->mayStore());
1933
1934 // Only update load and store, not LLVM IR atomic read-modify-write
1935 // instructions. The latter are always marked as volatile so cannot sensibly
1936 // handle it as do not want to pessimize all atomics. Also they do not support
1937 // the nontemporal attribute.
1938 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1939
1940 bool Changed = false;
1941
1942 if (IsVolatile) {
1943 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1944 // and MISS_LRU for store instructions.
1945 // Note: there is no L2 cache coherent bypass control at the ISA level.
1946 if (Op == SIMemOp::LOAD) {
1947 Changed |= enableGLCBit(MI);
1948 Changed |= enableDLCBit(MI);
1949 }
1950
1951 // Ensure operation has completed at system scope to cause all volatile
1952 // operations to be visible outside the program in a global order. Do not
1953 // request cross address space as only the global address space can be
1954 // observable outside the program, so no need to cause a waitcnt for LDS
1955 // address space operations.
1956 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1957 Position::AFTER, AtomicOrdering::Unordered);
1958 return Changed;
1959 }
1960
1961 if (IsNonTemporal) {
1962 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1963 // and L2 cache policy to STREAM.
1964 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1965 // to MISS_EVICT and the L2 cache policy to STREAM.
1966 if (Op == SIMemOp::STORE)
1967 Changed |= enableGLCBit(MI);
1968 Changed |= enableSLCBit(MI);
1969
1970 return Changed;
1971 }
1972
1973 return Changed;
1974}
1975
1976bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1977 SIAtomicScope Scope,
1978 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1979 bool IsCrossAddrSpaceOrdering,
1980 Position Pos, AtomicOrdering Order) const {
1981 bool Changed = false;
1982
1983 MachineBasicBlock &MBB = *MI->getParent();
1984 DebugLoc DL = MI->getDebugLoc();
1985
1986 if (Pos == Position::AFTER)
1987 ++MI;
1988
1989 bool VMCnt = false;
1990 bool VSCnt = false;
1991 bool LGKMCnt = false;
1992
1993 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1994 SIAtomicAddrSpace::NONE) {
1995 switch (Scope) {
1996 case SIAtomicScope::SYSTEM:
1997 case SIAtomicScope::AGENT:
1998 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1999 VMCnt |= true;
2000 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2001 VSCnt |= true;
2002 break;
2003 case SIAtomicScope::WORKGROUP:
2004 // In WGP mode the waves of a work-group can be executing on either CU of
2005 // the WGP. Therefore need to wait for operations to complete to ensure
2006 // they are visible to waves in the other CU as the L0 is per CU.
2007 // Otherwise in CU mode and all waves of a work-group are on the same CU
2008 // which shares the same L0.
2009 if (!ST.isCuModeEnabled()) {
2010 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2011 VMCnt |= true;
2012 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2013 VSCnt |= true;
2014 }
2015 break;
2016 case SIAtomicScope::WAVEFRONT:
2017 case SIAtomicScope::SINGLETHREAD:
2018 // The L0 cache keeps all memory operations in order for
2019 // work-items in the same wavefront.
2020 break;
2021 default:
2022 llvm_unreachable("Unsupported synchronization scope");
2023 }
2024 }
2025
2026 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2027 switch (Scope) {
2028 case SIAtomicScope::SYSTEM:
2029 case SIAtomicScope::AGENT:
2030 case SIAtomicScope::WORKGROUP:
2031 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2032 // not needed as LDS operations for all waves are executed in a total
2033 // global ordering as observed by all waves. Required if also
2034 // synchronizing with global/GDS memory as LDS operations could be
2035 // reordered with respect to later global/GDS memory operations of the
2036 // same wave.
2037 LGKMCnt |= IsCrossAddrSpaceOrdering;
2038 break;
2039 case SIAtomicScope::WAVEFRONT:
2040 case SIAtomicScope::SINGLETHREAD:
2041 // The LDS keeps all memory operations in order for
2042 // the same wavefront.
2043 break;
2044 default:
2045 llvm_unreachable("Unsupported synchronization scope");
2046 }
2047 }
2048
2049 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
2050 switch (Scope) {
2051 case SIAtomicScope::SYSTEM:
2052 case SIAtomicScope::AGENT:
2053 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
2054 // is not needed as GDS operations for all waves are executed in a total
2055 // global ordering as observed by all waves. Required if also
2056 // synchronizing with global/LDS memory as GDS operations could be
2057 // reordered with respect to later global/LDS memory operations of the
2058 // same wave.
2059 LGKMCnt |= IsCrossAddrSpaceOrdering;
2060 break;
2061 case SIAtomicScope::WORKGROUP:
2062 case SIAtomicScope::WAVEFRONT:
2063 case SIAtomicScope::SINGLETHREAD:
2064 // The GDS keeps all memory operations in order for
2065 // the same work-group.
2066 break;
2067 default:
2068 llvm_unreachable("Unsupported synchronization scope");
2069 }
2070 }
2071
2072 if (VMCnt || LGKMCnt) {
2073 unsigned WaitCntImmediate =
2075 VMCnt ? 0 : getVmcntBitMask(IV),
2077 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2078 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2079 .addImm(WaitCntImmediate);
2080 Changed = true;
2081 }
2082
2083 if (VSCnt) {
2084 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2085 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2086 .addImm(0);
2087 Changed = true;
2088 }
2089
2090 if (Pos == Position::AFTER)
2091 --MI;
2092
2093 return Changed;
2094}
2095
2096bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2097 SIAtomicScope Scope,
2098 SIAtomicAddrSpace AddrSpace,
2099 Position Pos) const {
2100 if (!InsertCacheInv)
2101 return false;
2102
2103 bool Changed = false;
2104
2105 MachineBasicBlock &MBB = *MI->getParent();
2106 DebugLoc DL = MI->getDebugLoc();
2107
2108 if (Pos == Position::AFTER)
2109 ++MI;
2110
2111 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2112 switch (Scope) {
2113 case SIAtomicScope::SYSTEM:
2114 case SIAtomicScope::AGENT:
2115 // The order of invalidates matter here. We must invalidate "outer in"
2116 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2117 // invalidated.
2118 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2119 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2120 Changed = true;
2121 break;
2122 case SIAtomicScope::WORKGROUP:
2123 // In WGP mode the waves of a work-group can be executing on either CU of
2124 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2125 // in CU mode and all waves of a work-group are on the same CU, and so the
2126 // L0 does not need to be invalidated.
2127 if (!ST.isCuModeEnabled()) {
2128 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2129 Changed = true;
2130 }
2131 break;
2132 case SIAtomicScope::WAVEFRONT:
2133 case SIAtomicScope::SINGLETHREAD:
2134 // No cache to invalidate.
2135 break;
2136 default:
2137 llvm_unreachable("Unsupported synchronization scope");
2138 }
2139 }
2140
2141 /// The scratch address space does not need the global memory cache
2142 /// to be flushed as all memory operations by the same thread are
2143 /// sequentially consistent, and no other thread can access scratch
2144 /// memory.
2145
2146 /// Other address spaces do not have a cache.
2147
2148 if (Pos == Position::AFTER)
2149 --MI;
2150
2151 return Changed;
2152}
2153
2154bool SIGfx11CacheControl::enableLoadCacheBypass(
2155 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2156 SIAtomicAddrSpace AddrSpace) const {
2157 assert(MI->mayLoad() && !MI->mayStore());
2158 bool Changed = false;
2159
2160 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2161 switch (Scope) {
2162 case SIAtomicScope::SYSTEM:
2163 case SIAtomicScope::AGENT:
2164 // Set the L0 and L1 cache policies to MISS_EVICT.
2165 // Note: there is no L2 cache coherent bypass control at the ISA level.
2166 Changed |= enableGLCBit(MI);
2167 break;
2168 case SIAtomicScope::WORKGROUP:
2169 // In WGP mode the waves of a work-group can be executing on either CU of
2170 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2171 // CU mode all waves of a work-group are on the same CU, and so the L0
2172 // does not need to be bypassed.
2173 if (!ST.isCuModeEnabled())
2174 Changed |= enableGLCBit(MI);
2175 break;
2176 case SIAtomicScope::WAVEFRONT:
2177 case SIAtomicScope::SINGLETHREAD:
2178 // No cache to bypass.
2179 break;
2180 default:
2181 llvm_unreachable("Unsupported synchronization scope");
2182 }
2183 }
2184
2185 /// The scratch address space does not need the global memory caches
2186 /// to be bypassed as all memory operations by the same thread are
2187 /// sequentially consistent, and no other thread can access scratch
2188 /// memory.
2189
2190 /// Other address spaces do not have a cache.
2191
2192 return Changed;
2193}
2194
2195bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2196 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2197 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2198
2199 // Only handle load and store, not atomic read-modify-write insructions. The
2200 // latter use glc to indicate if the atomic returns a result and so must not
2201 // be used for cache control.
2202 assert(MI->mayLoad() ^ MI->mayStore());
2203
2204 // Only update load and store, not LLVM IR atomic read-modify-write
2205 // instructions. The latter are always marked as volatile so cannot sensibly
2206 // handle it as do not want to pessimize all atomics. Also they do not support
2207 // the nontemporal attribute.
2208 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2209
2210 bool Changed = false;
2211
2212 if (IsVolatile) {
2213 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2214 // and MISS_LRU for store instructions.
2215 // Note: there is no L2 cache coherent bypass control at the ISA level.
2216 if (Op == SIMemOp::LOAD)
2217 Changed |= enableGLCBit(MI);
2218
2219 // Set MALL NOALLOC for load and store instructions.
2220 Changed |= enableDLCBit(MI);
2221
2222 // Ensure operation has completed at system scope to cause all volatile
2223 // operations to be visible outside the program in a global order. Do not
2224 // request cross address space as only the global address space can be
2225 // observable outside the program, so no need to cause a waitcnt for LDS
2226 // address space operations.
2227 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2228 Position::AFTER, AtomicOrdering::Unordered);
2229 return Changed;
2230 }
2231
2232 if (IsNonTemporal) {
2233 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2234 // and L2 cache policy to STREAM.
2235 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2236 // to MISS_EVICT and the L2 cache policy to STREAM.
2237 if (Op == SIMemOp::STORE)
2238 Changed |= enableGLCBit(MI);
2239 Changed |= enableSLCBit(MI);
2240
2241 // Set MALL NOALLOC for load and store instructions.
2242 Changed |= enableDLCBit(MI);
2243 return Changed;
2244 }
2245
2246 return Changed;
2247}
2248
2249bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2250 AMDGPU::CPol::CPol Value) const {
2251 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2252 if (!CPol)
2253 return false;
2254
2256 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2257 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2258 return true;
2259 }
2260
2261 return false;
2262}
2263
2264bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2265 AMDGPU::CPol::CPol Value) const {
2266 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2267 if (!CPol)
2268 return false;
2269
2270 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2271 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2272 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2273 return true;
2274 }
2275
2276 return false;
2277}
2278
2279bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2280 const MachineBasicBlock::iterator MI) const {
2281 // TODO: implement flag for frontend to give us a hint not to insert waits.
2282
2283 MachineBasicBlock &MBB = *MI->getParent();
2284 const DebugLoc &DL = MI->getDebugLoc();
2285
2286 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2287 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2288 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2289 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2290 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2291
2292 return true;
2293}
2294
2295bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2296 SIAtomicScope Scope,
2297 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2298 bool IsCrossAddrSpaceOrdering,
2299 Position Pos, AtomicOrdering Order) const {
2300 bool Changed = false;
2301
2302 MachineBasicBlock &MBB = *MI->getParent();
2303 DebugLoc DL = MI->getDebugLoc();
2304
2305 bool LOADCnt = false;
2306 bool DSCnt = false;
2307 bool STORECnt = false;
2308
2309 if (Pos == Position::AFTER)
2310 ++MI;
2311
2312 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2313 SIAtomicAddrSpace::NONE) {
2314 switch (Scope) {
2315 case SIAtomicScope::SYSTEM:
2316 case SIAtomicScope::AGENT:
2317 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2318 LOADCnt |= true;
2319 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2320 STORECnt |= true;
2321 break;
2322 case SIAtomicScope::WORKGROUP:
2323 // In WGP mode the waves of a work-group can be executing on either CU of
2324 // the WGP. Therefore need to wait for operations to complete to ensure
2325 // they are visible to waves in the other CU as the L0 is per CU.
2326 // Otherwise in CU mode and all waves of a work-group are on the same CU
2327 // which shares the same L0.
2328 if (!ST.isCuModeEnabled()) {
2329 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2330 LOADCnt |= true;
2331 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2332 STORECnt |= true;
2333 }
2334 break;
2335 case SIAtomicScope::WAVEFRONT:
2336 case SIAtomicScope::SINGLETHREAD:
2337 // The L0 cache keeps all memory operations in order for
2338 // work-items in the same wavefront.
2339 break;
2340 default:
2341 llvm_unreachable("Unsupported synchronization scope");
2342 }
2343 }
2344
2345 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2346 switch (Scope) {
2347 case SIAtomicScope::SYSTEM:
2348 case SIAtomicScope::AGENT:
2349 case SIAtomicScope::WORKGROUP:
2350 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2351 // not needed as LDS operations for all waves are executed in a total
2352 // global ordering as observed by all waves. Required if also
2353 // synchronizing with global/GDS memory as LDS operations could be
2354 // reordered with respect to later global/GDS memory operations of the
2355 // same wave.
2356 DSCnt |= IsCrossAddrSpaceOrdering;
2357 break;
2358 case SIAtomicScope::WAVEFRONT:
2359 case SIAtomicScope::SINGLETHREAD:
2360 // The LDS keeps all memory operations in order for
2361 // the same wavefront.
2362 break;
2363 default:
2364 llvm_unreachable("Unsupported synchronization scope");
2365 }
2366 }
2367
2368 if (LOADCnt) {
2369 // Acquire sequences only need to wait on the previous atomic operation.
2370 // e.g. a typical sequence looks like
2371 // atomic load
2372 // (wait)
2373 // global_inv
2374 //
2375 // We do not have BVH or SAMPLE atomics, so the atomic load is always going
2376 // to be tracked using loadcnt.
2377 //
2378 // This also applies to fences. Fences cannot pair with an instruction
2379 // tracked with bvh/samplecnt as we don't have any atomics that do that.
2380 if (Order != AtomicOrdering::Acquire) {
2381 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2382 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2383 }
2384 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2385 Changed = true;
2386 }
2387
2388 if (STORECnt) {
2389 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2390 Changed = true;
2391 }
2392
2393 if (DSCnt) {
2394 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2395 Changed = true;
2396 }
2397
2398 if (Pos == Position::AFTER)
2399 --MI;
2400
2401 return Changed;
2402}
2403
2404bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2405 SIAtomicScope Scope,
2406 SIAtomicAddrSpace AddrSpace,
2407 Position Pos) const {
2408 if (!InsertCacheInv)
2409 return false;
2410
2411 MachineBasicBlock &MBB = *MI->getParent();
2412 DebugLoc DL = MI->getDebugLoc();
2413
2414 /// The scratch address space does not need the global memory cache
2415 /// to be flushed as all memory operations by the same thread are
2416 /// sequentially consistent, and no other thread can access scratch
2417 /// memory.
2418
2419 /// Other address spaces do not have a cache.
2420 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2421 return false;
2422
2424 switch (Scope) {
2425 case SIAtomicScope::SYSTEM:
2426 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2427 break;
2428 case SIAtomicScope::AGENT:
2429 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2430 break;
2431 case SIAtomicScope::WORKGROUP:
2432 // In WGP mode the waves of a work-group can be executing on either CU of
2433 // the WGP. Therefore we need to invalidate the L0 which is per CU.
2434 // Otherwise in CU mode all waves of a work-group are on the same CU, and so
2435 // the L0 does not need to be invalidated.
2436 if (ST.isCuModeEnabled())
2437 return false;
2438
2439 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2440 break;
2441 case SIAtomicScope::WAVEFRONT:
2442 case SIAtomicScope::SINGLETHREAD:
2443 // No cache to invalidate.
2444 return false;
2445 default:
2446 llvm_unreachable("Unsupported synchronization scope");
2447 }
2448
2449 if (Pos == Position::AFTER)
2450 ++MI;
2451
2452 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2453
2454 if (Pos == Position::AFTER)
2455 --MI;
2456
2457 return true;
2458}
2459
2460bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2461 SIAtomicScope Scope,
2462 SIAtomicAddrSpace AddrSpace,
2463 bool IsCrossAddrSpaceOrdering,
2464 Position Pos) const {
2465 MachineBasicBlock &MBB = *MI->getParent();
2466 DebugLoc DL = MI->getDebugLoc();
2467
2468 // The scratch address space does not need the global memory cache
2469 // writeback as all memory operations by the same thread are
2470 // sequentially consistent, and no other thread can access scratch
2471 // memory.
2472
2473 // Other address spaces do not have a cache.
2474 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2475 return false;
2476
2477 if (Pos == Position::AFTER)
2478 ++MI;
2479
2480 // global_wb is only necessary at system scope for gfx120x targets.
2481 //
2482 // Emitting it for lower scopes is a slow no-op, so we omit it
2483 // for performance.
2484 switch (Scope) {
2485 case SIAtomicScope::SYSTEM:
2486 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2488 break;
2489 case SIAtomicScope::AGENT:
2490 case SIAtomicScope::WORKGROUP:
2491 // No WB necessary, but we still have to wait.
2492 break;
2493 case SIAtomicScope::WAVEFRONT:
2494 case SIAtomicScope::SINGLETHREAD:
2495 // No WB or wait necessary here.
2496 return false;
2497 default:
2498 llvm_unreachable("Unsupported synchronization scope");
2499 }
2500
2501 if (Pos == Position::AFTER)
2502 --MI;
2503
2504 // We always have to wait for previous memory operations (load/store) to
2505 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2506 // we of course need to wait for that as well.
2507 insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2508 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
2509
2510 return true;
2511}
2512
2513bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2514 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2515 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2516
2517 // Only handle load and store, not atomic read-modify-write instructions.
2518 assert(MI->mayLoad() ^ MI->mayStore());
2519
2520 // Only update load and store, not LLVM IR atomic read-modify-write
2521 // instructions. The latter are always marked as volatile so cannot sensibly
2522 // handle it as do not want to pessimize all atomics. Also they do not support
2523 // the nontemporal attribute.
2524 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2525
2526 bool Changed = false;
2527
2528 if (IsLastUse) {
2529 // Set last-use hint.
2530 Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2531 } else if (IsNonTemporal) {
2532 // Set non-temporal hint for all cache levels.
2533 Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2534 }
2535
2536 if (IsVolatile) {
2537 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2538
2539 if (Op == SIMemOp::STORE)
2540 Changed |= insertWaitsBeforeSystemScopeStore(MI);
2541
2542 // Ensure operation has completed at system scope to cause all volatile
2543 // operations to be visible outside the program in a global order. Do not
2544 // request cross address space as only the global address space can be
2545 // observable outside the program, so no need to cause a waitcnt for LDS
2546 // address space operations.
2547 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2548 Position::AFTER, AtomicOrdering::Unordered);
2549 }
2550
2551 return Changed;
2552}
2553
2554bool SIGfx12CacheControl::expandSystemScopeStore(
2556 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2557 if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2558 return insertWaitsBeforeSystemScopeStore(MI);
2559
2560 return false;
2561}
2562
2563bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2564 SIAtomicScope Scope,
2565 SIAtomicAddrSpace AddrSpace) const {
2566 bool Changed = false;
2567
2568 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2569 switch (Scope) {
2570 case SIAtomicScope::SYSTEM:
2571 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2572 break;
2573 case SIAtomicScope::AGENT:
2574 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2575 break;
2576 case SIAtomicScope::WORKGROUP:
2577 // In workgroup mode, SCOPE_SE is needed as waves can executes on
2578 // different CUs that access different L0s.
2579 if (!ST.isCuModeEnabled())
2580 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2581 break;
2582 case SIAtomicScope::WAVEFRONT:
2583 case SIAtomicScope::SINGLETHREAD:
2584 // No cache to bypass.
2585 break;
2586 default:
2587 llvm_unreachable("Unsupported synchronization scope");
2588 }
2589 }
2590
2591 // The scratch address space does not need the global memory caches
2592 // to be bypassed as all memory operations by the same thread are
2593 // sequentially consistent, and no other thread can access scratch
2594 // memory.
2595
2596 // Other address spaces do not have a cache.
2597
2598 return Changed;
2599}
2600
2601bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2602 if (AtomicPseudoMIs.empty())
2603 return false;
2604
2605 for (auto &MI : AtomicPseudoMIs)
2606 MI->eraseFromParent();
2607
2608 AtomicPseudoMIs.clear();
2609 return true;
2610}
2611
2612bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2614 assert(MI->mayLoad() && !MI->mayStore());
2615
2616 bool Changed = false;
2617
2618 if (MOI.isAtomic()) {
2619 const AtomicOrdering Order = MOI.getOrdering();
2620 if (Order == AtomicOrdering::Monotonic ||
2621 Order == AtomicOrdering::Acquire ||
2622 Order == AtomicOrdering::SequentiallyConsistent) {
2623 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2624 MOI.getOrderingAddrSpace());
2625 }
2626
2627 if (Order == AtomicOrdering::SequentiallyConsistent)
2628 Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2629 SIMemOp::LOAD | SIMemOp::STORE,
2630 MOI.getIsCrossAddressSpaceOrdering(),
2631 Position::BEFORE, Order);
2632
2633 if (Order == AtomicOrdering::Acquire ||
2634 Order == AtomicOrdering::SequentiallyConsistent) {
2635 Changed |= CC->insertWait(
2636 MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD,
2637 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2638 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2639 MOI.getOrderingAddrSpace(),
2640 Position::AFTER);
2641 }
2642
2643 return Changed;
2644 }
2645
2646 // Atomic instructions already bypass caches to the scope specified by the
2647 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2648 // instructions need additional treatment.
2649 Changed |= CC->enableVolatileAndOrNonTemporal(
2650 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2651 MOI.isNonTemporal(), MOI.isLastUse());
2652
2653 return Changed;
2654}
2655
/// Applies the memory-model legalization for a store instruction.
///
/// Atomic stores get the store cache bypass for monotonic, release and
/// sequentially consistent orderings, and additionally a release inserted
/// before the instruction for release and sequentially consistent orderings.
/// Non-atomic stores get the volatile/nontemporal handling followed by the
/// system-scope store expansion.
///
/// \param MOI Memory-operation info describing the store (atomicity,
///            ordering, scope, address spaces, volatile/nontemporal flags).
/// \returns true if any of the cache-control hooks reported a change.
2656bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
     // Precondition: MI must write memory and must not read it.
2658 assert(!MI->mayLoad() && MI->mayStore());
2659
2660 bool Changed = false;
2661
2662 if (MOI.isAtomic()) {
     // Cache bypass is keyed on the ordering address space, not on the
     // instruction's own address space.
2663 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2664 MOI.getOrdering() == AtomicOrdering::Release ||
2665 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2666 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2667 MOI.getOrderingAddrSpace());
2668 }
2669
     // Release semantics: the release sequence is placed before the store.
2670 if (MOI.getOrdering() == AtomicOrdering::Release ||
2671 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2672 Changed |= CC->insertRelease(MI, MOI.getScope(),
2673 MOI.getOrderingAddrSpace(),
2674 MOI.getIsCrossAddressSpaceOrdering(),
2675 Position::BEFORE);
2676
     // Atomic stores are fully handled here; the non-atomic path below is
     // never applied to them.
2677 return Changed;
2678 }
2679
2680 // Atomic instructions already bypass caches to the scope specified by the
2681 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2682 // need additional treatment.
2683 Changed |= CC->enableVolatileAndOrNonTemporal(
2684 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2685 MOI.isNonTemporal());
2686
2687 // GFX12 specific, scope(desired coherence domain in cache hierarchy) is
2688 // instruction field, do not confuse it with atomic scope.
2689 Changed |= CC->expandSystemScopeStore(MI);
2690 return Changed;
2691}
2692
/// Expands an ATOMIC_FENCE pseudo instruction into wait / release / acquire
/// sequences, all inserted before the fence.
///
/// The fence is always recorded in AtomicPseudoMIs. The address space that is
/// fenced is the ordering address space of \p MOI as refined by
/// getFenceAddrSpaceMMRA().
///
/// \param MOI Memory-operation info describing the fence.
/// \returns true if any of the cache-control hooks reported a change.
2693bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2695 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2696
     // Recorded unconditionally, even if nothing is inserted below.
     // NOTE(review): presumably consumed by removeAtomicPseudoMIs() at the end
     // of the pass to delete the pseudo - confirm.
2697 AtomicPseudoMIs.push_back(MI);
2698 bool Changed = false;
2699
2700 // Refine fenced address space based on MMRAs.
2701 //
2702 // TODO: Should we support this MMRA on other atomic operations?
2703 auto OrderingAddrSpace =
2704 getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());
2705
2706 if (MOI.isAtomic()) {
2707 const AtomicOrdering Order = MOI.getOrdering();
     // A pure acquire fence takes no release below, so it gets an explicit
     // wait on both loads and stores here.
2708 if (Order == AtomicOrdering::Acquire) {
2709 Changed |= CC->insertWait(
2710 MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2711 MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order);
2712 }
2713
2714 if (Order == AtomicOrdering::Release ||
2715 Order == AtomicOrdering::AcquireRelease ||
2716 Order == AtomicOrdering::SequentiallyConsistent)
2717 /// TODO: This relies on a barrier always generating a waitcnt
2718 /// for LDS to ensure it is not reordered with the completion of
2719 /// the preceding LDS operations. If barrier had a memory
2720 /// ordering and memory scope, then library does not need to
2721 /// generate a fence. Could add support in this file for
2722 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2723 /// adding S_WAITCNT before a S_BARRIER.
2724 Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2725 MOI.getIsCrossAddressSpaceOrdering(),
2726 Position::BEFORE);
2727
2728 // TODO: If both release and invalidate are happening they could be combined
2729 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2730 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2731 // track cache invalidate and write back instructions.
2732
     // The acquire is also placed before the fence, after any release
     // inserted above.
2733 if (Order == AtomicOrdering::Acquire ||
2734 Order == AtomicOrdering::AcquireRelease ||
2735 Order == AtomicOrdering::SequentiallyConsistent)
2736 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2737 Position::BEFORE);
2738
2739 return Changed;
2740 }
2741
     // Non-atomic info: nothing is inserted, Changed is still false here.
2742 return Changed;
2743}
2744
/// Applies the memory-model legalization for an atomic cmpxchg or
/// read-modify-write instruction.
///
/// For atomic operations this enables the RMW cache bypass, inserts a release
/// before the instruction when the success ordering (or a sequentially
/// consistent failure ordering) requires it, and inserts a wait plus an
/// acquire after the instruction when the success or failure ordering
/// requires it. Non-atomic info results in no change.
///
/// \param MOI Memory-operation info describing the instruction, including
///            both its ordering and its failure ordering.
/// \returns true if any of the cache-control hooks reported a change.
2745bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
     // Precondition: MI both reads and writes memory.
2747 assert(MI->mayLoad() && MI->mayStore());
2748
2749 bool Changed = false;
2750
2751 if (MOI.isAtomic()) {
2752 const AtomicOrdering Order = MOI.getOrdering();
     // Unlike the release/acquire calls below, the bypass is keyed on the
     // instruction address space rather than the ordering address space.
2753 if (Order == AtomicOrdering::Monotonic ||
2754 Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
2755 Order == AtomicOrdering::AcquireRelease ||
2756 Order == AtomicOrdering::SequentiallyConsistent) {
2757 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2758 MOI.getInstrAddrSpace());
2759 }
2760
     // Release side: triggered by the success ordering, or by a sequentially
     // consistent failure ordering.
2761 if (Order == AtomicOrdering::Release ||
2762 Order == AtomicOrdering::AcquireRelease ||
2763 Order == AtomicOrdering::SequentiallyConsistent ||
2764 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2765 Changed |= CC->insertRelease(MI, MOI.getScope(),
2766 MOI.getOrderingAddrSpace(),
2767 MOI.getIsCrossAddressSpaceOrdering(),
2768 Position::BEFORE);
2769
     // Acquire side: triggered by either the success or the failure ordering.
2770 if (Order == AtomicOrdering::Acquire ||
2771 Order == AtomicOrdering::AcquireRelease ||
2772 Order == AtomicOrdering::SequentiallyConsistent ||
2773 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2774 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
     // The wait is on LOAD when isAtomicRet(*MI) holds and on STORE
     // otherwise; it is inserted first, then the acquire, both after MI.
2775 Changed |= CC->insertWait(
2776 MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2777 isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2778 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2779 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2780 MOI.getOrderingAddrSpace(),
2781 Position::AFTER);
2782 }
2783
2784 return Changed;
2785 }
2786
     // Non-atomic info: nothing is inserted, Changed is still false here.
2787 return Changed;
2788}
2789
/// Pass entry point: walks every instruction of \p MF and expands loads,
/// stores, atomic fences and atomic cmpxchg/RMW instructions according to the
/// memory model, then removes the recorded atomic pseudo instructions.
///
/// \param MF The machine function to legalize.
/// \returns true if any expansion or the final pseudo removal reported a
///          change.
2790bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2791 bool Changed = false;
2792
2793 const MachineModuleInfo &MMI =
2794 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2795
     // MOA classifies instructions into load/store/fence/RMW info; CC is the
     // cache-control implementation created for this function's subtarget.
2796 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
2797 CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2798
2799 for (auto &MBB : MF) {
     // MBB.end() is re-evaluated on every iteration; the body may erase a
     // bundle header and the expand* helpers take MI itself.
2800 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2801
2802 // Unbundle instructions after the post-RA scheduler.
2803 if (MI->isBundle() && MI->mayLoadOrStore()) {
2804 MachineBasicBlock::instr_iterator II(MI->getIterator());
     // Detach each bundled instruction from its predecessor and clear the
     // internal-read flag on all of its register operands.
2806 I != E && I->isBundledWithPred(); ++I) {
2807 I->unbundleFromPred();
2808 for (MachineOperand &MO : I->operands())
2809 if (MO.isReg())
2810 MO.setIsInternalRead(false);
2811 }
2812
     // Erase the bundle instruction and continue from II.
     // NOTE(review): confirm II has been advanced past the bundle header
     // before this erase, otherwise II would dangle.
2813 MI->eraseFromParent();
2814 MI = II->getIterator();
2815 }
2816
     // Only instructions flagged maybeAtomic are considered further.
2817 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2818 continue;
2819
     // Classification is tried in a fixed order (load, store, fence,
     // cmpxchg/RMW); the first query that yields info selects the expansion.
2820 if (const auto &MOI = MOA.getLoadInfo(MI))
2821 Changed |= expandLoad(*MOI, MI);
2822 else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2823 Changed |= expandStore(*MOI, MI);
     // Stores get an extra cache-control hook after the regular expansion.
2824 Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
2825 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2826 Changed |= expandAtomicFence(*MOI, MI);
2827 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2828 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2829 }
2830 }
2831
     // Runs once after all blocks have been visited.
2832 Changed |= removeAtomicPseudoMIs();
2833 return Changed;
2834}
2835
// Registers the pass under the DEBUG_TYPE argument string with PASS_NAME as
// its readable name; the trailing `false, false` are the macro's `cfg` and
// `analysis` parameters.
2836INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2837
// Storage for the pass ID, plus a reference to it published in namespace llvm
// as SIMemoryLegalizerID.
2838char SIMemoryLegalizer::ID = 0;
2839char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2840
// Factory body: returns a newly allocated SIMemoryLegalizer.
// NOTE(review): ownership presumably passes to the caller / pass manager -
// confirm.
2842 return new SIMemoryLegalizer();
2843}
static std::optional< LoadInfo > getLoadInfo(const MachineInstr &MI)
Provides AMDGPU specific target descriptions.
AMDGPU Machine Module Info.
AMDGPU promote alloca to vector or LDS
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Atomic ordering constants.
#define LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE() pulls the operator overloads used by LLVM_MARK_AS_BITMASK_EN...
Definition: BitmaskEnum.h:83
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
@ Default
Definition: DwarfDebug.cpp:87
std::string Name
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
This file provides utility for Memory Model Relaxation Annotations (MMRAs).
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:38
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static cl::opt< bool > AmdgcnSkipCacheInvalidations("amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, cl::desc("Use this to skip inserting cache invalidating instructions."))
#define PASS_NAME
#define DEBUG_TYPE
raw_pwrite_stream & OS
This file contains some functions that are useful when dealing with strings.
#define PASS_NAME
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Represent the analysis usage information of a pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:256
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
Helper class to manipulate !mmra metadata nodes.
Instructions::iterator instr_iterator
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
This class contains meta information specific to a module.
Ty & getObjFileInfo()
Keep track of various per-module pieces of information for backends that would like to do so.
MachineOperand class - Representation of each machine instruction operand.
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
static bool isAtomicRet(const MachineInstr &MI)
Definition: SIInstrInfo.h:679
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:853
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
LLVM Value Representation.
Definition: Value.h:74
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
unsigned getVmcntBitMask(const IsaVersion &Version)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ Undef
Value of the register doesn't matter.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition: LLVMContext.h:54
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
char & SIMemoryLegalizerID
@ NONE
Definition: Attributor.h:6475
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
AtomicOrdering getMergedAtomicOrdering(AtomicOrdering AO, AtomicOrdering Other)
Return a single atomic ordering that is at least as strong as both the AO and Other orderings for an ...
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ DS_Warning
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:47
FunctionPass * createSIMemoryLegalizerPass()
Instruction set architecture version.
Definition: TargetParser.h:130