1//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Memory legalizer - implements memory model. More information can be
11/// found here:
12/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13//
14//===----------------------------------------------------------------------===//
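// For example, on GFX6/GFX7 a sequentially consistent atomic load at agent
// scope from the global address space is legalized, roughly, by setting its
// GLC bit, inserting "S_WAITCNT vmcnt(0)" before and after it, and inserting
// a BUFFER_WBINVL1 (or BUFFER_WBINVL1_VOL) cache invalidation after it, per
// the per-subtarget rules implemented below.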
15
16#include "AMDGPU.h"
17#include "AMDGPUMachineModuleInfo.h"
18#include "GCNSubtarget.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "llvm/ADT/BitmaskEnum.h"
21#include "llvm/ADT/StringMap.h"
22#include "llvm/CodeGen/MachineBasicBlock.h"
23#include "llvm/CodeGen/MachineFunctionPass.h"
24#include "llvm/IR/DiagnosticInfo.h"
25#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
26#include "llvm/Support/AtomicOrdering.h"
27#include "llvm/TargetParser/TargetParser.h"
28
29using namespace llvm;
30using namespace llvm::AMDGPU;
31
32#define DEBUG_TYPE "si-memory-legalizer"
33#define PASS_NAME "SI Memory Legalizer"
34
36 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
37 cl::desc("Use this to skip inserting cache invalidating instructions."));
38
39namespace {
40
41LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
42
43/// Memory operation flags. Can be ORed together.
44enum class SIMemOp {
45 NONE = 0u,
46 LOAD = 1u << 0,
47 STORE = 1u << 1,
48 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
49};
50
51/// Position to insert a new instruction relative to an existing
52/// instruction.
53enum class Position {
54 BEFORE,
55 AFTER
56};
57
58/// The atomic synchronization scopes supported by the AMDGPU target.
59enum class SIAtomicScope {
60 NONE,
61 SINGLETHREAD,
62 WAVEFRONT,
63 WORKGROUP,
64 AGENT,
65 SYSTEM
66};
67
68/// The distinct address spaces supported by the AMDGPU target for
69/// atomic memory operation. Can be ORed together.
70enum class SIAtomicAddrSpace {
71 NONE = 0u,
72 GLOBAL = 1u << 0,
73 LDS = 1u << 1,
74 SCRATCH = 1u << 2,
75 GDS = 1u << 3,
76 OTHER = 1u << 4,
77
78 /// The address spaces that can be accessed by a FLAT instruction.
79 FLAT = GLOBAL | LDS | SCRATCH,
80
81 /// The address spaces that support atomic instructions.
82 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
83
84 /// All address spaces.
85 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
86
87 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
88};
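// For example, toSIAtomicAddrSpace() below maps AMDGPUAS::FLAT_ADDRESS to
// FLAT, i.e. GLOBAL | LDS | SCRATCH, since a flat access may touch any of
// those address spaces.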
89
90class SIMemOpInfo final {
91private:
92
93 friend class SIMemOpAccess;
94
95 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
96 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
97 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
98 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
99 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
100 bool IsCrossAddressSpaceOrdering = false;
101 bool IsVolatile = false;
102 bool IsNonTemporal = false;
103 bool IsLastUse = false;
104
105 SIMemOpInfo(
106 AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
107 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
108 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
109 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
110 bool IsCrossAddressSpaceOrdering = true,
111 AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
112 bool IsVolatile = false, bool IsNonTemporal = false,
113 bool IsLastUse = false)
114 : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
115 OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
116 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
117 IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
118 IsLastUse(IsLastUse) {
119
120 if (Ordering == AtomicOrdering::NotAtomic) {
121 assert(Scope == SIAtomicScope::NONE &&
122 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
123 !IsCrossAddressSpaceOrdering &&
124 FailureOrdering == AtomicOrdering::NotAtomic);
125 return;
126 }
127
128 assert(Scope != SIAtomicScope::NONE &&
129 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130 SIAtomicAddrSpace::NONE &&
131 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
132 SIAtomicAddrSpace::NONE);
133
134 // There is also no cross address space ordering if the ordering
135 // address space is the same as the instruction address space and
136 // only contains a single address space.
137 if ((OrderingAddrSpace == InstrAddrSpace) &&
138 isPowerOf2_32(uint32_t(InstrAddrSpace)))
139 this->IsCrossAddressSpaceOrdering = false;
140
141 // Limit the scope to the maximum supported by the instruction's address
142 // spaces.
143 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
144 SIAtomicAddrSpace::NONE) {
145 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
146 } else if ((InstrAddrSpace &
147 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
148 SIAtomicAddrSpace::NONE) {
149 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
150 } else if ((InstrAddrSpace &
151 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
152 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
153 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
154 }
155 }
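// For example, an atomic whose instruction address space is only LDS (or LDS
// plus SCRATCH) is clamped to WORKGROUP scope above, since LDS is only
// visible within a single work-group.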
156
157public:
158 /// \returns Atomic synchronization scope of the machine instruction used to
159 /// create this SIMemOpInfo.
160 SIAtomicScope getScope() const {
161 return Scope;
162 }
163
164 /// \returns Ordering constraint of the machine instruction used to
165 /// create this SIMemOpInfo.
166 AtomicOrdering getOrdering() const {
167 return Ordering;
168 }
169
170 /// \returns Failure ordering constraint of the machine instruction used to
171 /// create this SIMemOpInfo.
172 AtomicOrdering getFailureOrdering() const {
173 return FailureOrdering;
174 }
175
176 /// \returns The address spaces accessed by the machine
177 /// instruction used to create this SIMemOpInfo.
178 SIAtomicAddrSpace getInstrAddrSpace() const {
179 return InstrAddrSpace;
180 }
181
182 /// \returns The address spaces that must be ordered by the machine
183 /// instruction used to create this SIMemOpInfo.
184 SIAtomicAddrSpace getOrderingAddrSpace() const {
185 return OrderingAddrSpace;
186 }
187
188 /// \returns True iff memory ordering of operations on
189 /// different address spaces is required.
190 bool getIsCrossAddressSpaceOrdering() const {
191 return IsCrossAddressSpaceOrdering;
192 }
193
194 /// \returns True if memory access of the machine instruction used to
195 /// create this SIMemOpInfo is volatile, false otherwise.
196 bool isVolatile() const {
197 return IsVolatile;
198 }
199
200 /// \returns True if memory access of the machine instruction used to
201 /// create this SIMemOpInfo is nontemporal, false otherwise.
202 bool isNonTemporal() const {
203 return IsNonTemporal;
204 }
205
206 /// \returns True if memory access of the machine instruction used to
207 /// create this SIMemOpInfo is last use, false otherwise.
208 bool isLastUse() const { return IsLastUse; }
209
210 /// \returns True if ordering constraint of the machine instruction used to
211 /// create this SIMemOpInfo is unordered or higher, false otherwise.
212 bool isAtomic() const {
213 return Ordering != AtomicOrdering::NotAtomic;
214 }
215
216};
217
218class SIMemOpAccess final {
219private:
220 AMDGPUMachineModuleInfo *MMI = nullptr;
221
222 /// Reports unsupported message \p Msg for \p MI to LLVM context.
223 void reportUnsupported(const MachineBasicBlock::iterator &MI,
224 const char *Msg) const;
225
226 /// Inspects the target synchronization scope \p SSID and determines
227 /// the SI atomic scope it corresponds to, the address spaces it
228 /// covers, and whether the memory ordering applies between address
229 /// spaces.
230 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
231 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
232
233 /// \returns A bit set of the SI atomic address spaces corresponding to \p AS.
234 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
235
236 /// \returns Info constructed from \p MI, which has at least machine memory
237 /// operand.
238 std::optional<SIMemOpInfo>
239 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
240
241public:
242 /// Construct class to support accessing the machine memory operands
243 /// of instructions in the machine function \p MF.
244 SIMemOpAccess(MachineFunction &MF);
245
246 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
247 std::optional<SIMemOpInfo>
248 getLoadInfo(const MachineBasicBlock::iterator &MI) const;
249
250 /// \returns Store info if \p MI is a store operation, "std::nullopt"
251 /// otherwise.
252 std::optional<SIMemOpInfo>
253 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
254
255 /// \returns Atomic fence info if \p MI is an atomic fence operation,
256 /// "std::nullopt" otherwise.
257 std::optional<SIMemOpInfo>
258 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
259
260 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
261 /// rmw operation, "std::nullopt" otherwise.
262 std::optional<SIMemOpInfo>
263 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
264};
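// The pass body (later in this file) queries these accessors roughly in the
// order load, store, fence, cmpxchg/rmw for each instruction marked
// maybeAtomic and expands the first kind that matches.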
265
266class SICacheControl {
267protected:
268
269 /// AMDGPU subtarget info.
270 const GCNSubtarget &ST;
271
272 /// Instruction info.
273 const SIInstrInfo *TII = nullptr;
274
275 IsaVersion IV;
276
277 /// Whether to insert cache invalidating instructions.
278 bool InsertCacheInv;
279
280 SICacheControl(const GCNSubtarget &ST);
281
282 /// Sets named bit \p Bit to "true" if present in instruction \p MI.
283 /// \returns Returns true if \p MI is modified, false otherwise.
284 bool enableNamedBit(const MachineBasicBlock::iterator MI,
285 AMDGPU::CPol::CPol Bit) const;
286
287public:
288
289 /// Create a cache control for the subtarget \p ST.
290 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
291
292 /// Update \p MI memory load instruction to bypass any caches up to
293 /// the \p Scope memory scope for address spaces \p
294 /// AddrSpace. Return true iff the instruction was modified.
295 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
296 SIAtomicScope Scope,
297 SIAtomicAddrSpace AddrSpace) const = 0;
298
299 /// Update \p MI memory store instruction to bypass any caches up to
300 /// the \p Scope memory scope for address spaces \p
301 /// AddrSpace. Return true iff the instruction was modified.
302 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
303 SIAtomicScope Scope,
304 SIAtomicAddrSpace AddrSpace) const = 0;
305
306 /// Update \p MI memory read-modify-write instruction to bypass any caches up
307 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
308 /// iff the instruction was modified.
309 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
310 SIAtomicScope Scope,
311 SIAtomicAddrSpace AddrSpace) const = 0;
312
313 /// Update \p MI memory instruction of kind \p Op associated with address
314 /// spaces \p AddrSpace to indicate it is volatile and/or
315 /// nontemporal/last-use. Return true iff the instruction was modified.
316 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
317 SIAtomicAddrSpace AddrSpace,
318 SIMemOp Op, bool IsVolatile,
319 bool IsNonTemporal,
320 bool IsLastUse = false) const = 0;
321
322 virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
323 return false;
324 };
325
326 /// Inserts any necessary instructions at position \p Pos relative
327 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
328 /// \p Op associated with address spaces \p AddrSpace have completed. Used
329 /// between memory instructions to enforce the order they become visible as
330 /// observed by other memory instructions executing in memory scope \p Scope.
331 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
332 /// address spaces. Returns true iff any instructions inserted.
333 virtual bool insertWait(MachineBasicBlock::iterator &MI,
334 SIAtomicScope Scope,
335 SIAtomicAddrSpace AddrSpace,
336 SIMemOp Op,
337 bool IsCrossAddrSpaceOrdering,
338 Position Pos) const = 0;
339
340 /// Inserts any necessary instructions at position \p Pos relative to
341 /// instruction \p MI to ensure any subsequent memory instructions of this
342 /// thread with address spaces \p AddrSpace will observe the previous memory
343 /// operations by any thread for memory scopes up to memory scope \p Scope.
344 /// Returns true iff any instructions inserted.
345 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
346 SIAtomicScope Scope,
347 SIAtomicAddrSpace AddrSpace,
348 Position Pos) const = 0;
349
350 /// Inserts any necessary instructions at position \p Pos relative to
351 /// instruction \p MI to ensure previous memory instructions by this thread
352 /// with address spaces \p AddrSpace have completed and can be observed by
353 /// subsequent memory instructions by any thread executing in memory scope \p
354 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
355 /// between address spaces. Returns true iff any instructions inserted.
356 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
357 SIAtomicScope Scope,
358 SIAtomicAddrSpace AddrSpace,
359 bool IsCrossAddrSpaceOrdering,
360 Position Pos) const = 0;
361
362 /// Virtual destructor to allow derivations to be deleted.
363 virtual ~SICacheControl() = default;
364
365 virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
366 MachineBasicBlock::iterator &MI) const {
367 return false;
368 }
369};
370
371class SIGfx6CacheControl : public SICacheControl {
372protected:
373
374 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
375 /// is modified, false otherwise.
376 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
377 return enableNamedBit(MI, AMDGPU::CPol::GLC);
378 }
379
380 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
381 /// is modified, false otherwise.
382 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
383 return enableNamedBit(MI, AMDGPU::CPol::SLC);
384 }
385
386public:
387
388 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
389
390 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
391 SIAtomicScope Scope,
392 SIAtomicAddrSpace AddrSpace) const override;
393
394 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
395 SIAtomicScope Scope,
396 SIAtomicAddrSpace AddrSpace) const override;
397
398 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
399 SIAtomicScope Scope,
400 SIAtomicAddrSpace AddrSpace) const override;
401
402 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
403 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
404 bool IsVolatile, bool IsNonTemporal,
405 bool IsLastUse) const override;
406
407 bool insertWait(MachineBasicBlock::iterator &MI,
408 SIAtomicScope Scope,
409 SIAtomicAddrSpace AddrSpace,
410 SIMemOp Op,
411 bool IsCrossAddrSpaceOrdering,
412 Position Pos) const override;
413
414 bool insertAcquire(MachineBasicBlock::iterator &MI,
415 SIAtomicScope Scope,
416 SIAtomicAddrSpace AddrSpace,
417 Position Pos) const override;
418
419 bool insertRelease(MachineBasicBlock::iterator &MI,
420 SIAtomicScope Scope,
421 SIAtomicAddrSpace AddrSpace,
422 bool IsCrossAddrSpaceOrdering,
423 Position Pos) const override;
424};
425
426class SIGfx7CacheControl : public SIGfx6CacheControl {
427public:
428
429 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
430
431 bool insertAcquire(MachineBasicBlock::iterator &MI,
432 SIAtomicScope Scope,
433 SIAtomicAddrSpace AddrSpace,
434 Position Pos) const override;
435
436};
437
438class SIGfx90ACacheControl : public SIGfx7CacheControl {
439public:
440
441 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
442
443 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
444 SIAtomicScope Scope,
445 SIAtomicAddrSpace AddrSpace) const override;
446
447 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
448 SIAtomicScope Scope,
449 SIAtomicAddrSpace AddrSpace) const override;
450
451 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
452 SIAtomicScope Scope,
453 SIAtomicAddrSpace AddrSpace) const override;
454
455 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
456 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
457 bool IsVolatile, bool IsNonTemporal,
458 bool IsLastUse) const override;
459
460 bool insertWait(MachineBasicBlock::iterator &MI,
461 SIAtomicScope Scope,
462 SIAtomicAddrSpace AddrSpace,
463 SIMemOp Op,
464 bool IsCrossAddrSpaceOrdering,
465 Position Pos) const override;
466
467 bool insertAcquire(MachineBasicBlock::iterator &MI,
468 SIAtomicScope Scope,
469 SIAtomicAddrSpace AddrSpace,
470 Position Pos) const override;
471
472 bool insertRelease(MachineBasicBlock::iterator &MI,
473 SIAtomicScope Scope,
474 SIAtomicAddrSpace AddrSpace,
475 bool IsCrossAddrSpaceOrdering,
476 Position Pos) const override;
477};
478
479class SIGfx940CacheControl : public SIGfx90ACacheControl {
480protected:
481
482 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
483 /// is modified, false otherwise.
484 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
485 return enableNamedBit(MI, AMDGPU::CPol::SC0);
486 }
487
488 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
489 /// is modified, false otherwise.
490 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
491 return enableNamedBit(MI, AMDGPU::CPol::SC1);
492 }
493
494 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
495 /// is modified, false otherwise.
496 bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
497 return enableNamedBit(MI, AMDGPU::CPol::NT);
498 }
499
500public:
501
502 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
503
504 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
505 SIAtomicScope Scope,
506 SIAtomicAddrSpace AddrSpace) const override;
507
508 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
509 SIAtomicScope Scope,
510 SIAtomicAddrSpace AddrSpace) const override;
511
512 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
513 SIAtomicScope Scope,
514 SIAtomicAddrSpace AddrSpace) const override;
515
516 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
517 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
518 bool IsVolatile, bool IsNonTemporal,
519 bool IsLastUse) const override;
520
521 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
522 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
523
524 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
525 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
526 Position Pos) const override;
527
528 bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
529 MachineBasicBlock::iterator &MI) const override {
530 bool Changed = false;
531 if (ST.hasForceStoreSC0SC1() &&
532 (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
533 SIAtomicAddrSpace::GLOBAL |
534 SIAtomicAddrSpace::OTHER)) !=
535 SIAtomicAddrSpace::NONE) {
536 Changed |= enableSC0Bit(MI);
537 Changed |= enableSC1Bit(MI);
538 }
539 return Changed;
540 }
541};
542
543class SIGfx10CacheControl : public SIGfx7CacheControl {
544protected:
545
546 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
547 /// is modified, false otherwise.
548 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
549 return enableNamedBit(MI, AMDGPU::CPol::DLC);
550 }
551
552public:
553
554 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
555
556 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
557 SIAtomicScope Scope,
558 SIAtomicAddrSpace AddrSpace) const override;
559
560 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
561 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
562 bool IsVolatile, bool IsNonTemporal,
563 bool IsLastUse) const override;
564
565 bool insertWait(MachineBasicBlock::iterator &MI,
566 SIAtomicScope Scope,
567 SIAtomicAddrSpace AddrSpace,
568 SIMemOp Op,
569 bool IsCrossAddrSpaceOrdering,
570 Position Pos) const override;
571
572 bool insertAcquire(MachineBasicBlock::iterator &MI,
573 SIAtomicScope Scope,
574 SIAtomicAddrSpace AddrSpace,
575 Position Pos) const override;
576};
577
578class SIGfx11CacheControl : public SIGfx10CacheControl {
579public:
580 SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
581
582 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
583 SIAtomicScope Scope,
584 SIAtomicAddrSpace AddrSpace) const override;
585
586 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
587 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
588 bool IsVolatile, bool IsNonTemporal,
589 bool IsLastUse) const override;
590};
591
592class SIGfx12CacheControl : public SIGfx11CacheControl {
593protected:
594 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
595 // \returns Returns true if \p MI is modified, false otherwise.
596 bool setTH(const MachineBasicBlock::iterator MI,
597 AMDGPU::CPol::CPol Value) const;
598 // Sets Scope policy to \p Value if CPol operand is present in instruction \p
599 // MI. \returns Returns true if \p MI is modified, false otherwise.
600 bool setScope(const MachineBasicBlock::iterator MI,
601 AMDGPU::CPol::CPol Value) const;
602
603 // Stores with system scope (SCOPE_SYS) need to wait for:
604 // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
605 // - non-returning-atomics - wait for STORECNT==0
606 // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
607 // since it does not distinguish atomics-with-return from regular stores.
608 // There is no need to wait if memory is cached (mtype != UC).
609 bool
610 insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
611
612 bool setAtomicScope(const MachineBasicBlock::iterator &MI,
613 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
614
615public:
616 SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
617
618 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
619 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
620 bool IsCrossAddrSpaceOrdering, Position Pos) const override;
621
622 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
623 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
624
625 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
626 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
627 bool IsVolatile, bool IsNonTemporal,
628 bool IsLastUse) const override;
629
630 bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
631
632 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
633 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
634 Position Pos) const override;
635
636 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
637 SIAtomicScope Scope,
638 SIAtomicAddrSpace AddrSpace) const override {
639 return setAtomicScope(MI, Scope, AddrSpace);
640 }
641
642 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
643 SIAtomicScope Scope,
644 SIAtomicAddrSpace AddrSpace) const override {
645 return setAtomicScope(MI, Scope, AddrSpace);
646 }
647
648 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
649 SIAtomicScope Scope,
650 SIAtomicAddrSpace AddrSpace) const override {
651 return setAtomicScope(MI, Scope, AddrSpace);
652 }
653};
654
655class SIMemoryLegalizer final : public MachineFunctionPass {
656private:
657
658 /// Cache Control.
659 std::unique_ptr<SICacheControl> CC = nullptr;
660
661 /// List of atomic pseudo instructions.
662 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
663
664 /// Return true iff instruction \p MI is an atomic instruction that
665 /// returns a result.
666 bool isAtomicRet(const MachineInstr &MI) const {
667 return SIInstrInfo::isAtomicRet(MI);
668 }
669
670 /// Removes all processed atomic pseudo instructions from the current
671 /// function. Returns true if current function is modified, false otherwise.
672 bool removeAtomicPseudoMIs();
673
674 /// Expands load operation \p MI. Returns true if instructions are
675 /// added/deleted or \p MI is modified, false otherwise.
676 bool expandLoad(const SIMemOpInfo &MOI,
677 MachineBasicBlock::iterator &MI);
678 /// Expands store operation \p MI. Returns true if instructions are
679 /// added/deleted or \p MI is modified, false otherwise.
680 bool expandStore(const SIMemOpInfo &MOI,
681 MachineBasicBlock::iterator &MI);
682 /// Expands atomic fence operation \p MI. Returns true if
683 /// instructions are added/deleted or \p MI is modified, false otherwise.
684 bool expandAtomicFence(const SIMemOpInfo &MOI,
685 MachineBasicBlock::iterator &MI);
686 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
687 /// instructions are added/deleted or \p MI is modified, false otherwise.
688 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
689 MachineBasicBlock::iterator &MI);
690
691public:
692 static char ID;
693
694 SIMemoryLegalizer() : MachineFunctionPass(ID) {}
695
696 void getAnalysisUsage(AnalysisUsage &AU) const override {
697 AU.setPreservesCFG();
698 MachineFunctionPass::getAnalysisUsage(AU);
699 }
700
701 StringRef getPassName() const override {
702 return PASS_NAME;
703 }
704
705 bool runOnMachineFunction(MachineFunction &MF) override;
706};
707
708static const StringMap<SIAtomicAddrSpace> ASNames = {{
709 {"global", SIAtomicAddrSpace::GLOBAL},
710 {"local", SIAtomicAddrSpace::LDS},
711}};
712
713void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
714 const MachineFunction *MF = MI.getMF();
715 const Function &Fn = MF->getFunction();
716 SmallString<128> Str;
717 raw_svector_ostream OS(Str);
718 OS << "unknown address space '" << AS << "'; expected one of ";
719 ListSeparator LS;
720 for (const auto &[Name, Val] : ASNames)
721 OS << LS << '\'' << Name << '\'';
722 DiagnosticInfoUnsupported BadTag(Fn, Str.str(), MI.getDebugLoc(), DS_Warning);
723 Fn.getContext().diagnose(BadTag);
724}
725
726/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
727/// If this tag isn't present, or if it has no meaningful values, returns \p
728/// Default. Otherwise returns all the address spaces concerned by the MMRA.
729static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
730 SIAtomicAddrSpace Default) {
731 static constexpr StringLiteral FenceASPrefix = "amdgpu-as";
732
733 auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
734 if (!MMRA)
735 return Default;
736
737 SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
738 for (const auto &[Prefix, Suffix] : MMRA) {
739 if (Prefix != FenceASPrefix)
740 continue;
741
742 if (auto It = ASNames.find(Suffix); It != ASNames.end())
743 Result |= It->second;
744 else
745 diagnoseUnknownMMRAASName(MI, Suffix);
746 }
747
748 return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
749}
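// For example, an IR fence carrying MMRA metadata of the form
//   !{!"amdgpu-as", !"local"}
// is treated as ordering only the LDS address space, allowing the fence
// expansion to use a narrower ordering address space than the fence's
// synchronization scope alone would imply.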
750
751} // end anonymous namespace
752
753void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
754 const char *Msg) const {
755 const Function &Func = MI->getParent()->getParent()->getFunction();
756 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
757 Func.getContext().diagnose(Diag);
758}
759
760std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
761SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
762 SIAtomicAddrSpace InstrAddrSpace) const {
763 if (SSID == SyncScope::System)
764 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
765 if (SSID == MMI->getAgentSSID())
766 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
767 if (SSID == MMI->getWorkgroupSSID())
768 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
769 true);
770 if (SSID == MMI->getWavefrontSSID())
771 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
772 true);
773 if (SSID == SyncScope::SingleThread)
774 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
775 true);
776 if (SSID == MMI->getSystemOneAddressSpaceSSID())
777 return std::tuple(SIAtomicScope::SYSTEM,
778 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
779 if (SSID == MMI->getAgentOneAddressSpaceSSID())
780 return std::tuple(SIAtomicScope::AGENT,
781 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
782 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
783 return std::tuple(SIAtomicScope::WORKGROUP,
784 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
785 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
786 return std::tuple(SIAtomicScope::WAVEFRONT,
787 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
788 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
789 return std::tuple(SIAtomicScope::SINGLETHREAD,
790 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
791 return std::nullopt;
792}
793
794SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
795 if (AS == AMDGPUAS::FLAT_ADDRESS)
796 return SIAtomicAddrSpace::FLAT;
797 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
798 return SIAtomicAddrSpace::GLOBAL;
799 if (AS == AMDGPUAS::LOCAL_ADDRESS)
800 return SIAtomicAddrSpace::LDS;
801 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
802 return SIAtomicAddrSpace::SCRATCH;
803 if (AS == AMDGPUAS::REGION_ADDRESS)
804 return SIAtomicAddrSpace::GDS;
805
806 return SIAtomicAddrSpace::OTHER;
807}
808
809SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
810 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
811}
812
813std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
814 const MachineBasicBlock::iterator &MI) const {
815 assert(MI->getNumMemOperands() > 0);
816
816
817 SyncScope::ID SSID = SyncScope::SingleThread;
818 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
819 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
820 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
821 bool IsNonTemporal = true;
822 bool IsVolatile = false;
823 bool IsLastUse = false;
824
825 // Validator should check whether or not MMOs cover the entire set of
826 // locations accessed by the memory instruction.
827 for (const auto &MMO : MI->memoperands()) {
828 IsNonTemporal &= MMO->isNonTemporal();
829 IsVolatile |= MMO->isVolatile();
830 IsLastUse |= MMO->getFlags() & MOLastUse;
831 InstrAddrSpace |=
832 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
833 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
834 if (OpOrdering != AtomicOrdering::NotAtomic) {
835 const auto &IsSyncScopeInclusion =
836 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
837 if (!IsSyncScopeInclusion) {
838 reportUnsupported(MI,
839 "Unsupported non-inclusive atomic synchronization scope");
840 return std::nullopt;
841 }
842
843 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
844 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
845 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
846 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
847 FailureOrdering =
848 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
849 }
850 }
851
852 SIAtomicScope Scope = SIAtomicScope::NONE;
853 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
854 bool IsCrossAddressSpaceOrdering = false;
855 if (Ordering != AtomicOrdering::NotAtomic) {
856 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
857 if (!ScopeOrNone) {
858 reportUnsupported(MI, "Unsupported atomic synchronization scope");
859 return std::nullopt;
860 }
861 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
862 *ScopeOrNone;
863 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
864 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
865 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
866 reportUnsupported(MI, "Unsupported atomic address space");
867 return std::nullopt;
868 }
869 }
870 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
871 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
872 IsNonTemporal, IsLastUse);
873}
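// For example, a FLAT atomicrmw with syncscope("agent") yields Scope = AGENT,
// InstrAddrSpace = FLAT (GLOBAL | LDS | SCRATCH), OrderingAddrSpace = ATOMIC,
// and cross-address-space ordering, since more than one address space is
// being ordered.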
874
875std::optional<SIMemOpInfo>
876SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
877 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
878
879 if (!(MI->mayLoad() && !MI->mayStore()))
880 return std::nullopt;
881
882 // Be conservative if there are no memory operands.
883 if (MI->getNumMemOperands() == 0)
884 return SIMemOpInfo();
885
886 return constructFromMIWithMMO(MI);
887}
888
889std::optional<SIMemOpInfo>
890SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
891 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
892
893 if (!(!MI->mayLoad() && MI->mayStore()))
894 return std::nullopt;
895
896 // Be conservative if there are no memory operands.
897 if (MI->getNumMemOperands() == 0)
898 return SIMemOpInfo();
899
900 return constructFromMIWithMMO(MI);
901}
902
903std::optional<SIMemOpInfo>
904SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
905 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
906
907 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
908 return std::nullopt;
909
910 AtomicOrdering Ordering =
911 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
912
913 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
914 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
915 if (!ScopeOrNone) {
916 reportUnsupported(MI, "Unsupported atomic synchronization scope");
917 return std::nullopt;
918 }
919
920 SIAtomicScope Scope = SIAtomicScope::NONE;
921 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
922 bool IsCrossAddressSpaceOrdering = false;
923 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
924 *ScopeOrNone;
925
926 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
927 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
928 reportUnsupported(MI, "Unsupported atomic address space");
929 return std::nullopt;
930 }
931
932 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
933 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
934}
935
936std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
937 const MachineBasicBlock::iterator &MI) const {
938 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
939
940 if (!(MI->mayLoad() && MI->mayStore()))
941 return std::nullopt;
942
943 // Be conservative if there are no memory operands.
944 if (MI->getNumMemOperands() == 0)
945 return SIMemOpInfo();
946
947 return constructFromMIWithMMO(MI);
948}
949
950SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
951 TII = ST.getInstrInfo();
952 IV = getIsaVersion(ST.getCPU());
953 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
954}
955
956bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
957 AMDGPU::CPol::CPol Bit) const {
958 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
959 if (!CPol)
960 return false;
961
962 CPol->setImm(CPol->getImm() | Bit);
963 return true;
964}
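// For example, enableGLCBit() ORs AMDGPU::CPol::GLC into the cache-policy
// (cpol) operand of a memory instruction, and is a no-op on instructions
// that have no cpol operand.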
965
966/* static */
967std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
968 GCNSubtarget::Generation Generation = ST.getGeneration();
969 if (ST.hasGFX940Insts())
970 return std::make_unique<SIGfx940CacheControl>(ST);
971 if (ST.hasGFX90AInsts())
972 return std::make_unique<SIGfx90ACacheControl>(ST);
973 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
974 return std::make_unique<SIGfx6CacheControl>(ST);
975 if (Generation < AMDGPUSubtarget::GFX10)
976 return std::make_unique<SIGfx7CacheControl>(ST);
977 if (Generation < AMDGPUSubtarget::GFX11)
978 return std::make_unique<SIGfx10CacheControl>(ST);
979 if (Generation < AMDGPUSubtarget::GFX12)
980 return std::make_unique<SIGfx11CacheControl>(ST);
981 return std::make_unique<SIGfx12CacheControl>(ST);
982}
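// For example, gfx90a (GFX90A insts but not GFX940 insts) gets a
// SIGfx90ACacheControl, a GFX10 target such as gfx1030 gets a
// SIGfx10CacheControl, and GFX12 targets fall through to SIGfx12CacheControl.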
983
984bool SIGfx6CacheControl::enableLoadCacheBypass(
985 const MachineBasicBlock::iterator &MI,
986 SIAtomicScope Scope,
987 SIAtomicAddrSpace AddrSpace) const {
988 assert(MI->mayLoad() && !MI->mayStore());
989 bool Changed = false;
990
991 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
992 switch (Scope) {
993 case SIAtomicScope::SYSTEM:
994 case SIAtomicScope::AGENT:
995 // Set L1 cache policy to MISS_EVICT.
996 // Note: there is no L2 cache bypass policy at the ISA level.
997 Changed |= enableGLCBit(MI);
998 break;
999 case SIAtomicScope::WORKGROUP:
1000 case SIAtomicScope::WAVEFRONT:
1001 case SIAtomicScope::SINGLETHREAD:
1002 // No cache to bypass.
1003 break;
1004 default:
1005 llvm_unreachable("Unsupported synchronization scope");
1006 }
1007 }
1008
1009 /// The scratch address space does not need the global memory caches
1010 /// to be bypassed as all memory operations by the same thread are
1011 /// sequentially consistent, and no other thread can access scratch
1012 /// memory.
1013
1014 /// Other address spaces do not have a cache.
1015
1016 return Changed;
1017}
1018
1019bool SIGfx6CacheControl::enableStoreCacheBypass(
1020 const MachineBasicBlock::iterator &MI,
1021 SIAtomicScope Scope,
1022 SIAtomicAddrSpace AddrSpace) const {
1023 assert(!MI->mayLoad() && MI->mayStore());
1024 bool Changed = false;
1025
1026 /// The L1 cache is write-through, so it does not need to be bypassed. There is
1027 /// no bypass control for the L2 cache at the ISA level.
1028
1029 return Changed;
1030}
1031
1032bool SIGfx6CacheControl::enableRMWCacheBypass(
1033 const MachineBasicBlock::iterator &MI,
1034 SIAtomicScope Scope,
1035 SIAtomicAddrSpace AddrSpace) const {
1036 assert(MI->mayLoad() && MI->mayStore());
1037 bool Changed = false;
1038
1039 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
1040 /// bypassed, and the GLC bit is instead used to indicate if they are
1041 /// return or no-return.
1042 /// Note: there is no L2 cache coherent bypass control at the ISA level.
1043
1044 return Changed;
1045}
1046
1047bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1048 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1049 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1050 // Only handle load and store, not atomic read-modify-write instructions. The
1051 // latter use glc to indicate if the atomic returns a result and so must not
1052 // be used for cache control.
1053 assert(MI->mayLoad() ^ MI->mayStore());
1054
1055 // Only update load and store, not LLVM IR atomic read-modify-write
1056 // instructions. The latter are always marked as volatile, so honoring that here
1057 // would pessimize all atomics. They also do not support the nontemporal
1058 // attribute.
1059 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1060
1061 bool Changed = false;
1062
1063 if (IsVolatile) {
1064 // Set L1 cache policy to be MISS_EVICT for load instructions
1065 // and MISS_LRU for store instructions.
1066 // Note: there is no L2 cache bypass policy at the ISA level.
1067 if (Op == SIMemOp::LOAD)
1068 Changed |= enableGLCBit(MI);
1069
1070 // Ensure operation has completed at system scope to cause all volatile
1071 // operations to be visible outside the program in a global order. Do not
1072 // request cross address space as only the global address space can be
1073 // observable outside the program, so no need to cause a waitcnt for LDS
1074 // address space operations.
1075 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1076 Position::AFTER);
1077
1078 return Changed;
1079 }
1080
1081 if (IsNonTemporal) {
1082 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1083 // for both loads and stores, and the L2 cache policy to STREAM.
1084 Changed |= enableGLCBit(MI);
1085 Changed |= enableSLCBit(MI);
1086 return Changed;
1087 }
1088
1089 return Changed;
1090}
1091
1092bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1093 SIAtomicScope Scope,
1094 SIAtomicAddrSpace AddrSpace,
1095 SIMemOp Op,
1096 bool IsCrossAddrSpaceOrdering,
1097 Position Pos) const {
1098 bool Changed = false;
1099
1100 MachineBasicBlock &MBB = *MI->getParent();
1101 DebugLoc DL = MI->getDebugLoc();
1102
1103 if (Pos == Position::AFTER)
1104 ++MI;
1105
1106 bool VMCnt = false;
1107 bool LGKMCnt = false;
1108
1109 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1110 SIAtomicAddrSpace::NONE) {
1111 switch (Scope) {
1112 case SIAtomicScope::SYSTEM:
1113 case SIAtomicScope::AGENT:
1114 VMCnt |= true;
1115 break;
1116 case SIAtomicScope::WORKGROUP:
1117 case SIAtomicScope::WAVEFRONT:
1118 case SIAtomicScope::SINGLETHREAD:
1119 // The L1 cache keeps all memory operations in order for
1120 // wavefronts in the same work-group.
1121 break;
1122 default:
1123 llvm_unreachable("Unsupported synchronization scope");
1124 }
1125 }
1126
1127 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1128 switch (Scope) {
1129 case SIAtomicScope::SYSTEM:
1130 case SIAtomicScope::AGENT:
1131 case SIAtomicScope::WORKGROUP:
1132 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1133 // not needed as LDS operations for all waves are executed in a total
1134 // global ordering as observed by all waves. Required if also
1135 // synchronizing with global/GDS memory as LDS operations could be
1136 // reordered with respect to later global/GDS memory operations of the
1137 // same wave.
1138 LGKMCnt |= IsCrossAddrSpaceOrdering;
1139 break;
1140 case SIAtomicScope::WAVEFRONT:
1141 case SIAtomicScope::SINGLETHREAD:
1142 // The LDS keeps all memory operations in order for
1143 // the same wavefront.
1144 break;
1145 default:
1146 llvm_unreachable("Unsupported synchronization scope");
1147 }
1148 }
1149
1150 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1151 switch (Scope) {
1152 case SIAtomicScope::SYSTEM:
1153 case SIAtomicScope::AGENT:
1154 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1155 // is not needed as GDS operations for all waves are executed in a total
1156 // global ordering as observed by all waves. Required if also
1157 // synchronizing with global/LDS memory as GDS operations could be
1158 // reordered with respect to later global/LDS memory operations of the
1159 // same wave.
1160 LGKMCnt |= IsCrossAddrSpaceOrdering;
1161 break;
1162 case SIAtomicScope::WORKGROUP:
1163 case SIAtomicScope::WAVEFRONT:
1164 case SIAtomicScope::SINGLETHREAD:
1165 // The GDS keeps all memory operations in order for
1166 // the same work-group.
1167 break;
1168 default:
1169 llvm_unreachable("Unsupported synchronization scope");
1170 }
1171 }
1172
1173 if (VMCnt || LGKMCnt) {
1174 unsigned WaitCntImmediate =
1175 AMDGPU::encodeWaitcnt(IV,
1176 VMCnt ? 0 : getVmcntBitMask(IV),
1177 getExpcntBitMask(IV),
1178 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1179 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1180 .addImm(WaitCntImmediate);
1181 Changed = true;
1182 }
1183
1184 if (Pos == Position::AFTER)
1185 --MI;
1186
1187 return Changed;
1188}
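// For example, an agent-scope ordering over GLOBAL | LDS with cross-address-
// space ordering emits "S_WAITCNT vmcnt(0) lgkmcnt(0)" (as a soft waitcnt
// that SIInsertWaitcnts may later merge or drop if redundant), while a
// workgroup-scope, LDS-only ordering needs no wait here.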
1189
1190bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1191 SIAtomicScope Scope,
1192 SIAtomicAddrSpace AddrSpace,
1193 Position Pos) const {
1194 if (!InsertCacheInv)
1195 return false;
1196
1197 bool Changed = false;
1198
1199 MachineBasicBlock &MBB = *MI->getParent();
1200 DebugLoc DL = MI->getDebugLoc();
1201
1202 if (Pos == Position::AFTER)
1203 ++MI;
1204
1205 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1206 switch (Scope) {
1207 case SIAtomicScope::SYSTEM:
1208 case SIAtomicScope::AGENT:
1209 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1210 Changed = true;
1211 break;
1212 case SIAtomicScope::WORKGROUP:
1213 case SIAtomicScope::WAVEFRONT:
1214 case SIAtomicScope::SINGLETHREAD:
1215 // No cache to invalidate.
1216 break;
1217 default:
1218 llvm_unreachable("Unsupported synchronization scope");
1219 }
1220 }
1221
1222 /// The scratch address space does not need the global memory cache
1223 /// to be flushed as all memory operations by the same thread are
1224 /// sequentially consistent, and no other thread can access scratch
1225 /// memory.
1226
1227 /// Other address spaces do not have a cache.
1228
1229 if (Pos == Position::AFTER)
1230 --MI;
1231
1232 return Changed;
1233}
1234
1235bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1236 SIAtomicScope Scope,
1237 SIAtomicAddrSpace AddrSpace,
1238 bool IsCrossAddrSpaceOrdering,
1239 Position Pos) const {
1240 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1241 IsCrossAddrSpaceOrdering, Pos);
1242}
1243
1244bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1245 SIAtomicScope Scope,
1246 SIAtomicAddrSpace AddrSpace,
1247 Position Pos) const {
1248 if (!InsertCacheInv)
1249 return false;
1250
1251 bool Changed = false;
1252
1253 MachineBasicBlock &MBB = *MI->getParent();
1254 DebugLoc DL = MI->getDebugLoc();
1255
1256 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1257
1258 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1259 ? AMDGPU::BUFFER_WBINVL1
1260 : AMDGPU::BUFFER_WBINVL1_VOL;
1261
1262 if (Pos == Position::AFTER)
1263 ++MI;
1264
1265 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1266 switch (Scope) {
1267 case SIAtomicScope::SYSTEM:
1268 case SIAtomicScope::AGENT:
1269 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1270 Changed = true;
1271 break;
1272 case SIAtomicScope::WORKGROUP:
1273 case SIAtomicScope::WAVEFRONT:
1274 case SIAtomicScope::SINGLETHREAD:
1275 // No cache to invalidate.
1276 break;
1277 default:
1278 llvm_unreachable("Unsupported synchronization scope");
1279 }
1280 }
1281
1282 /// The scratch address space does not need the global memory cache
1283 /// to be flushed as all memory operations by the same thread are
1284 /// sequentially consistent, and no other thread can access scratch
1285 /// memory.
1286
1287 /// Other address spaces do not have a cache.
1288
1289 if (Pos == Position::AFTER)
1290 --MI;
1291
1292 return Changed;
1293}
1294
1295bool SIGfx90ACacheControl::enableLoadCacheBypass(
1296 const MachineBasicBlock::iterator &MI,
1297 SIAtomicScope Scope,
1298 SIAtomicAddrSpace AddrSpace) const {
1299 assert(MI->mayLoad() && !MI->mayStore());
1300 bool Changed = false;
1301
1302 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1303 switch (Scope) {
1304 case SIAtomicScope::SYSTEM:
1305 case SIAtomicScope::AGENT:
1306 // Set the L1 cache policy to MISS_LRU.
1307 // Note: there is no L2 cache bypass policy at the ISA level.
1308 Changed |= enableGLCBit(MI);
1309 break;
1310 case SIAtomicScope::WORKGROUP:
1311 // In threadgroup split mode the waves of a work-group can be executing on
1312 // different CUs. Therefore need to bypass the L1 which is per CU.
1313 // Otherwise in non-threadgroup split mode all waves of a work-group are
1314 // on the same CU, and so the L1 does not need to be bypassed.
1315 if (ST.isTgSplitEnabled())
1316 Changed |= enableGLCBit(MI);
1317 break;
1318 case SIAtomicScope::WAVEFRONT:
1319 case SIAtomicScope::SINGLETHREAD:
1320 // No cache to bypass.
1321 break;
1322 default:
1323 llvm_unreachable("Unsupported synchronization scope");
1324 }
1325 }
1326
1327 /// The scratch address space does not need the global memory caches
1328 /// to be bypassed as all memory operations by the same thread are
1329 /// sequentially consistent, and no other thread can access scratch
1330 /// memory.
1331
1332 /// Other address spaces do not have a cache.
1333
1334 return Changed;
1335}
1336
1337bool SIGfx90ACacheControl::enableStoreCacheBypass(
1338 const MachineBasicBlock::iterator &MI,
1339 SIAtomicScope Scope,
1340 SIAtomicAddrSpace AddrSpace) const {
1341 assert(!MI->mayLoad() && MI->mayStore());
1342 bool Changed = false;
1343
1344 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1345 switch (Scope) {
1346 case SIAtomicScope::SYSTEM:
1347 case SIAtomicScope::AGENT:
1348 /// Do not set glc for store atomic operations as they implicitly write
1349 /// through the L1 cache.
1350 break;
1351 case SIAtomicScope::WORKGROUP:
1352 case SIAtomicScope::WAVEFRONT:
1353 case SIAtomicScope::SINGLETHREAD:
1354 // No cache to bypass. Store atomics implicitly write through the L1
1355 // cache.
1356 break;
1357 default:
1358 llvm_unreachable("Unsupported synchronization scope");
1359 }
1360 }
1361
1362 /// The scratch address space does not need the global memory caches
1363 /// to be bypassed as all memory operations by the same thread are
1364 /// sequentially consistent, and no other thread can access scratch
1365 /// memory.
1366
1367 /// Other address spaces do not have a cache.
1368
1369 return Changed;
1370}
1371
1372bool SIGfx90ACacheControl::enableRMWCacheBypass(
1373 const MachineBasicBlock::iterator &MI,
1374 SIAtomicScope Scope,
1375 SIAtomicAddrSpace AddrSpace) const {
1376 assert(MI->mayLoad() && MI->mayStore());
1377 bool Changed = false;
1378
1379 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1380 switch (Scope) {
1381 case SIAtomicScope::SYSTEM:
1382 case SIAtomicScope::AGENT:
1383 /// Do not set glc for RMW atomic operations as they implicitly bypass
1384 /// the L1 cache, and the glc bit is instead used to indicate if they are
1385 /// return or no-return.
1386 break;
1387 case SIAtomicScope::WORKGROUP:
1388 case SIAtomicScope::WAVEFRONT:
1389 case SIAtomicScope::SINGLETHREAD:
1390 // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1391 break;
1392 default:
1393 llvm_unreachable("Unsupported synchronization scope");
1394 }
1395 }
1396
1397 return Changed;
1398}
1399
1400bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1401 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1402 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1403 // Only handle load and store, not atomic read-modify-write instructions. The
1404 // latter use glc to indicate if the atomic returns a result and so must not
1405 // be used for cache control.
1406 assert(MI->mayLoad() ^ MI->mayStore());
1407
1408 // Only update load and store, not LLVM IR atomic read-modify-write
1409 // instructions. The latter are always marked as volatile, so honoring that here
1410 // would pessimize all atomics. They also do not support the nontemporal
1411 // attribute.
1412 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1413
1414 bool Changed = false;
1415
1416 if (IsVolatile) {
1417 // Set L1 cache policy to be MISS_EVICT for load instructions
1418 // and MISS_LRU for store instructions.
1419 // Note: there is no L2 cache bypass policy at the ISA level.
1420 if (Op == SIMemOp::LOAD)
1421 Changed |= enableGLCBit(MI);
1422
1423 // Ensure operation has completed at system scope to cause all volatile
1424 // operations to be visible outside the program in a global order. Do not
1425 // request cross address space as only the global address space can be
1426 // observable outside the program, so no need to cause a waitcnt for LDS
1427 // address space operations.
1428 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1429 Position::AFTER);
1430
1431 return Changed;
1432 }
1433
1434 if (IsNonTemporal) {
1435 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1436 // for both loads and stores, and the L2 cache policy to STREAM.
1437 Changed |= enableGLCBit(MI);
1438 Changed |= enableSLCBit(MI);
1439 return Changed;
1440 }
1441
1442 return Changed;
1443}
1444
1445bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1446 SIAtomicScope Scope,
1447 SIAtomicAddrSpace AddrSpace,
1448 SIMemOp Op,
1449 bool IsCrossAddrSpaceOrdering,
1450 Position Pos) const {
1451 if (ST.isTgSplitEnabled()) {
1452 // In threadgroup split mode the waves of a work-group can be executing on
1453 // different CUs. Therefore need to wait for global or GDS memory operations
1454 // to complete to ensure they are visible to waves in the other CUs.
1455 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1456 // the same CU, so no need to wait for global memory as all waves in the
1457 // work-group access the same L1, nor wait for GDS as accesses are ordered
1458 // on a CU.
1459 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1460 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1461 (Scope == SIAtomicScope::WORKGROUP)) {
1462 // Same as GFX7 using agent scope.
1463 Scope = SIAtomicScope::AGENT;
1464 }
1465 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1466 // LDS memory operations.
1467 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1468 }
1469 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1470 IsCrossAddrSpaceOrdering, Pos);
1471}
1472
1473bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1474 SIAtomicScope Scope,
1475 SIAtomicAddrSpace AddrSpace,
1476 Position Pos) const {
1477 if (!InsertCacheInv)
1478 return false;
1479
1480 bool Changed = false;
1481
1482 MachineBasicBlock &MBB = *MI->getParent();
1483 DebugLoc DL = MI->getDebugLoc();
1484
1485 if (Pos == Position::AFTER)
1486 ++MI;
1487
1488 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1489 switch (Scope) {
1490 case SIAtomicScope::SYSTEM:
1491 // Ensures that following loads will not see stale remote VMEM data or
1492 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1493 // CC will never be stale due to the local memory probes.
1494 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1495 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1496 // hardware does not reorder memory operations by the same wave with
1497 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1498 // remove any cache lines of earlier writes by the same wave and ensures
1499 // later reads by the same wave will refetch the cache lines.
1500 Changed = true;
1501 break;
1502 case SIAtomicScope::AGENT:
1503 // Same as GFX7.
1504 break;
1505 case SIAtomicScope::WORKGROUP:
1506 // In threadgroup split mode the waves of a work-group can be executing on
1507 // different CUs. Therefore need to invalidate the L1 which is per CU.
1508 // Otherwise in non-threadgroup split mode all waves of a work-group are
1509 // on the same CU, and so the L1 does not need to be invalidated.
1510 if (ST.isTgSplitEnabled()) {
1511 // Same as GFX7 using agent scope.
1512 Scope = SIAtomicScope::AGENT;
1513 }
1514 break;
1515 case SIAtomicScope::WAVEFRONT:
1516 case SIAtomicScope::SINGLETHREAD:
1517 // Same as GFX7.
1518 break;
1519 default:
1520 llvm_unreachable("Unsupported synchronization scope");
1521 }
1522 }
1523
1524 /// The scratch address space does not need the global memory cache
1525 /// to be flushed as all memory operations by the same thread are
1526 /// sequentially consistent, and no other thread can access scratch
1527 /// memory.
1528
1529 /// Other address spaces do not have a cache.
1530
1531 if (Pos == Position::AFTER)
1532 --MI;
1533
1534 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1535
1536 return Changed;
1537}
1538
1539bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1540 SIAtomicScope Scope,
1541 SIAtomicAddrSpace AddrSpace,
1542 bool IsCrossAddrSpaceOrdering,
1543 Position Pos) const {
1544 bool Changed = false;
1545
1546 MachineBasicBlock &MBB = *MI->getParent();
1547 const DebugLoc &DL = MI->getDebugLoc();
1548
1549 if (Pos == Position::AFTER)
1550 ++MI;
1551
1552 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1553 switch (Scope) {
1554 case SIAtomicScope::SYSTEM:
1555 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1556 // hardware does not reorder memory operations by the same wave with
1557 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1558 // to initiate writeback of any dirty cache lines of earlier writes by the
1559 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1560 // writeback has completed.
1561 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1562 // Set SC bits to indicate system scope.
1563 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1564 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1565 // vmcnt(0)" needed by the "BUFFER_WBL2".
1566 Changed = true;
1567 break;
1568 case SIAtomicScope::AGENT:
1569 case SIAtomicScope::WORKGROUP:
1570 case SIAtomicScope::WAVEFRONT:
1571 case SIAtomicScope::SINGLETHREAD:
1572 // Same as GFX7.
1573 break;
1574 default:
1575 llvm_unreachable("Unsupported synchronization scope");
1576 }
1577 }
1578
1579 if (Pos == Position::AFTER)
1580 --MI;
1581
1582 Changed |=
1583 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1584 IsCrossAddrSpaceOrdering, Pos);
1585
1586 return Changed;
1587}
1588
1589bool SIGfx940CacheControl::enableLoadCacheBypass(
1590 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1591 SIAtomicAddrSpace AddrSpace) const {
1592 assert(MI->mayLoad() && !MI->mayStore());
1593 bool Changed = false;
1594
1595 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1596 switch (Scope) {
1597 case SIAtomicScope::SYSTEM:
1598 // Set SC bits to indicate system scope.
1599 Changed |= enableSC0Bit(MI);
1600 Changed |= enableSC1Bit(MI);
1601 break;
1602 case SIAtomicScope::AGENT:
1603 // Set SC bits to indicate agent scope.
1604 Changed |= enableSC1Bit(MI);
1605 break;
1606 case SIAtomicScope::WORKGROUP:
1607 // In threadgroup split mode the waves of a work-group can be executing on
1608 // different CUs. Therefore need to bypass the L1 which is per CU.
1609 // Otherwise in non-threadgroup split mode all waves of a work-group are
1610 // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1611 // bits to indicate work-group scope will do this automatically.
1612 Changed |= enableSC0Bit(MI);
1613 break;
1614 case SIAtomicScope::WAVEFRONT:
1615 case SIAtomicScope::SINGLETHREAD:
1616 // Leave SC bits unset to indicate wavefront scope.
1617 break;
1618 default:
1619 llvm_unreachable("Unsupported synchronization scope");
1620 }
1621 }
1622
1623 /// The scratch address space does not need the global memory caches
1624 /// to be bypassed as all memory operations by the same thread are
1625 /// sequentially consistent, and no other thread can access scratch
1626 /// memory.
1627
1628 /// Other address spaces do not have a cache.
1629
1630 return Changed;
1631}
1632
1633bool SIGfx940CacheControl::enableStoreCacheBypass(
1634 const MachineBasicBlock::iterator &MI,
1635 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1636 assert(!MI->mayLoad() && MI->mayStore());
1637 bool Changed = false;
1638
1639 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1640 switch (Scope) {
1641 case SIAtomicScope::SYSTEM:
1642 // Set SC bits to indicate system scope.
1643 Changed |= enableSC0Bit(MI);
1644 Changed |= enableSC1Bit(MI);
1645 break;
1646 case SIAtomicScope::AGENT:
1647 // Set SC bits to indicate agent scope.
1648 Changed |= enableSC1Bit(MI);
1649 break;
1650 case SIAtomicScope::WORKGROUP:
1651 // Set SC bits to indicate workgroup scope.
1652 Changed |= enableSC0Bit(MI);
1653 break;
1654 case SIAtomicScope::WAVEFRONT:
1655 case SIAtomicScope::SINGLETHREAD:
1656 // Leave SC bits unset to indicate wavefront scope.
1657 break;
1658 default:
1659 llvm_unreachable("Unsupported synchronization scope");
1660 }
1661 }
1662
1663 /// The scratch address space does not need the global memory caches
1664 /// to be bypassed as all memory operations by the same thread are
1665 /// sequentially consistent, and no other thread can access scratch
1666 /// memory.
1667
1668 /// Other address spaces do not have a cache.
1669
1670 return Changed;
1671}
1672
1673bool SIGfx940CacheControl::enableRMWCacheBypass(
1674 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1675 SIAtomicAddrSpace AddrSpace) const {
1676 assert(MI->mayLoad() && MI->mayStore());
1677 bool Changed = false;
1678
1679 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1680 switch (Scope) {
1681 case SIAtomicScope::SYSTEM:
1682 // Set SC1 bit to indicate system scope.
1683 Changed |= enableSC1Bit(MI);
1684 break;
1685 case SIAtomicScope::AGENT:
1686 case SIAtomicScope::WORKGROUP:
1687 case SIAtomicScope::WAVEFRONT:
1688 case SIAtomicScope::SINGLETHREAD:
1689 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1690 // to indicate system or agent scope. The SC0 bit is used to indicate if
1691 // they are return or no-return. Leave SC1 bit unset to indicate agent
1692 // scope.
1693 break;
1694 default:
1695 llvm_unreachable("Unsupported synchronization scope");
1696 }
1697 }
1698
1699 return Changed;
1700}
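
// A standalone sketch of the SC0/SC1 scope encoding that the three
// enable*CacheBypass hooks above apply on GFX940. The enum and bit constants
// below are local stand-ins for SIAtomicScope and AMDGPU::CPol with
// illustrative values; only the per-scope pattern (SC0|SC1 for system, SC1
// for agent, SC0 for work-group, neither for wavefront/single-thread, and
// SC1-only for RMW atomics) is taken from the code above.
#include <cstdint>

enum class Scope { SingleThread, Wavefront, Workgroup, Agent, System };

constexpr uint64_t SC0 = 1u << 0; // illustrative bit positions only
constexpr uint64_t SC1 = 1u << 1;

// Cache-policy bits for an ordinary global load or store.
constexpr uint64_t loadStoreSCBits(Scope S) {
  switch (S) {
  case Scope::System:    return SC0 | SC1;
  case Scope::Agent:     return SC1;
  case Scope::Workgroup: return SC0;
  default:               return 0; // wavefront / single-thread
  }
}

// RMW atomics only use SC1 for scope; SC0 encodes return vs. no-return.
constexpr uint64_t rmwSCBits(Scope S) {
  return S == Scope::System ? SC1 : 0;
}

static_assert(loadStoreSCBits(Scope::Agent) == SC1, "agent scope uses SC1");
static_assert(rmwSCBits(Scope::Workgroup) == 0, "RMW leaves SC1 clear below system");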
1701
1702bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1703 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1704 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1705 // Only handle load and store, not atomic read-modify-write instructions. The
1706 // latter use glc to indicate if the atomic returns a result and so must not
1707 // be used for cache control.
1708 assert(MI->mayLoad() ^ MI->mayStore());
1709
1710 // Only update load and store, not LLVM IR atomic read-modify-write
1711 // instructions. The latter are always marked as volatile, so they cannot
1712 // sensibly be handled here without pessimizing all atomics. They also do
1713 // not support the nontemporal attribute.
1714 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1715
1716 bool Changed = false;
1717
1718 if (IsVolatile) {
1719 // Set SC bits to indicate system scope.
1720 Changed |= enableSC0Bit(MI);
1721 Changed |= enableSC1Bit(MI);
1722
1723 // Ensure operation has completed at system scope to cause all volatile
1724 // operations to be visible outside the program in a global order. Do not
1725 // request cross address space as only the global address space can be
1726 // observable outside the program, so no need to cause a waitcnt for LDS
1727 // address space operations.
1728 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1729 Position::AFTER);
1730
1731 return Changed;
1732 }
1733
1734 if (IsNonTemporal) {
1735 Changed |= enableNTBit(MI);
1736 return Changed;
1737 }
1738
1739 return Changed;
1740}
1741
1742bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1743 SIAtomicScope Scope,
1744 SIAtomicAddrSpace AddrSpace,
1745 Position Pos) const {
1746 if (!InsertCacheInv)
1747 return false;
1748
1749 bool Changed = false;
1750
1751 MachineBasicBlock &MBB = *MI->getParent();
1752 DebugLoc DL = MI->getDebugLoc();
1753
1754 if (Pos == Position::AFTER)
1755 ++MI;
1756
1757 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1758 switch (Scope) {
1759 case SIAtomicScope::SYSTEM:
1760 // Ensures that following loads will not see stale remote VMEM data or
1761 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1762 // CC will never be stale due to the local memory probes.
1763 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1764 // Set SC bits to indicate system scope.
1765 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1766 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1767 // hardware does not reorder memory operations by the same wave with
1768 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1769 // remove any cache lines of earlier writes by the same wave and ensures
1770 // later reads by the same wave will refetch the cache lines.
1771 Changed = true;
1772 break;
1773 case SIAtomicScope::AGENT:
1774 // Ensures that following loads will not see stale remote data or local
1775 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1776 // due to the memory probes.
1777 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1778 // Set SC bits to indicate agent scope.
1779 .addImm(AMDGPU::CPol::SC1);
1780 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1781 // does not reorder memory operations with respect to preceding buffer
1782 // invalidate. The invalidate is guaranteed to remove any cache lines of
1783 // earlier writes and ensures later reads will refetch the cache lines.
1784 Changed = true;
1785 break;
1786 case SIAtomicScope::WORKGROUP:
1787 // In threadgroup split mode the waves of a work-group can be executing on
1788 // different CUs. Therefore need to invalidate the L1 which is per CU.
1789 // Otherwise in non-threadgroup split mode all waves of a work-group are
1790 // on the same CU, and so the L1 does not need to be invalidated.
1791 if (ST.isTgSplitEnabled()) {
1792 // Ensures L1 is invalidated if in threadgroup split mode. In
1793 // non-threadgroup split mode it is a NOP, but there is no point generating
1794 // it in that case when we know we are not in that mode.
1795 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1796 // Set SC bits to indicate work-group scope.
1797 .addImm(AMDGPU::CPol::SC0);
1798 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1799 // does not reorder memory operations with respect to preceding buffer
1800 // invalidate. The invalidate is guaranteed to remove any cache lines of
1801 // earlier writes and ensures later reads will refetch the cache lines.
1802 Changed = true;
1803 }
1804 break;
1805 case SIAtomicScope::WAVEFRONT:
1806 case SIAtomicScope::SINGLETHREAD:
1807 // Could generate "BUFFER_INV" but it would do nothing as there are no
1808 // caches to invalidate.
1809 break;
1810 default:
1811 llvm_unreachable("Unsupported synchronization scope");
1812 }
1813 }
1814
1815 /// The scratch address space does not need the global memory cache
1816 /// to be flushed as all memory operations by the same thread are
1817 /// sequentially consistent, and no other thread can access scratch
1818 /// memory.
1819
1820 /// Other address spaces do not have a cache.
1821
1822 if (Pos == Position::AFTER)
1823 --MI;
1824
1825 return Changed;
1826}
1827
1828bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1829 SIAtomicScope Scope,
1830 SIAtomicAddrSpace AddrSpace,
1831 bool IsCrossAddrSpaceOrdering,
1832 Position Pos) const {
1833 bool Changed = false;
1834
1835 MachineBasicBlock &MBB = *MI->getParent();
1836 DebugLoc DL = MI->getDebugLoc();
1837
1838 if (Pos == Position::AFTER)
1839 ++MI;
1840
1841 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1842 switch (Scope) {
1843 case SIAtomicScope::SYSTEM:
1844 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1845 // hardware does not reorder memory operations by the same wave with
1846 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1847 // to initiate writeback of any dirty cache lines of earlier writes by the
1848 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1849 // writeback has completed.
1850 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1851 // Set SC bits to indicate system scope.
1852 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1853 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1854 // SIAtomicScope::SYSTEM, the following insertWait will generate the
1855 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1856 Changed = true;
1857 break;
1858 case SIAtomicScope::AGENT:
1859 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1860 // Set SC bits to indicate agent scope.
1861 .addImm(AMDGPU::CPol::SC1);
1862
1863 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1864 // SIAtomicScope::AGENT, the following insertWait will generate the
1865 // required "S_WAITCNT vmcnt(0)".
1866 Changed = true;
1867 break;
1868 case SIAtomicScope::WORKGROUP:
1869 case SIAtomicScope::WAVEFRONT:
1870 case SIAtomicScope::SINGLETHREAD:
1871 // Do not generate "BUFFER_WBL2" as there are no caches it would
1872 // writeback, and would require an otherwise unnecessary
1873 // "S_WAITCNT vmcnt(0)".
1874 break;
1875 default:
1876 llvm_unreachable("Unsupported synchronization scope");
1877 }
1878 }
1879
1880 if (Pos == Position::AFTER)
1881 --MI;
1882
1883 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1884 // S_WAITCNT needed.
1885 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1886 IsCrossAddrSpaceOrdering, Pos);
1887
1888 return Changed;
1889}
1890
1891bool SIGfx10CacheControl::enableLoadCacheBypass(
1892 const MachineBasicBlock::iterator &MI,
1893 SIAtomicScope Scope,
1894 SIAtomicAddrSpace AddrSpace) const {
1895 assert(MI->mayLoad() && !MI->mayStore());
1896 bool Changed = false;
1897
1898 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1899 switch (Scope) {
1900 case SIAtomicScope::SYSTEM:
1901 case SIAtomicScope::AGENT:
1902 // Set the L0 and L1 cache policies to MISS_EVICT.
1903 // Note: there is no L2 cache coherent bypass control at the ISA level.
1904 Changed |= enableGLCBit(MI);
1905 Changed |= enableDLCBit(MI);
1906 break;
1907 case SIAtomicScope::WORKGROUP:
1908 // In WGP mode the waves of a work-group can be executing on either CU of
1909 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1910 // CU mode all waves of a work-group are on the same CU, and so the L0
1911 // does not need to be bypassed.
1912 if (!ST.isCuModeEnabled())
1913 Changed |= enableGLCBit(MI);
1914 break;
1915 case SIAtomicScope::WAVEFRONT:
1916 case SIAtomicScope::SINGLETHREAD:
1917 // No cache to bypass.
1918 break;
1919 default:
1920 llvm_unreachable("Unsupported synchronization scope");
1921 }
1922 }
1923
1924 /// The scratch address space does not need the global memory caches
1925 /// to be bypassed as all memory operations by the same thread are
1926 /// sequentially consistent, and no other thread can access scratch
1927 /// memory.
1928
1929 /// Other address spaces do not have a cache.
1930
1931 return Changed;
1932}
1933
1934bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1935 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1936 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1937
1938 // Only handle load and store, not atomic read-modify-write instructions. The
1939 // latter use glc to indicate if the atomic returns a result and so must not
1940 // be used for cache control.
1941 assert(MI->mayLoad() ^ MI->mayStore());
1942
1943 // Only update load and store, not LLVM IR atomic read-modify-write
1944 // instructions. The latter are always marked as volatile, so they cannot
1945 // sensibly be handled here without pessimizing all atomics. They also do
1946 // not support the nontemporal attribute.
1947 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1948
1949 bool Changed = false;
1950
1951 if (IsVolatile) {
1952 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1953 // and MISS_LRU for store instructions.
1954 // Note: there is no L2 cache coherent bypass control at the ISA level.
1955 if (Op == SIMemOp::LOAD) {
1956 Changed |= enableGLCBit(MI);
1957 Changed |= enableDLCBit(MI);
1958 }
1959
1960 // Ensure operation has completed at system scope to cause all volatile
1961 // operations to be visible outside the program in a global order. Do not
1962 // request cross address space as only the global address space can be
1963 // observable outside the program, so no need to cause a waitcnt for LDS
1964 // address space operations.
1965 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1966 Position::AFTER);
1967 return Changed;
1968 }
1969
1970 if (IsNonTemporal) {
1971 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1972 // and L2 cache policy to STREAM.
1973 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1974 // to MISS_EVICT and the L2 cache policy to STREAM.
1975 if (Op == SIMemOp::STORE)
1976 Changed |= enableGLCBit(MI);
1977 Changed |= enableSLCBit(MI);
1978
1979 return Changed;
1980 }
1981
1982 return Changed;
1983}
1984
1985bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1986 SIAtomicScope Scope,
1987 SIAtomicAddrSpace AddrSpace,
1988 SIMemOp Op,
1989 bool IsCrossAddrSpaceOrdering,
1990 Position Pos) const {
1991 bool Changed = false;
1992
1993 MachineBasicBlock &MBB = *MI->getParent();
1994 DebugLoc DL = MI->getDebugLoc();
1995
1996 if (Pos == Position::AFTER)
1997 ++MI;
1998
1999 bool VMCnt = false;
2000 bool VSCnt = false;
2001 bool LGKMCnt = false;
2002
2003 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2004 SIAtomicAddrSpace::NONE) {
2005 switch (Scope) {
2006 case SIAtomicScope::SYSTEM:
2007 case SIAtomicScope::AGENT:
2008 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2009 VMCnt |= true;
2010 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2011 VSCnt |= true;
2012 break;
2013 case SIAtomicScope::WORKGROUP:
2014 // In WGP mode the waves of a work-group can be executing on either CU of
2015 // the WGP. Therefore need to wait for operations to complete to ensure
2016 // they are visible to waves in the other CU as the L0 is per CU.
2017 // Otherwise in CU mode all waves of a work-group are on the same CU,
2018 // which shares the same L0.
2019 if (!ST.isCuModeEnabled()) {
2020 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2021 VMCnt |= true;
2022 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2023 VSCnt |= true;
2024 }
2025 break;
2026 case SIAtomicScope::WAVEFRONT:
2027 case SIAtomicScope::SINGLETHREAD:
2028 // The L0 cache keeps all memory operations in order for
2029 // work-items in the same wavefront.
2030 break;
2031 default:
2032 llvm_unreachable("Unsupported synchronization scope");
2033 }
2034 }
2035
2036 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2037 switch (Scope) {
2038 case SIAtomicScope::SYSTEM:
2039 case SIAtomicScope::AGENT:
2040 case SIAtomicScope::WORKGROUP:
2041 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2042 // not needed as LDS operations for all waves are executed in a total
2043 // global ordering as observed by all waves. Required if also
2044 // synchronizing with global/GDS memory as LDS operations could be
2045 // reordered with respect to later global/GDS memory operations of the
2046 // same wave.
2047 LGKMCnt |= IsCrossAddrSpaceOrdering;
2048 break;
2049 case SIAtomicScope::WAVEFRONT:
2050 case SIAtomicScope::SINGLETHREAD:
2051 // The LDS keeps all memory operations in order for
2052 // the same wavefront.
2053 break;
2054 default:
2055 llvm_unreachable("Unsupported synchronization scope");
2056 }
2057 }
2058
2059 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
2060 switch (Scope) {
2061 case SIAtomicScope::SYSTEM:
2062 case SIAtomicScope::AGENT:
2063 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
2064 // is not needed as GDS operations for all waves are executed in a total
2065 // global ordering as observed by all waves. Required if also
2066 // synchronizing with global/LDS memory as GDS operations could be
2067 // reordered with respect to later global/LDS memory operations of the
2068 // same wave.
2069 LGKMCnt |= IsCrossAddrSpaceOrdering;
2070 break;
2071 case SIAtomicScope::WORKGROUP:
2072 case SIAtomicScope::WAVEFRONT:
2073 case SIAtomicScope::SINGLETHREAD:
2074 // The GDS keeps all memory operations in order for
2075 // the same work-group.
2076 break;
2077 default:
2078 llvm_unreachable("Unsupported synchronization scope");
2079 }
2080 }
2081
2082 if (VMCnt || LGKMCnt) {
2082 unsigned WaitCntImmediate =
2083 AMDGPU::encodeWaitcnt(IV,
2085 VMCnt ? 0 : getVmcntBitMask(IV),
2086 getExpcntBitMask(IV),
2087 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2088 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2089 .addImm(WaitCntImmediate);
2090 Changed = true;
2091 }
2092
2093 if (VSCnt) {
2094 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2095 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2096 .addImm(0);
2097 Changed = true;
2098 }
2099
2100 if (Pos == Position::AFTER)
2101 --MI;
2102
2103 return Changed;
2104}
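
// A standalone sketch of the counter-selection logic in the GFX10 insertWait
// above: which of vmcnt / vscnt / lgkmcnt must be waited on for a given scope,
// address-space set and memory-op kind. The enums and struct below are local
// stand-ins for the pass's types; the decision table mirrors the switches above.
#include <cstdint>

enum class Scope { SingleThread, Wavefront, Workgroup, Agent, System };
enum AddrSpace : uint32_t { GLOBAL = 1, LDS = 2, SCRATCH = 4, GDS = 8 };
enum MemOp : uint32_t { LOAD = 1, STORE = 2 };

struct Counters { bool VMCnt = false, VSCnt = false, LGKMCnt = false; };

Counters countersToWaitOn(Scope S, uint32_t AS, uint32_t Op,
                          bool IsCrossAddrSpaceOrdering, bool CuMode) {
  Counters C;
  if (AS & (GLOBAL | SCRATCH)) {
    bool NeedsVMEMWait =
        S == Scope::System || S == Scope::Agent ||
        (S == Scope::Workgroup && !CuMode); // L0 is per CU in WGP mode.
    if (NeedsVMEMWait) {
      C.VMCnt = (Op & LOAD) != 0;
      C.VSCnt = (Op & STORE) != 0;
    }
  }
  if (AS & LDS) {
    // Only needed to order LDS against later global/GDS operations.
    if (S != Scope::Wavefront && S != Scope::SingleThread)
      C.LGKMCnt |= IsCrossAddrSpaceOrdering;
  }
  if (AS & GDS) {
    if (S == Scope::System || S == Scope::Agent)
      C.LGKMCnt |= IsCrossAddrSpaceOrdering;
  }
  return C;
}

int main() {
  // An agent-scope wait for a global load needs vmcnt only.
  Counters C = countersToWaitOn(Scope::Agent, GLOBAL, LOAD,
                                /*IsCrossAddrSpaceOrdering=*/false,
                                /*CuMode=*/false);
  return (C.VMCnt && !C.VSCnt && !C.LGKMCnt) ? 0 : 1;
}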
2105
2106bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2107 SIAtomicScope Scope,
2108 SIAtomicAddrSpace AddrSpace,
2109 Position Pos) const {
2110 if (!InsertCacheInv)
2111 return false;
2112
2113 bool Changed = false;
2114
2115 MachineBasicBlock &MBB = *MI->getParent();
2116 DebugLoc DL = MI->getDebugLoc();
2117
2118 if (Pos == Position::AFTER)
2119 ++MI;
2120
2121 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2122 switch (Scope) {
2123 case SIAtomicScope::SYSTEM:
2124 case SIAtomicScope::AGENT:
2125 // The order of invalidates matters here. We must invalidate "outer in"
2126 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2127 // invalidated.
2128 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2129 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2130 Changed = true;
2131 break;
2132 case SIAtomicScope::WORKGROUP:
2133 // In WGP mode the waves of a work-group can be executing on either CU of
2134 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2135 // in CU mode all waves of a work-group are on the same CU, and so the
2136 // L0 does not need to be invalidated.
2137 if (!ST.isCuModeEnabled()) {
2138 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2139 Changed = true;
2140 }
2141 break;
2142 case SIAtomicScope::WAVEFRONT:
2143 case SIAtomicScope::SINGLETHREAD:
2144 // No cache to invalidate.
2145 break;
2146 default:
2147 llvm_unreachable("Unsupported synchronization scope");
2148 }
2149 }
2150
2151 /// The scratch address space does not need the global memory cache
2152 /// to be flushed as all memory operations by the same thread are
2153 /// sequentially consistent, and no other thread can access scratch
2154 /// memory.
2155
2156 /// Other address spaces do not have a cache.
2157
2158 if (Pos == Position::AFTER)
2159 --MI;
2160
2161 return Changed;
2162}
2163
2164bool SIGfx11CacheControl::enableLoadCacheBypass(
2165 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2166 SIAtomicAddrSpace AddrSpace) const {
2167 assert(MI->mayLoad() && !MI->mayStore());
2168 bool Changed = false;
2169
2170 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2171 switch (Scope) {
2172 case SIAtomicScope::SYSTEM:
2173 case SIAtomicScope::AGENT:
2174 // Set the L0 and L1 cache policies to MISS_EVICT.
2175 // Note: there is no L2 cache coherent bypass control at the ISA level.
2176 Changed |= enableGLCBit(MI);
2177 break;
2178 case SIAtomicScope::WORKGROUP:
2179 // In WGP mode the waves of a work-group can be executing on either CU of
2180 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2181 // CU mode all waves of a work-group are on the same CU, and so the L0
2182 // does not need to be bypassed.
2183 if (!ST.isCuModeEnabled())
2184 Changed |= enableGLCBit(MI);
2185 break;
2186 case SIAtomicScope::WAVEFRONT:
2187 case SIAtomicScope::SINGLETHREAD:
2188 // No cache to bypass.
2189 break;
2190 default:
2191 llvm_unreachable("Unsupported synchronization scope");
2192 }
2193 }
2194
2195 /// The scratch address space does not need the global memory caches
2196 /// to be bypassed as all memory operations by the same thread are
2197 /// sequentially consistent, and no other thread can access scratch
2198 /// memory.
2199
2200 /// Other address spaces do not have a cache.
2201
2202 return Changed;
2203}
2204
2205bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2206 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2207 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2208
2209 // Only handle load and store, not atomic read-modify-write instructions. The
2210 // latter use glc to indicate if the atomic returns a result and so must not
2211 // be used for cache control.
2212 assert(MI->mayLoad() ^ MI->mayStore());
2213
2214 // Only update load and store, not LLVM IR atomic read-modify-write
2215 // instructions. The latter are always marked as volatile, so they cannot
2216 // sensibly be handled here without pessimizing all atomics. They also do
2217 // not support the nontemporal attribute.
2218 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2219
2220 bool Changed = false;
2221
2222 if (IsVolatile) {
2223 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2224 // and MISS_LRU for store instructions.
2225 // Note: there is no L2 cache coherent bypass control at the ISA level.
2226 if (Op == SIMemOp::LOAD)
2227 Changed |= enableGLCBit(MI);
2228
2229 // Set MALL NOALLOC for load and store instructions.
2230 Changed |= enableDLCBit(MI);
2231
2232 // Ensure operation has completed at system scope to cause all volatile
2233 // operations to be visible outside the program in a global order. Do not
2234 // request cross address space as only the global address space can be
2235 // observable outside the program, so no need to cause a waitcnt for LDS
2236 // address space operations.
2237 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2238 Position::AFTER);
2239 return Changed;
2240 }
2241
2242 if (IsNonTemporal) {
2243 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2244 // and L2 cache policy to STREAM.
2245 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2246 // to MISS_EVICT and the L2 cache policy to STREAM.
2247 if (Op == SIMemOp::STORE)
2248 Changed |= enableGLCBit(MI);
2249 Changed |= enableSLCBit(MI);
2250
2251 // Set MALL NOALLOC for load and store instructions.
2252 Changed |= enableDLCBit(MI);
2253 return Changed;
2254 }
2255
2256 return Changed;
2257}
2258
2259bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2260 AMDGPU::CPol::CPol Value) const {
2261 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2262 if (!CPol)
2263 return false;
2264
2265 uint64_t NewTH = Value & AMDGPU::CPol::TH;
2266 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2267 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2268 return true;
2269 }
2270
2271 return false;
2272}
2273
2274bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2275 AMDGPU::CPol::CPol Value) const {
2276 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2277 if (!CPol)
2278 return false;
2279
2280 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2281 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2282 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2283 return true;
2284 }
2285
2286 return false;
2287}
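
// A standalone sketch of the read-modify-write pattern that setTH and setScope
// above apply to the cpol operand: mask out the field, OR in the new value, and
// report whether anything actually changed. The mask value is an illustrative
// placeholder, not the real AMDGPU::CPol encoding.
#include <cstdint>

constexpr uint64_t FIELD_MASK = 0x7; // placeholder for e.g. CPol::TH or CPol::SCOPE

// Returns true iff the field was modified, mirroring the "Changed" protocol
// used by the cache-control hooks.
bool setField(uint64_t &CPol, uint64_t Value, uint64_t Mask = FIELD_MASK) {
  uint64_t NewBits = Value & Mask;
  if ((CPol & Mask) == NewBits)
    return false;
  CPol = (CPol & ~Mask) | NewBits;
  return true;
}

int main() {
  uint64_t CPol = 0;
  bool Changed = setField(CPol, 0x5); // field updated -> true
  Changed |= setField(CPol, 0x5);     // already set   -> no further change
  return (Changed && CPol == 0x5) ? 0 : 1;
}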
2288
2289bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2290 const MachineBasicBlock::iterator MI) const {
2291 // TODO: implement flag for frontend to give us a hint not to insert waits.
2292
2293 MachineBasicBlock &MBB = *MI->getParent();
2294 const DebugLoc &DL = MI->getDebugLoc();
2295
2296 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2297 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2298 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2299 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2300 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2301
2302 return true;
2303}
2304
2305bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2306 SIAtomicScope Scope,
2307 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2308 bool IsCrossAddrSpaceOrdering,
2309 Position Pos) const {
2310 bool Changed = false;
2311
2312 MachineBasicBlock &MBB = *MI->getParent();
2313 DebugLoc DL = MI->getDebugLoc();
2314
2315 bool LOADCnt = false;
2316 bool DSCnt = false;
2317 bool STORECnt = false;
2318
2319 if (Pos == Position::AFTER)
2320 ++MI;
2321
2322 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2323 SIAtomicAddrSpace::NONE) {
2324 switch (Scope) {
2325 case SIAtomicScope::SYSTEM:
2326 case SIAtomicScope::AGENT:
2327 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2328 LOADCnt |= true;
2329 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2330 STORECnt |= true;
2331 break;
2332 case SIAtomicScope::WORKGROUP:
2333 // In WGP mode the waves of a work-group can be executing on either CU of
2334 // the WGP. Therefore need to wait for operations to complete to ensure
2335 // they are visible to waves in the other CU as the L0 is per CU.
2336 // Otherwise in CU mode all waves of a work-group are on the same CU,
2337 // which shares the same L0.
2338 if (!ST.isCuModeEnabled()) {
2339 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2340 LOADCnt |= true;
2341 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2342 STORECnt |= true;
2343 }
2344 break;
2345 case SIAtomicScope::WAVEFRONT:
2346 case SIAtomicScope::SINGLETHREAD:
2347 // The L0 cache keeps all memory operations in order for
2348 // work-items in the same wavefront.
2349 break;
2350 default:
2351 llvm_unreachable("Unsupported synchronization scope");
2352 }
2353 }
2354
2355 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2356 switch (Scope) {
2357 case SIAtomicScope::SYSTEM:
2358 case SIAtomicScope::AGENT:
2359 case SIAtomicScope::WORKGROUP:
2360 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2361 // not needed as LDS operations for all waves are executed in a total
2362 // global ordering as observed by all waves. Required if also
2363 // synchronizing with global/GDS memory as LDS operations could be
2364 // reordered with respect to later global/GDS memory operations of the
2365 // same wave.
2366 DSCnt |= IsCrossAddrSpaceOrdering;
2367 break;
2368 case SIAtomicScope::WAVEFRONT:
2369 case SIAtomicScope::SINGLETHREAD:
2370 // The LDS keeps all memory operations in order for
2371 // the same wavefront.
2372 break;
2373 default:
2374 llvm_unreachable("Unsupported synchronization scope");
2375 }
2376 }
2377
2378 if (LOADCnt) {
2379 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2380 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2381 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2382 Changed = true;
2383 }
2384
2385 if (STORECnt) {
2386 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2387 Changed = true;
2388 }
2389
2390 if (DSCnt) {
2391 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2392 Changed = true;
2393 }
2394
2395 if (Pos == Position::AFTER)
2396 --MI;
2397
2398 return Changed;
2399}
2400
2401bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2402 SIAtomicScope Scope,
2403 SIAtomicAddrSpace AddrSpace,
2404 Position Pos) const {
2405 if (!InsertCacheInv)
2406 return false;
2407
2408 MachineBasicBlock &MBB = *MI->getParent();
2409 DebugLoc DL = MI->getDebugLoc();
2410
2411 /// The scratch address space does not need the global memory cache
2412 /// to be flushed as all memory operations by the same thread are
2413 /// sequentially consistent, and no other thread can access scratch
2414 /// memory.
2415
2416 /// Other address spaces do not have a cache.
2417 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2418 return false;
2419
2420 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2421 switch (Scope) {
2422 case SIAtomicScope::SYSTEM:
2423 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2424 break;
2425 case SIAtomicScope::AGENT:
2426 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2427 break;
2428 case SIAtomicScope::WORKGROUP:
2429 // In WGP mode the waves of a work-group can be executing on either CU of
2430 // the WGP. Therefore we need to invalidate the L0 which is per CU.
2431 // Otherwise in CU mode all waves of a work-group are on the same CU, and so
2432 // the L0 does not need to be invalidated.
2433 if (ST.isCuModeEnabled())
2434 return false;
2435
2436 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2437 break;
2438 case SIAtomicScope::WAVEFRONT:
2439 case SIAtomicScope::SINGLETHREAD:
2440 // No cache to invalidate.
2441 return false;
2442 default:
2443 llvm_unreachable("Unsupported synchronization scope");
2444 }
2445
2446 if (Pos == Position::AFTER)
2447 ++MI;
2448
2449 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2450
2451 if (Pos == Position::AFTER)
2452 --MI;
2453
2454 return true;
2455}
2456
2457bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2458 SIAtomicScope Scope,
2459 SIAtomicAddrSpace AddrSpace,
2460 bool IsCrossAddrSpaceOrdering,
2461 Position Pos) const {
2462 MachineBasicBlock &MBB = *MI->getParent();
2463 DebugLoc DL = MI->getDebugLoc();
2464
2465 // The scratch address space does not need the global memory cache
2466 // writeback as all memory operations by the same thread are
2467 // sequentially consistent, and no other thread can access scratch
2468 // memory.
2469
2470 // Other address spaces do not have a cache.
2471 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2472 return false;
2473
2474 if (Pos == Position::AFTER)
2475 ++MI;
2476
2477 // GLOBAL_WB is always needed, even for write-through caches, as it
2478 // additionally ensures all operations have reached the desired cache level.
2479 bool SkipWB = false;
2480 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2481 switch (Scope) {
2482 case SIAtomicScope::SYSTEM:
2483 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2484 break;
2485 case SIAtomicScope::AGENT:
2486 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2487 break;
2488 case SIAtomicScope::WORKGROUP:
2489 // In WGP mode the waves of a work-group can be executing on either CU of
2490 // the WGP. Therefore we need to ensure all operations have reached L1,
2491 // hence the SCOPE_SE WB.
2492 // For CU mode, we need operations to reach L0, so the wait is enough -
2493 // there are no ways for an operation to report completion without reaching
2494 // at least L0.
2495 if (ST.isCuModeEnabled())
2496 SkipWB = true;
2497 else
2498 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2499 break;
2500 case SIAtomicScope::WAVEFRONT:
2501 case SIAtomicScope::SINGLETHREAD:
2502 // No cache to invalidate.
2503 return false;
2504 default:
2505 llvm_unreachable("Unsupported synchronization scope");
2506 }
2507
2508 if (!SkipWB)
2509 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(ScopeImm);
2510
2511 if (Pos == Position::AFTER)
2512 --MI;
2513
2514 // We always have to wait for previous memory operations (load/store) to
2515 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2516 // we of course need to wait for that as well.
2517 insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2518 IsCrossAddrSpaceOrdering, Pos);
2519
2520 return true;
2521}
2522
2523bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2524 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2525 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2526
2527 // Only handle load and store, not atomic read-modify-write instructions.
2528 assert(MI->mayLoad() ^ MI->mayStore());
2529
2530 // Only update load and store, not LLVM IR atomic read-modify-write
2531 // instructions. The latter are always marked as volatile, so they cannot
2532 // sensibly be handled here without pessimizing all atomics. They also do
2533 // not support the nontemporal attribute.
2534 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2535
2536 bool Changed = false;
2537
2538 if (IsLastUse) {
2539 // Set last-use hint.
2540 Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2541 } else if (IsNonTemporal) {
2542 // Set non-temporal hint for all cache levels.
2543 Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2544 }
2545
2546 if (IsVolatile) {
2547 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2548
2549 if (Op == SIMemOp::STORE)
2550 Changed |= insertWaitsBeforeSystemScopeStore(MI);
2551
2552 // Ensure operation has completed at system scope to cause all volatile
2553 // operations to be visible outside the program in a global order. Do not
2554 // request cross address space as only the global address space can be
2555 // observable outside the program, so no need to cause a waitcnt for LDS
2556 // address space operations.
2557 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2558 Position::AFTER);
2559 }
2560
2561 return Changed;
2562}
2563
2564bool SIGfx12CacheControl::expandSystemScopeStore(
2565 MachineBasicBlock::iterator &MI) const {
2566 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2567 if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2568 return insertWaitsBeforeSystemScopeStore(MI);
2569
2570 return false;
2571}
2572
2573bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2574 SIAtomicScope Scope,
2575 SIAtomicAddrSpace AddrSpace) const {
2576 bool Changed = false;
2577
2578 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2579 switch (Scope) {
2580 case SIAtomicScope::SYSTEM:
2581 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2582 break;
2583 case SIAtomicScope::AGENT:
2584 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2585 break;
2586 case SIAtomicScope::WORKGROUP:
2587 // In workgroup mode, SCOPE_SE is needed as waves can execute on
2588 // different CUs that access different L0s.
2589 if (!ST.isCuModeEnabled())
2590 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2591 break;
2592 case SIAtomicScope::WAVEFRONT:
2593 case SIAtomicScope::SINGLETHREAD:
2594 // No cache to bypass.
2595 break;
2596 default:
2597 llvm_unreachable("Unsupported synchronization scope");
2598 }
2599 }
2600
2601 // The scratch address space does not need the global memory caches
2602 // to be bypassed as all memory operations by the same thread are
2603 // sequentially consistent, and no other thread can access scratch
2604 // memory.
2605
2606 // Other address spaces do not have a cache.
2607
2608 return Changed;
2609}
2610
2611bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2612 if (AtomicPseudoMIs.empty())
2613 return false;
2614
2615 for (auto &MI : AtomicPseudoMIs)
2616 MI->eraseFromParent();
2617
2618 AtomicPseudoMIs.clear();
2619 return true;
2620}
2621
2622bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2623 MachineBasicBlock::iterator &MI) {
2624 assert(MI->mayLoad() && !MI->mayStore());
2625
2626 bool Changed = false;
2627
2628 if (MOI.isAtomic()) {
2629 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2630 MOI.getOrdering() == AtomicOrdering::Acquire ||
2631 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2632 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2633 MOI.getOrderingAddrSpace());
2634 }
2635
2636 if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2637 Changed |= CC->insertWait(MI, MOI.getScope(),
2638 MOI.getOrderingAddrSpace(),
2639 SIMemOp::LOAD | SIMemOp::STORE,
2640 MOI.getIsCrossAddressSpaceOrdering(),
2641 Position::BEFORE);
2642
2643 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2644 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2645 Changed |= CC->insertWait(MI, MOI.getScope(),
2646 MOI.getInstrAddrSpace(),
2647 SIMemOp::LOAD,
2648 MOI.getIsCrossAddressSpaceOrdering(),
2649 Position::AFTER);
2650 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2651 MOI.getOrderingAddrSpace(),
2652 Position::AFTER);
2653 }
2654
2655 return Changed;
2656 }
2657
2658 // Atomic instructions already bypass caches to the scope specified by the
2659 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2660 // instructions need additional treatment.
2661 Changed |= CC->enableVolatileAndOrNonTemporal(
2662 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2663 MOI.isNonTemporal(), MOI.isLastUse());
2664
2665 return Changed;
2666}
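
// A standalone sketch of the ordering dispatch in expandLoad above: which
// legalizer actions an atomic load needs for each of the atomic orderings the
// function handles. The enum and struct are local stand-ins; the table mirrors
// the if-chain in the function above.
#include <cassert>

enum class Ordering { Monotonic, Acquire, SequentiallyConsistent };

struct LoadActions {
  bool BypassCaches = false;        // enableLoadCacheBypass
  bool WaitBefore = false;          // insertWait(..., Position::BEFORE)
  bool WaitAndAcquireAfter = false; // insertWait + insertAcquire after the load
};

LoadActions actionsForAtomicLoad(Ordering O) {
  LoadActions A;
  A.BypassCaches = true; // monotonic, acquire and seq_cst all bypass caches.
  A.WaitBefore = (O == Ordering::SequentiallyConsistent);
  A.WaitAndAcquireAfter =
      (O == Ordering::Acquire || O == Ordering::SequentiallyConsistent);
  return A;
}

int main() {
  LoadActions A = actionsForAtomicLoad(Ordering::Acquire);
  assert(A.BypassCaches && !A.WaitBefore && A.WaitAndAcquireAfter);
  return 0;
}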
2667
2668bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2669 MachineBasicBlock::iterator &MI) {
2670 assert(!MI->mayLoad() && MI->mayStore());
2671
2672 bool Changed = false;
2673
2674 if (MOI.isAtomic()) {
2675 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2676 MOI.getOrdering() == AtomicOrdering::Release ||
2677 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2678 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2679 MOI.getOrderingAddrSpace());
2680 }
2681
2682 if (MOI.getOrdering() == AtomicOrdering::Release ||
2683 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2684 Changed |= CC->insertRelease(MI, MOI.getScope(),
2685 MOI.getOrderingAddrSpace(),
2686 MOI.getIsCrossAddressSpaceOrdering(),
2687 Position::BEFORE);
2688
2689 return Changed;
2690 }
2691
2692 // Atomic instructions already bypass caches to the scope specified by the
2693 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2694 // need additional treatment.
2695 Changed |= CC->enableVolatileAndOrNonTemporal(
2696 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2697 MOI.isNonTemporal());
2698
2699 // GFX12 specific: scope (the desired coherence domain in the cache
2700 // hierarchy) is an instruction field; do not confuse it with the atomic scope.
2701 Changed |= CC->expandSystemScopeStore(MI);
2702 return Changed;
2703}
2704
2705bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2706 MachineBasicBlock::iterator &MI) {
2707 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2708
2709 AtomicPseudoMIs.push_back(MI);
2710 bool Changed = false;
2711
2712 // Refine fenced address space based on MMRAs.
2713 //
2714 // TODO: Should we support this MMRA on other atomic operations?
2715 auto OrderingAddrSpace =
2716 getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());
2717
2718 if (MOI.isAtomic()) {
2719 if (MOI.getOrdering() == AtomicOrdering::Acquire)
2720 Changed |= CC->insertWait(
2721 MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2722 MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE);
2723
2724 if (MOI.getOrdering() == AtomicOrdering::Release ||
2725 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2726 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2727 /// TODO: This relies on a barrier always generating a waitcnt
2728 /// for LDS to ensure it is not reordered with the completion of
2729 /// the preceding LDS operations. If barrier had a memory
2730 /// ordering and memory scope, then library does not need to
2731 /// generate a fence. Could add support in this file for
2732 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2733 /// adding S_WAITCNT before a S_BARRIER.
2734 Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2735 MOI.getIsCrossAddressSpaceOrdering(),
2736 Position::BEFORE);
2737
2738 // TODO: If both release and invalidate are happening they could be combined
2739 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2740 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2741 // track cache invalidate and write back instructions.
2742
2743 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2744 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2745 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2746 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2747 Position::BEFORE);
2748
2749 return Changed;
2750 }
2751
2752 return Changed;
2753}
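
// A standalone sketch of the fence handling in expandAtomicFence above:
// acquire-only fences get a wait before, release-flavoured fences get a
// release before, and acquire-flavoured fences also get a cache invalidate.
// Local stand-ins only; the table mirrors the if-chain in the function above.
#include <cassert>

enum class Ordering { Acquire, Release, AcquireRelease, SequentiallyConsistent };

struct FenceActions {
  bool WaitBefore = false;    // insertWait for acquire-only fences
  bool ReleaseBefore = false; // insertRelease
  bool AcquireBefore = false; // insertAcquire
};

FenceActions actionsForFence(Ordering O) {
  FenceActions A;
  A.WaitBefore = (O == Ordering::Acquire);
  A.ReleaseBefore = (O == Ordering::Release || O == Ordering::AcquireRelease ||
                     O == Ordering::SequentiallyConsistent);
  A.AcquireBefore = (O == Ordering::Acquire || O == Ordering::AcquireRelease ||
                     O == Ordering::SequentiallyConsistent);
  return A;
}

int main() {
  FenceActions A = actionsForFence(Ordering::SequentiallyConsistent);
  assert(!A.WaitBefore && A.ReleaseBefore && A.AcquireBefore);
  return 0;
}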
2754
2755bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2756 MachineBasicBlock::iterator &MI) {
2757 assert(MI->mayLoad() && MI->mayStore());
2758
2759 bool Changed = false;
2760
2761 if (MOI.isAtomic()) {
2762 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2763 MOI.getOrdering() == AtomicOrdering::Acquire ||
2764 MOI.getOrdering() == AtomicOrdering::Release ||
2765 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2766 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2767 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2768 MOI.getInstrAddrSpace());
2769 }
2770
2771 if (MOI.getOrdering() == AtomicOrdering::Release ||
2772 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2773 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2774 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2775 Changed |= CC->insertRelease(MI, MOI.getScope(),
2776 MOI.getOrderingAddrSpace(),
2777 MOI.getIsCrossAddressSpaceOrdering(),
2778 Position::BEFORE);
2779
2780 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2781 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2782 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2783 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2784 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2785 Changed |= CC->insertWait(MI, MOI.getScope(),
2786 MOI.getInstrAddrSpace(),
2787 isAtomicRet(*MI) ? SIMemOp::LOAD :
2788 SIMemOp::STORE,
2789 MOI.getIsCrossAddressSpaceOrdering(),
2790 Position::AFTER);
2791 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2792 MOI.getOrderingAddrSpace(),
2793 Position::AFTER);
2794 }
2795
2796 return Changed;
2797 }
2798
2799 return Changed;
2800}
2801
2802bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2803 bool Changed = false;
2804
2805 SIMemOpAccess MOA(MF);
2806 CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2807
2808 for (auto &MBB : MF) {
2809 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2810
2811 // Unbundle instructions after the post-RA scheduler.
2812 if (MI->isBundle() && MI->mayLoadOrStore()) {
2813 MachineBasicBlock::instr_iterator II(MI->getIterator());
2814 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2815 I != E && I->isBundledWithPred(); ++I) {
2816 I->unbundleFromPred();
2817 for (MachineOperand &MO : I->operands())
2818 if (MO.isReg())
2819 MO.setIsInternalRead(false);
2820 }
2821
2822 MI->eraseFromParent();
2823 MI = II->getIterator();
2824 }
2825
2826 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2827 continue;
2828
2829 if (const auto &MOI = MOA.getLoadInfo(MI))
2830 Changed |= expandLoad(*MOI, MI);
2831 else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2832 Changed |= expandStore(*MOI, MI);
2833 Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
2834 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2835 Changed |= expandAtomicFence(*MOI, MI);
2836 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2837 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2838 }
2839 }
2840
2841 Changed |= removeAtomicPseudoMIs();
2842 return Changed;
2843}
2844
2845INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2846
2847char SIMemoryLegalizer::ID = 0;
2848char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2849
2850FunctionPass *llvm::createSIMemoryLegalizerPass() {
2851 return new SIMemoryLegalizer();
2852}