1//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Memory legalizer - implements memory model. More information can be
11/// found here:
12/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13//
14//===----------------------------------------------------------------------===//
15
16#include "AMDGPU.h"
17#include "AMDGPUMachineModuleInfo.h"
18#include "GCNSubtarget.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "llvm/ADT/BitmaskEnum.h"
21#include "llvm/CodeGen/MachineBasicBlock.h"
22#include "llvm/CodeGen/MachineFunctionPass.h"
23#include "llvm/IR/DiagnosticInfo.h"
24#include "llvm/Support/AtomicOrdering.h"
25#include "llvm/TargetParser/TargetParser.h"
26
27using namespace llvm;
28using namespace llvm::AMDGPU;
29
30#define DEBUG_TYPE "si-memory-legalizer"
31#define PASS_NAME "SI Memory Legalizer"
32
34 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
35 cl::desc("Use this to skip inserting cache invalidating instructions."));
36
37namespace {
38
39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
40
41/// Memory operation flags. Can be ORed together.
42enum class SIMemOp {
43 NONE = 0u,
44 LOAD = 1u << 0,
45 STORE = 1u << 1,
46 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
47};
48
49/// Position to insert a new instruction relative to an existing
50/// instruction.
51enum class Position {
52 BEFORE,
53 AFTER
54};
55
56/// The atomic synchronization scopes supported by the AMDGPU target.
57enum class SIAtomicScope {
58 NONE,
59 SINGLETHREAD,
60 WAVEFRONT,
61 WORKGROUP,
62 AGENT,
63 SYSTEM
64};
65
66/// The distinct address spaces supported by the AMDGPU target for
67/// atomic memory operation. Can be ORed together.
68enum class SIAtomicAddrSpace {
69 NONE = 0u,
70 GLOBAL = 1u << 0,
71 LDS = 1u << 1,
72 SCRATCH = 1u << 2,
73 GDS = 1u << 3,
74 OTHER = 1u << 4,
75
76 /// The address spaces that can be accessed by a FLAT instruction.
77 FLAT = GLOBAL | LDS | SCRATCH,
78
79 /// The address spaces that support atomic instructions.
80 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
81
82 /// All address spaces.
83 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
84
85 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
86};
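// Illustrative note: LLVM_MARK_AS_BITMASK_ENUM gives both enum classes above
// the usual bitwise operators, which is how the legalizer combines and tests
// address-space sets throughout this file, e.g.:
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS;
//   bool IsAtomicSpace = (AS & SIAtomicAddrSpace::ATOMIC) != SIAtomicAddrSpace::NONE;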
87
88class SIMemOpInfo final {
89private:
90
91 friend class SIMemOpAccess;
92
93 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
94 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
95 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
96 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
97 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
98 bool IsCrossAddressSpaceOrdering = false;
99 bool IsVolatile = false;
100 bool IsNonTemporal = false;
101
102 SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
103 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
104 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
105 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
106 bool IsCrossAddressSpaceOrdering = true,
107 AtomicOrdering FailureOrdering =
108 AtomicOrdering::SequentiallyConsistent,
109 bool IsVolatile = false,
110 bool IsNonTemporal = false)
111 : Ordering(Ordering), FailureOrdering(FailureOrdering),
112 Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
113 InstrAddrSpace(InstrAddrSpace),
114 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115 IsVolatile(IsVolatile),
116 IsNonTemporal(IsNonTemporal) {
117
118 if (Ordering == AtomicOrdering::NotAtomic) {
119 assert(Scope == SIAtomicScope::NONE &&
120 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
121 !IsCrossAddressSpaceOrdering &&
122 FailureOrdering == AtomicOrdering::NotAtomic);
123 return;
124 }
125
126 assert(Scope != SIAtomicScope::NONE &&
127 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
128 SIAtomicAddrSpace::NONE &&
129 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130 SIAtomicAddrSpace::NONE);
131
132 // There is also no cross address space ordering if the ordering
133 // address space is the same as the instruction address space and
134 // only contains a single address space.
135 if ((OrderingAddrSpace == InstrAddrSpace) &&
136 isPowerOf2_32(uint32_t(InstrAddrSpace)))
137 this->IsCrossAddressSpaceOrdering = false;
138
139 // Limit the scope to the maximum supported by the instruction's address
140 // spaces.
141 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142 SIAtomicAddrSpace::NONE) {
143 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144 } else if ((InstrAddrSpace &
145 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146 SIAtomicAddrSpace::NONE) {
147 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148 } else if ((InstrAddrSpace &
149 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152 }
153 }
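// For example, an atomic that only touches LDS (and possibly scratch) is
// clamped above to at most workgroup scope, since LDS is only shared within a
// work-group, and a scratch-only access is clamped to single-thread scope.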
154
155public:
156 /// \returns Atomic synchronization scope of the machine instruction used to
157 /// create this SIMemOpInfo.
158 SIAtomicScope getScope() const {
159 return Scope;
160 }
161
162 /// \returns Ordering constraint of the machine instruction used to
163 /// create this SIMemOpInfo.
164 AtomicOrdering getOrdering() const {
165 return Ordering;
166 }
167
168 /// \returns Failure ordering constraint of the machine instruction used to
169 /// create this SIMemOpInfo.
170 AtomicOrdering getFailureOrdering() const {
171 return FailureOrdering;
172 }
173
174 /// \returns The address spaces accessed by the machine
175 /// instruction used to create this SIMemOpInfo.
176 SIAtomicAddrSpace getInstrAddrSpace() const {
177 return InstrAddrSpace;
178 }
179
180 /// \returns The address spaces that must be ordered by the machine
181 /// instruction used to create this SIMemOpInfo.
182 SIAtomicAddrSpace getOrderingAddrSpace() const {
183 return OrderingAddrSpace;
184 }
185
186 /// \returns True iff memory ordering of operations on
187 /// different address spaces is required.
188 bool getIsCrossAddressSpaceOrdering() const {
189 return IsCrossAddressSpaceOrdering;
190 }
191
192 /// \returns True if memory access of the machine instruction used to
193 /// create this SIMemOpInfo is volatile, false otherwise.
194 bool isVolatile() const {
195 return IsVolatile;
196 }
197
198 /// \returns True if memory access of the machine instruction used to
199 /// create this SIMemOpInfo is nontemporal, false otherwise.
200 bool isNonTemporal() const {
201 return IsNonTemporal;
202 }
203
204 /// \returns True if ordering constraint of the machine instruction used to
205 /// create this SIMemOpInfo is unordered or higher, false otherwise.
206 bool isAtomic() const {
207 return Ordering != AtomicOrdering::NotAtomic;
208 }
209
210};
211
212class SIMemOpAccess final {
213private:
214 AMDGPUMachineModuleInfo *MMI = nullptr;
215
216 /// Reports unsupported message \p Msg for \p MI to LLVM context.
217 void reportUnsupported(const MachineBasicBlock::iterator &MI,
218 const char *Msg) const;
219
220 /// Inspects the target synchronization scope \p SSID and determines
221 /// the SI atomic scope it corresponds to, the address spaces it
222 /// covers, and whether the memory ordering applies between address
223 /// spaces.
224 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
225 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
226
227 /// \return Return a bit set of the address spaces accessed by \p AS.
228 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
229
230 /// \returns Info constructed from \p MI, which has at least machine memory
231 /// operand.
232 std::optional<SIMemOpInfo>
233 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
234
235public:
236 /// Construct class to support accessing the machine memory operands
237 /// of instructions in the machine function \p MF.
238 SIMemOpAccess(MachineFunction &MF);
239
240 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
241 std::optional<SIMemOpInfo>
242 getLoadInfo(const MachineBasicBlock::iterator &MI) const;
243
244 /// \returns Store info if \p MI is a store operation, "std::nullopt"
245 /// otherwise.
246 std::optional<SIMemOpInfo>
247 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
248
249 /// \returns Atomic fence info if \p MI is an atomic fence operation,
250 /// "std::nullopt" otherwise.
251 std::optional<SIMemOpInfo>
252 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
253
254 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
255 /// rmw operation, "std::nullopt" otherwise.
256 std::optional<SIMemOpInfo>
257 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
258};
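// The four getters above partition the instructions this pass rewrites: plain
// loads (mayLoad && !mayStore), plain stores (!mayLoad && mayStore), the
// ATOMIC_FENCE pseudo, and atomic cmpxchg/rmw operations (mayLoad && mayStore).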
259
260class SICacheControl {
261protected:
262
263 /// AMDGPU subtarget info.
264 const GCNSubtarget &ST;
265
266 /// Instruction info.
267 const SIInstrInfo *TII = nullptr;
268
269 IsaVersion IV;
270
271 /// Whether to insert cache invalidating instructions.
272 bool InsertCacheInv;
273
274 SICacheControl(const GCNSubtarget &ST);
275
276 /// Sets named bit \p BitName to "true" if present in instruction \p MI.
277 /// \returns Returns true if \p MI is modified, false otherwise.
278 bool enableNamedBit(const MachineBasicBlock::iterator MI,
279 AMDGPU::CPol::CPol Bit) const;
280
281public:
282
283 /// Create a cache control for the subtarget \p ST.
284 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
285
286 /// Update \p MI memory load instruction to bypass any caches up to
287 /// the \p Scope memory scope for address spaces \p
288 /// AddrSpace. Return true iff the instruction was modified.
289 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
290 SIAtomicScope Scope,
291 SIAtomicAddrSpace AddrSpace) const = 0;
292
293 /// Update \p MI memory store instruction to bypass any caches up to
294 /// the \p Scope memory scope for address spaces \p
295 /// AddrSpace. Return true iff the instruction was modified.
296 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
297 SIAtomicScope Scope,
298 SIAtomicAddrSpace AddrSpace) const = 0;
299
300 /// Update \p MI memory read-modify-write instruction to bypass any caches up
301 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
302 /// iff the instruction was modified.
303 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
304 SIAtomicScope Scope,
305 SIAtomicAddrSpace AddrSpace) const = 0;
306
307 /// Update \p MI memory instruction of kind \p Op associated with address
308 /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
309 /// true iff the instruction was modified.
310 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
311 SIAtomicAddrSpace AddrSpace,
312 SIMemOp Op, bool IsVolatile,
313 bool IsNonTemporal) const = 0;
314
315 /// Inserts any necessary instructions at position \p Pos relative
316 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
317 /// \p Op associated with address spaces \p AddrSpace have completed. Used
318 /// between memory instructions to enforce the order they become visible as
319 /// observed by other memory instructions executing in memory scope \p Scope.
320 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
321 /// address spaces. Returns true iff any instructions inserted.
322 virtual bool insertWait(MachineBasicBlock::iterator &MI,
323 SIAtomicScope Scope,
324 SIAtomicAddrSpace AddrSpace,
325 SIMemOp Op,
326 bool IsCrossAddrSpaceOrdering,
327 Position Pos) const = 0;
328
329 /// Inserts any necessary instructions at position \p Pos relative to
330 /// instruction \p MI to ensure any subsequent memory instructions of this
331 /// thread with address spaces \p AddrSpace will observe the previous memory
332 /// operations by any thread for memory scopes up to memory scope \p Scope .
333 /// Returns true iff any instructions inserted.
334 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
335 SIAtomicScope Scope,
336 SIAtomicAddrSpace AddrSpace,
337 Position Pos) const = 0;
338
339 /// Inserts any necessary instructions at position \p Pos relative to
340 /// instruction \p MI to ensure previous memory instructions by this thread
341 /// with address spaces \p AddrSpace have completed and can be observed by
342 /// subsequent memory instructions by any thread executing in memory scope \p
343 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
344 /// between address spaces. Returns true iff any instructions inserted.
345 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
346 SIAtomicScope Scope,
347 SIAtomicAddrSpace AddrSpace,
348 bool IsCrossAddrSpaceOrdering,
349 Position Pos) const = 0;
350
351 /// Virtual destructor to allow derivations to be deleted.
352 virtual ~SICacheControl() = default;
353
354 virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
355 MachineBasicBlock::iterator &MI) const {
356 return false;
357 }
358};
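// SICacheControl is specialized per hardware generation below (GFX6/7, GFX90A,
// GFX940, GFX10, GFX11, GFX12); SICacheControl::create() selects the most
// specific implementation that matches the subtarget.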
359
360class SIGfx6CacheControl : public SICacheControl {
361protected:
362
363 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
364 /// is modified, false otherwise.
365 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
366 return enableNamedBit(MI, AMDGPU::CPol::GLC);
367 }
368
369 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
370 /// is modified, false otherwise.
371 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
372 return enableNamedBit(MI, AMDGPU::CPol::SLC);
373 }
374
375public:
376
377 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
378
379 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
380 SIAtomicScope Scope,
381 SIAtomicAddrSpace AddrSpace) const override;
382
383 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
384 SIAtomicScope Scope,
385 SIAtomicAddrSpace AddrSpace) const override;
386
387 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
388 SIAtomicScope Scope,
389 SIAtomicAddrSpace AddrSpace) const override;
390
391 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
392 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
393 bool IsVolatile,
394 bool IsNonTemporal) const override;
395
396 bool insertWait(MachineBasicBlock::iterator &MI,
397 SIAtomicScope Scope,
398 SIAtomicAddrSpace AddrSpace,
399 SIMemOp Op,
400 bool IsCrossAddrSpaceOrdering,
401 Position Pos) const override;
402
403 bool insertAcquire(MachineBasicBlock::iterator &MI,
404 SIAtomicScope Scope,
405 SIAtomicAddrSpace AddrSpace,
406 Position Pos) const override;
407
408 bool insertRelease(MachineBasicBlock::iterator &MI,
409 SIAtomicScope Scope,
410 SIAtomicAddrSpace AddrSpace,
411 bool IsCrossAddrSpaceOrdering,
412 Position Pos) const override;
413};
414
415class SIGfx7CacheControl : public SIGfx6CacheControl {
416public:
417
418 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
419
420 bool insertAcquire(MachineBasicBlock::iterator &MI,
421 SIAtomicScope Scope,
422 SIAtomicAddrSpace AddrSpace,
423 Position Pos) const override;
424
425};
426
427class SIGfx90ACacheControl : public SIGfx7CacheControl {
428public:
429
430 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
431
432 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
433 SIAtomicScope Scope,
434 SIAtomicAddrSpace AddrSpace) const override;
435
436 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
437 SIAtomicScope Scope,
438 SIAtomicAddrSpace AddrSpace) const override;
439
440 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
441 SIAtomicScope Scope,
442 SIAtomicAddrSpace AddrSpace) const override;
443
444 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
445 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
446 bool IsVolatile,
447 bool IsNonTemporal) const override;
448
449 bool insertWait(MachineBasicBlock::iterator &MI,
450 SIAtomicScope Scope,
451 SIAtomicAddrSpace AddrSpace,
452 SIMemOp Op,
453 bool IsCrossAddrSpaceOrdering,
454 Position Pos) const override;
455
456 bool insertAcquire(MachineBasicBlock::iterator &MI,
457 SIAtomicScope Scope,
458 SIAtomicAddrSpace AddrSpace,
459 Position Pos) const override;
460
461 bool insertRelease(MachineBasicBlock::iterator &MI,
462 SIAtomicScope Scope,
463 SIAtomicAddrSpace AddrSpace,
464 bool IsCrossAddrSpaceOrdering,
465 Position Pos) const override;
466};
467
468class SIGfx940CacheControl : public SIGfx90ACacheControl {
469protected:
470
471 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
472 /// is modified, false otherwise.
473 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
474 return enableNamedBit(MI, AMDGPU::CPol::SC0);
475 }
476
477 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
478 /// is modified, false otherwise.
479 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
480 return enableNamedBit(MI, AMDGPU::CPol::SC1);
481 }
482
483 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
484 /// is modified, false otherwise.
485 bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
486 return enableNamedBit(MI, AMDGPU::CPol::NT);
487 }
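// On GFX940 the SC0/SC1 bits together encode the coherence scope used by the
// enable*CacheBypass overrides below: neither bit = wavefront/single-thread,
// SC0 = work-group, SC1 = agent, SC0|SC1 = system.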
488
489public:
490
491 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
492
493 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
494 SIAtomicScope Scope,
495 SIAtomicAddrSpace AddrSpace) const override;
496
497 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
498 SIAtomicScope Scope,
499 SIAtomicAddrSpace AddrSpace) const override;
500
501 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
502 SIAtomicScope Scope,
503 SIAtomicAddrSpace AddrSpace) const override;
504
505 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
506 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
507 bool IsVolatile,
508 bool IsNonTemporal) const override;
509
510 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
511 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
512
513 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
514 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
515 Position Pos) const override;
516
517 bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
518 MachineBasicBlock::iterator &MI) const override {
519 bool Changed = false;
520 if (ST.hasForceStoreSC0SC1() &&
521 (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
522 SIAtomicAddrSpace::GLOBAL |
523 SIAtomicAddrSpace::OTHER)) !=
524 SIAtomicAddrSpace::NONE) {
525 Changed |= enableSC0Bit(MI);
526 Changed |= enableSC1Bit(MI);
527 }
528 return Changed;
529 }
530};
531
532class SIGfx10CacheControl : public SIGfx7CacheControl {
533protected:
534
535 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
536 /// is modified, false otherwise.
537 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
538 return enableNamedBit(MI, AMDGPU::CPol::DLC);
539 }
540
541public:
542
543 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
544
545 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
546 SIAtomicScope Scope,
547 SIAtomicAddrSpace AddrSpace) const override;
548
549 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
550 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
551 bool IsVolatile,
552 bool IsNonTemporal) const override;
553
554 bool insertWait(MachineBasicBlock::iterator &MI,
555 SIAtomicScope Scope,
556 SIAtomicAddrSpace AddrSpace,
557 SIMemOp Op,
558 bool IsCrossAddrSpaceOrdering,
559 Position Pos) const override;
560
561 bool insertAcquire(MachineBasicBlock::iterator &MI,
562 SIAtomicScope Scope,
563 SIAtomicAddrSpace AddrSpace,
564 Position Pos) const override;
565};
566
567class SIGfx11CacheControl : public SIGfx10CacheControl {
568public:
569 SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
570
571 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
572 SIAtomicScope Scope,
573 SIAtomicAddrSpace AddrSpace) const override;
574
575 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
576 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
577 bool IsVolatile,
578 bool IsNonTemporal) const override;
579};
580
581class SIGfx12CacheControl : public SIGfx11CacheControl {
582protected:
583 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
584 // \returns Returns true if \p MI is modified, false otherwise.
585 bool setTH(const MachineBasicBlock::iterator MI,
586 AMDGPU::CPol::CPol Value) const;
587 // Sets Scope policy to \p Value if CPol operand is present in instruction \p
588 // MI. \returns Returns true if \p MI is modified, false otherwise.
589 bool setScope(const MachineBasicBlock::iterator MI,
590 AMDGPU::CPol::CPol Value) const;
591
592public:
593 SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
594
595 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
596 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
597 bool IsCrossAddrSpaceOrdering, Position Pos) const override;
598
599 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
600 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
601
602 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
603 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
604 bool IsVolatile,
605 bool IsNonTemporal) const override;
606};
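// GFX12 folds the per-bit cache controls (GLC/SLC/DLC) into TH (temporal hint)
// and Scope fields of the CPol operand, hence the setTH/setScope helpers above.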
607
608class SIMemoryLegalizer final : public MachineFunctionPass {
609private:
610
611 /// Cache Control.
612 std::unique_ptr<SICacheControl> CC = nullptr;
613
614 /// List of atomic pseudo instructions.
615 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
616
617 /// Return true iff instruction \p MI is an atomic instruction that
618 /// returns a result.
619 bool isAtomicRet(const MachineInstr &MI) const {
620 return SIInstrInfo::isAtomicRet(MI);
621 }
622
623 /// Removes all processed atomic pseudo instructions from the current
624 /// function. Returns true if current function is modified, false otherwise.
625 bool removeAtomicPseudoMIs();
626
627 /// Expands load operation \p MI. Returns true if instructions are
628 /// added/deleted or \p MI is modified, false otherwise.
629 bool expandLoad(const SIMemOpInfo &MOI,
630 MachineBasicBlock::iterator &MI);
631 /// Expands store operation \p MI. Returns true if instructions are
632 /// added/deleted or \p MI is modified, false otherwise.
633 bool expandStore(const SIMemOpInfo &MOI,
634 MachineBasicBlock::iterator &MI);
635 /// Expands atomic fence operation \p MI. Returns true if
636 /// instructions are added/deleted or \p MI is modified, false otherwise.
637 bool expandAtomicFence(const SIMemOpInfo &MOI,
638 MachineBasicBlock::iterator &MI);
639 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
640 /// instructions are added/deleted or \p MI is modified, false otherwise.
641 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
642 MachineBasicBlock::iterator &MI);
643
644public:
645 static char ID;
646
647 SIMemoryLegalizer() : MachineFunctionPass(ID) {}
648
649 void getAnalysisUsage(AnalysisUsage &AU) const override {
650 AU.setPreservesCFG();
651 MachineFunctionPass::getAnalysisUsage(AU);
652 }
653
654 StringRef getPassName() const override {
655 return PASS_NAME;
656 }
657
658 bool runOnMachineFunction(MachineFunction &MF) override;
659};
660
661} // end namespace anonymous
662
663void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
664 const char *Msg) const {
665 const Function &Func = MI->getParent()->getParent()->getFunction();
666 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
667 Func.getContext().diagnose(Diag);
668}
669
670std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
671SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
672 SIAtomicAddrSpace InstrAddrSpace) const {
673 if (SSID == SyncScope::System)
674 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
675 if (SSID == MMI->getAgentSSID())
676 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
677 if (SSID == MMI->getWorkgroupSSID())
678 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
679 true);
680 if (SSID == MMI->getWavefrontSSID())
681 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
682 true);
683 if (SSID == SyncScope::SingleThread)
684 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
685 true);
686 if (SSID == MMI->getSystemOneAddressSpaceSSID())
687 return std::tuple(SIAtomicScope::SYSTEM,
688 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
689 if (SSID == MMI->getAgentOneAddressSpaceSSID())
690 return std::tuple(SIAtomicScope::AGENT,
691 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
692 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
693 return std::tuple(SIAtomicScope::WORKGROUP,
694 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
695 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
696 return std::tuple(SIAtomicScope::WAVEFRONT,
697 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
698 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
699 return std::tuple(SIAtomicScope::SINGLETHREAD,
700 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
701 return std::nullopt;
702}
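// Note that the "one address space" synchronization scopes above report no
// cross-address-space ordering (the final tuple element is false) and restrict
// the ordered address spaces to those the instruction actually accesses.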
703
704SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
705 if (AS == AMDGPUAS::FLAT_ADDRESS)
706 return SIAtomicAddrSpace::FLAT;
707 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
708 return SIAtomicAddrSpace::GLOBAL;
709 if (AS == AMDGPUAS::LOCAL_ADDRESS)
710 return SIAtomicAddrSpace::LDS;
711 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
712 return SIAtomicAddrSpace::SCRATCH;
713 if (AS == AMDGPUAS::REGION_ADDRESS)
714 return SIAtomicAddrSpace::GDS;
715
716 return SIAtomicAddrSpace::OTHER;
717}
718
719SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
720 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
721}
722
723std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
724 const MachineBasicBlock::iterator &MI) const {
725 assert(MI->getNumMemOperands() > 0);
726
727 SyncScope::ID SSID = SyncScope::SingleThread;
728 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
729 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
730 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
731 bool IsNonTemporal = true;
732 bool IsVolatile = false;
733
734 // Validator should check whether or not MMOs cover the entire set of
735 // locations accessed by the memory instruction.
736 for (const auto &MMO : MI->memoperands()) {
737 IsNonTemporal &= MMO->isNonTemporal();
738 IsVolatile |= MMO->isVolatile();
739 InstrAddrSpace |=
740 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
741 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
742 if (OpOrdering != AtomicOrdering::NotAtomic) {
743 const auto &IsSyncScopeInclusion =
744 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
745 if (!IsSyncScopeInclusion) {
746 reportUnsupported(MI,
747 "Unsupported non-inclusive atomic synchronization scope");
748 return std::nullopt;
749 }
750
751 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
752 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
753 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
754 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
755 FailureOrdering =
756 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
757 }
758 }
759
760 SIAtomicScope Scope = SIAtomicScope::NONE;
761 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
762 bool IsCrossAddressSpaceOrdering = false;
763 if (Ordering != AtomicOrdering::NotAtomic) {
764 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
765 if (!ScopeOrNone) {
766 reportUnsupported(MI, "Unsupported atomic synchronization scope");
767 return std::nullopt;
768 }
769 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
770 *ScopeOrNone;
771 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
772 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
773 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
774 reportUnsupported(MI, "Unsupported atomic address space");
775 return std::nullopt;
776 }
777 }
778 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
779 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
780 IsNonTemporal);
781}
782
783std::optional<SIMemOpInfo>
784SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
785 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
786
787 if (!(MI->mayLoad() && !MI->mayStore()))
788 return std::nullopt;
789
790 // Be conservative if there are no memory operands.
791 if (MI->getNumMemOperands() == 0)
792 return SIMemOpInfo();
793
794 return constructFromMIWithMMO(MI);
795}
796
797std::optional<SIMemOpInfo>
798SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
799 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
800
801 if (!(!MI->mayLoad() && MI->mayStore()))
802 return std::nullopt;
803
804 // Be conservative if there are no memory operands.
805 if (MI->getNumMemOperands() == 0)
806 return SIMemOpInfo();
807
808 return constructFromMIWithMMO(MI);
809}
810
811std::optional<SIMemOpInfo>
812SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
813 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
814
815 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
816 return std::nullopt;
817
818 AtomicOrdering Ordering =
819 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
820
821 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
822 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
823 if (!ScopeOrNone) {
824 reportUnsupported(MI, "Unsupported atomic synchronization scope");
825 return std::nullopt;
826 }
827
828 SIAtomicScope Scope = SIAtomicScope::NONE;
829 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
830 bool IsCrossAddressSpaceOrdering = false;
831 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
832 *ScopeOrNone;
833
834 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
835 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
836 reportUnsupported(MI, "Unsupported atomic address space");
837 return std::nullopt;
838 }
839
840 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
841 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
842}
843
844std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
845 const MachineBasicBlock::iterator &MI) const {
846 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
847
848 if (!(MI->mayLoad() && MI->mayStore()))
849 return std::nullopt;
850
851 // Be conservative if there are no memory operands.
852 if (MI->getNumMemOperands() == 0)
853 return SIMemOpInfo();
854
855 return constructFromMIWithMMO(MI);
856}
857
858SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
859 TII = ST.getInstrInfo();
860 IV = getIsaVersion(ST.getCPU());
861 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
862}
863
864bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
865 AMDGPU::CPol::CPol Bit) const {
866 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
867 if (!CPol)
868 return false;
869
870 CPol->setImm(CPol->getImm() | Bit);
871 return true;
872}
873
874/* static */
875std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
876 GCNSubtarget::Generation Generation = ST.getGeneration();
877 if (ST.hasGFX940Insts())
878 return std::make_unique<SIGfx940CacheControl>(ST);
879 if (ST.hasGFX90AInsts())
880 return std::make_unique<SIGfx90ACacheControl>(ST);
881 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
882 return std::make_unique<SIGfx6CacheControl>(ST);
883 if (Generation < AMDGPUSubtarget::GFX10)
884 return std::make_unique<SIGfx7CacheControl>(ST);
885 if (Generation < AMDGPUSubtarget::GFX11)
886 return std::make_unique<SIGfx10CacheControl>(ST);
887 if (Generation < AMDGPUSubtarget::GFX12)
888 return std::make_unique<SIGfx11CacheControl>(ST);
889 return std::make_unique<SIGfx12CacheControl>(ST);
890}
891
892bool SIGfx6CacheControl::enableLoadCacheBypass(
893 const MachineBasicBlock::iterator &MI,
894 SIAtomicScope Scope,
895 SIAtomicAddrSpace AddrSpace) const {
896 assert(MI->mayLoad() && !MI->mayStore());
897 bool Changed = false;
898
899 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
900 switch (Scope) {
901 case SIAtomicScope::SYSTEM:
902 case SIAtomicScope::AGENT:
903 // Set L1 cache policy to MISS_EVICT.
904 // Note: there is no L2 cache bypass policy at the ISA level.
905 Changed |= enableGLCBit(MI);
906 break;
907 case SIAtomicScope::WORKGROUP:
908 case SIAtomicScope::WAVEFRONT:
909 case SIAtomicScope::SINGLETHREAD:
910 // No cache to bypass.
911 break;
912 default:
913 llvm_unreachable("Unsupported synchronization scope");
914 }
915 }
916
917 /// The scratch address space does not need the global memory caches
918 /// to be bypassed as all memory operations by the same thread are
919 /// sequentially consistent, and no other thread can access scratch
920 /// memory.
921
922 /// Other address spaces do not have a cache.
923
924 return Changed;
925}
926
927bool SIGfx6CacheControl::enableStoreCacheBypass(
928 const MachineBasicBlock::iterator &MI,
929 SIAtomicScope Scope,
930 SIAtomicAddrSpace AddrSpace) const {
931 assert(!MI->mayLoad() && MI->mayStore());
932 bool Changed = false;
933
934 /// The L1 cache is write through so does not need to be bypassed. There is no
935 /// bypass control for the L2 cache at the isa level.
936
937 return Changed;
938}
939
940bool SIGfx6CacheControl::enableRMWCacheBypass(
941 const MachineBasicBlock::iterator &MI,
942 SIAtomicScope Scope,
943 SIAtomicAddrSpace AddrSpace) const {
944 assert(MI->mayLoad() && MI->mayStore());
945 bool Changed = false;
946
947 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
948 /// bypassed, and the GLC bit is instead used to indicate if they are
949 /// return or no-return.
950 /// Note: there is no L2 cache coherent bypass control at the ISA level.
951
952 return Changed;
953}
954
955bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
956 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
957 bool IsVolatile, bool IsNonTemporal) const {
958 // Only handle load and store, not atomic read-modify-write instructions. The
959 // latter use glc to indicate if the atomic returns a result and so must not
960 // be used for cache control.
961 assert(MI->mayLoad() ^ MI->mayStore());
962
963 // Only update load and store, not LLVM IR atomic read-modify-write
964 // instructions. The latter are always marked as volatile, so handling that
965 // here would pessimize all atomics. Also they do not support
966 // the nontemporal attribute.
967 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
968
969 bool Changed = false;
970
971 if (IsVolatile) {
972 // Set L1 cache policy to be MISS_EVICT for load instructions
973 // and MISS_LRU for store instructions.
974 // Note: there is no L2 cache bypass policy at the ISA level.
975 if (Op == SIMemOp::LOAD)
976 Changed |= enableGLCBit(MI);
977
978 // Ensure operation has completed at system scope to cause all volatile
979 // operations to be visible outside the program in a global order. Do not
980 // request cross address space as only the global address space can be
981 // observable outside the program, so no need to cause a waitcnt for LDS
982 // address space operations.
983 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
984 Position::AFTER);
985
986 return Changed;
987 }
988
989 if (IsNonTemporal) {
990 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
991 // for both loads and stores, and the L2 cache policy to STREAM.
992 Changed |= enableGLCBit(MI);
993 Changed |= enableSLCBit(MI);
994 return Changed;
995 }
996
997 return Changed;
998}
999
1000bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1001 SIAtomicScope Scope,
1002 SIAtomicAddrSpace AddrSpace,
1003 SIMemOp Op,
1004 bool IsCrossAddrSpaceOrdering,
1005 Position Pos) const {
1006 bool Changed = false;
1007
1008 MachineBasicBlock &MBB = *MI->getParent();
1009 DebugLoc DL = MI->getDebugLoc();
1010
1011 if (Pos == Position::AFTER)
1012 ++MI;
1013
1014 bool VMCnt = false;
1015 bool LGKMCnt = false;
1016
1017 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1018 SIAtomicAddrSpace::NONE) {
1019 switch (Scope) {
1020 case SIAtomicScope::SYSTEM:
1021 case SIAtomicScope::AGENT:
1022 VMCnt |= true;
1023 break;
1024 case SIAtomicScope::WORKGROUP:
1025 case SIAtomicScope::WAVEFRONT:
1026 case SIAtomicScope::SINGLETHREAD:
1027 // The L1 cache keeps all memory operations in order for
1028 // wavefronts in the same work-group.
1029 break;
1030 default:
1031 llvm_unreachable("Unsupported synchronization scope");
1032 }
1033 }
1034
1035 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1036 switch (Scope) {
1037 case SIAtomicScope::SYSTEM:
1038 case SIAtomicScope::AGENT:
1039 case SIAtomicScope::WORKGROUP:
1040 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1041 // not needed as LDS operations for all waves are executed in a total
1042 // global ordering as observed by all waves. Required if also
1043 // synchronizing with global/GDS memory as LDS operations could be
1044 // reordered with respect to later global/GDS memory operations of the
1045 // same wave.
1046 LGKMCnt |= IsCrossAddrSpaceOrdering;
1047 break;
1048 case SIAtomicScope::WAVEFRONT:
1049 case SIAtomicScope::SINGLETHREAD:
1050 // The LDS keeps all memory operations in order for
1051 // the same wavefront.
1052 break;
1053 default:
1054 llvm_unreachable("Unsupported synchronization scope");
1055 }
1056 }
1057
1058 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1059 switch (Scope) {
1060 case SIAtomicScope::SYSTEM:
1061 case SIAtomicScope::AGENT:
1062 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1063 // is not needed as GDS operations for all waves are executed in a total
1064 // global ordering as observed by all waves. Required if also
1065 // synchronizing with global/LDS memory as GDS operations could be
1066 // reordered with respect to later global/LDS memory operations of the
1067 // same wave.
1068 LGKMCnt |= IsCrossAddrSpaceOrdering;
1069 break;
1070 case SIAtomicScope::WORKGROUP:
1071 case SIAtomicScope::WAVEFRONT:
1072 case SIAtomicScope::SINGLETHREAD:
1073 // The GDS keeps all memory operations in order for
1074 // the same work-group.
1075 break;
1076 default:
1077 llvm_unreachable("Unsupported synchronization scope");
1078 }
1079 }
1080
1081 if (VMCnt || LGKMCnt) {
1082 unsigned WaitCntImmediate =
1083 AMDGPU::encodeWaitcnt(IV,
1084 VMCnt ? 0 : getVmcntBitMask(IV),
1085 getExpcntBitMask(IV),
1086 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1087 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1088 .addImm(WaitCntImmediate);
1089 Changed = true;
1090 }
1091
1092 if (Pos == Position::AFTER)
1093 --MI;
1094
1095 return Changed;
1096}
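// Illustrative outcome: an agent- or system-scope wait on a global/scratch
// access emits an S_WAITCNT (soft) with vmcnt(0), while a cross-address-space
// workgroup ordering that also covers LDS additionally waits on lgkmcnt(0).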
1097
1098bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1099 SIAtomicScope Scope,
1100 SIAtomicAddrSpace AddrSpace,
1101 Position Pos) const {
1102 if (!InsertCacheInv)
1103 return false;
1104
1105 bool Changed = false;
1106
1107 MachineBasicBlock &MBB = *MI->getParent();
1108 DebugLoc DL = MI->getDebugLoc();
1109
1110 if (Pos == Position::AFTER)
1111 ++MI;
1112
1113 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1114 switch (Scope) {
1115 case SIAtomicScope::SYSTEM:
1116 case SIAtomicScope::AGENT:
1117 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1118 Changed = true;
1119 break;
1120 case SIAtomicScope::WORKGROUP:
1121 case SIAtomicScope::WAVEFRONT:
1122 case SIAtomicScope::SINGLETHREAD:
1123 // No cache to invalidate.
1124 break;
1125 default:
1126 llvm_unreachable("Unsupported synchronization scope");
1127 }
1128 }
1129
1130 /// The scratch address space does not need the global memory cache
1131 /// to be flushed as all memory operations by the same thread are
1132 /// sequentially consistent, and no other thread can access scratch
1133 /// memory.
1134
1135 /// Other address spaces do not have a cache.
1136
1137 if (Pos == Position::AFTER)
1138 --MI;
1139
1140 return Changed;
1141}
1142
1143bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1144 SIAtomicScope Scope,
1145 SIAtomicAddrSpace AddrSpace,
1146 bool IsCrossAddrSpaceOrdering,
1147 Position Pos) const {
1148 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1149 IsCrossAddrSpaceOrdering, Pos);
1150}
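// On GFX6 a release reduces to the wait above: the L1 is write-through and
// there is no writeback instruction to issue, so ordering only requires the
// earlier loads and stores to have completed.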
1151
1152bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1153 SIAtomicScope Scope,
1154 SIAtomicAddrSpace AddrSpace,
1155 Position Pos) const {
1156 if (!InsertCacheInv)
1157 return false;
1158
1159 bool Changed = false;
1160
1161 MachineBasicBlock &MBB = *MI->getParent();
1162 DebugLoc DL = MI->getDebugLoc();
1163
1164 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1165
1166 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1167 ? AMDGPU::BUFFER_WBINVL1
1168 : AMDGPU::BUFFER_WBINVL1_VOL;
1169
1170 if (Pos == Position::AFTER)
1171 ++MI;
1172
1173 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1174 switch (Scope) {
1175 case SIAtomicScope::SYSTEM:
1176 case SIAtomicScope::AGENT:
1177 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1178 Changed = true;
1179 break;
1180 case SIAtomicScope::WORKGROUP:
1181 case SIAtomicScope::WAVEFRONT:
1182 case SIAtomicScope::SINGLETHREAD:
1183 // No cache to invalidate.
1184 break;
1185 default:
1186 llvm_unreachable("Unsupported synchronization scope");
1187 }
1188 }
1189
1190 /// The scratch address space does not need the global memory cache
1191 /// to be flushed as all memory operations by the same thread are
1192 /// sequentially consistent, and no other thread can access scratch
1193 /// memory.
1194
1195 /// Other address spaces do not have a cache.
1196
1197 if (Pos == Position::AFTER)
1198 --MI;
1199
1200 return Changed;
1201}
1202
1203bool SIGfx90ACacheControl::enableLoadCacheBypass(
1204 const MachineBasicBlock::iterator &MI,
1205 SIAtomicScope Scope,
1206 SIAtomicAddrSpace AddrSpace) const {
1207 assert(MI->mayLoad() && !MI->mayStore());
1208 bool Changed = false;
1209
1210 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1211 switch (Scope) {
1212 case SIAtomicScope::SYSTEM:
1213 case SIAtomicScope::AGENT:
1214 // Set the L1 cache policy to MISS_LRU.
1215 // Note: there is no L2 cache bypass policy at the ISA level.
1216 Changed |= enableGLCBit(MI);
1217 break;
1218 case SIAtomicScope::WORKGROUP:
1219 // In threadgroup split mode the waves of a work-group can be executing on
1220 // different CUs. Therefore need to bypass the L1 which is per CU.
1221 // Otherwise in non-threadgroup split mode all waves of a work-group are
1222 // on the same CU, and so the L1 does not need to be bypassed.
1223 if (ST.isTgSplitEnabled())
1224 Changed |= enableGLCBit(MI);
1225 break;
1226 case SIAtomicScope::WAVEFRONT:
1227 case SIAtomicScope::SINGLETHREAD:
1228 // No cache to bypass.
1229 break;
1230 default:
1231 llvm_unreachable("Unsupported synchronization scope");
1232 }
1233 }
1234
1235 /// The scratch address space does not need the global memory caches
1236 /// to be bypassed as all memory operations by the same thread are
1237 /// sequentially consistent, and no other thread can access scratch
1238 /// memory.
1239
1240 /// Other address spaces do not have a cache.
1241
1242 return Changed;
1243}
1244
1245bool SIGfx90ACacheControl::enableStoreCacheBypass(
1246 const MachineBasicBlock::iterator &MI,
1247 SIAtomicScope Scope,
1248 SIAtomicAddrSpace AddrSpace) const {
1249 assert(!MI->mayLoad() && MI->mayStore());
1250 bool Changed = false;
1251
1252 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1253 switch (Scope) {
1254 case SIAtomicScope::SYSTEM:
1255 case SIAtomicScope::AGENT:
1256 /// Do not set glc for store atomic operations as they implicitly write
1257 /// through the L1 cache.
1258 break;
1259 case SIAtomicScope::WORKGROUP:
1260 case SIAtomicScope::WAVEFRONT:
1261 case SIAtomicScope::SINGLETHREAD:
1262 // No cache to bypass. Store atomics implicitly write through the L1
1263 // cache.
1264 break;
1265 default:
1266 llvm_unreachable("Unsupported synchronization scope");
1267 }
1268 }
1269
1270 /// The scratch address space does not need the global memory caches
1271 /// to be bypassed as all memory operations by the same thread are
1272 /// sequentially consistent, and no other thread can access scratch
1273 /// memory.
1274
1275 /// Other address spaces do not have a cache.
1276
1277 return Changed;
1278}
1279
1280bool SIGfx90ACacheControl::enableRMWCacheBypass(
1281 const MachineBasicBlock::iterator &MI,
1282 SIAtomicScope Scope,
1283 SIAtomicAddrSpace AddrSpace) const {
1284 assert(MI->mayLoad() && MI->mayStore());
1285 bool Changed = false;
1286
1287 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1288 switch (Scope) {
1289 case SIAtomicScope::SYSTEM:
1290 case SIAtomicScope::AGENT:
1291 /// Do not set glc for RMW atomic operations as they implicitly bypass
1292 /// the L1 cache, and the glc bit is instead used to indicate if they are
1293 /// return or no-return.
1294 break;
1295 case SIAtomicScope::WORKGROUP:
1296 case SIAtomicScope::WAVEFRONT:
1297 case SIAtomicScope::SINGLETHREAD:
1298 // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1299 break;
1300 default:
1301 llvm_unreachable("Unsupported synchronization scope");
1302 }
1303 }
1304
1305 return Changed;
1306}
1307
1308bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1309 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1310 bool IsVolatile, bool IsNonTemporal) const {
1311 // Only handle load and store, not atomic read-modify-write instructions. The
1312 // latter use glc to indicate if the atomic returns a result and so must not
1313 // be used for cache control.
1314 assert(MI->mayLoad() ^ MI->mayStore());
1315
1316 // Only update load and store, not LLVM IR atomic read-modify-write
1317 // instructions. The latter are always marked as volatile, so handling that
1318 // here would pessimize all atomics. Also they do not support
1319 // the nontemporal attribute.
1320 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1321
1322 bool Changed = false;
1323
1324 if (IsVolatile) {
1325 // Set L1 cache policy to be MISS_EVICT for load instructions
1326 // and MISS_LRU for store instructions.
1327 // Note: there is no L2 cache bypass policy at the ISA level.
1328 if (Op == SIMemOp::LOAD)
1329 Changed |= enableGLCBit(MI);
1330
1331 // Ensure operation has completed at system scope to cause all volatile
1332 // operations to be visible outside the program in a global order. Do not
1333 // request cross address space as only the global address space can be
1334 // observable outside the program, so no need to cause a waitcnt for LDS
1335 // address space operations.
1336 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1337 Position::AFTER);
1338
1339 return Changed;
1340 }
1341
1342 if (IsNonTemporal) {
1343 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1344 // for both loads and stores, and the L2 cache policy to STREAM.
1345 Changed |= enableGLCBit(MI);
1346 Changed |= enableSLCBit(MI);
1347 return Changed;
1348 }
1349
1350 return Changed;
1351}
1352
1353bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1354 SIAtomicScope Scope,
1355 SIAtomicAddrSpace AddrSpace,
1356 SIMemOp Op,
1357 bool IsCrossAddrSpaceOrdering,
1358 Position Pos) const {
1359 if (ST.isTgSplitEnabled()) {
1360 // In threadgroup split mode the waves of a work-group can be executing on
1361 // different CUs. Therefore need to wait for global or GDS memory operations
1362 // to complete to ensure they are visible to waves in the other CUs.
1363 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1364 // the same CU, so no need to wait for global memory as all waves in the
1365 // work-group access the same L1, nor wait for GDS as accesses are ordered
1366 // on a CU.
1367 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1368 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1369 (Scope == SIAtomicScope::WORKGROUP)) {
1370 // Same as GFX7 using agent scope.
1371 Scope = SIAtomicScope::AGENT;
1372 }
1373 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1374 // LDS memory operations.
1375 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1376 }
1377 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1378 IsCrossAddrSpaceOrdering, Pos);
1379}
1380
1381bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1382 SIAtomicScope Scope,
1383 SIAtomicAddrSpace AddrSpace,
1384 Position Pos) const {
1385 if (!InsertCacheInv)
1386 return false;
1387
1388 bool Changed = false;
1389
1390 MachineBasicBlock &MBB = *MI->getParent();
1391 DebugLoc DL = MI->getDebugLoc();
1392
1393 if (Pos == Position::AFTER)
1394 ++MI;
1395
1396 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1397 switch (Scope) {
1398 case SIAtomicScope::SYSTEM:
1399 // Ensures that following loads will not see stale remote VMEM data or
1400 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1401 // CC will never be stale due to the local memory probes.
1402 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1403 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1404 // hardware does not reorder memory operations by the same wave with
1405 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1406 // remove any cache lines of earlier writes by the same wave and ensures
1407 // later reads by the same wave will refetch the cache lines.
1408 Changed = true;
1409 break;
1410 case SIAtomicScope::AGENT:
1411 // Same as GFX7.
1412 break;
1413 case SIAtomicScope::WORKGROUP:
1414 // In threadgroup split mode the waves of a work-group can be executing on
1415 // different CUs. Therefore need to invalidate the L1 which is per CU.
1416 // Otherwise in non-threadgroup split mode all waves of a work-group are
1417 // on the same CU, and so the L1 does not need to be invalidated.
1418 if (ST.isTgSplitEnabled()) {
1419 // Same as GFX7 using agent scope.
1420 Scope = SIAtomicScope::AGENT;
1421 }
1422 break;
1423 case SIAtomicScope::WAVEFRONT:
1424 case SIAtomicScope::SINGLETHREAD:
1425 // Same as GFX7.
1426 break;
1427 default:
1428 llvm_unreachable("Unsupported synchronization scope");
1429 }
1430 }
1431
1432 /// The scratch address space does not need the global memory cache
1433 /// to be flushed as all memory operations by the same thread are
1434 /// sequentially consistent, and no other thread can access scratch
1435 /// memory.
1436
1437 /// Other address spaces do not have a cache.
1438
1439 if (Pos == Position::AFTER)
1440 --MI;
1441
1442 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1443
1444 return Changed;
1445}
1446
1447bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1448 SIAtomicScope Scope,
1449 SIAtomicAddrSpace AddrSpace,
1450 bool IsCrossAddrSpaceOrdering,
1451 Position Pos) const {
1452 bool Changed = false;
1453
1454 MachineBasicBlock &MBB = *MI->getParent();
1455 const DebugLoc &DL = MI->getDebugLoc();
1456
1457 if (Pos == Position::AFTER)
1458 ++MI;
1459
1460 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1461 switch (Scope) {
1462 case SIAtomicScope::SYSTEM:
1463 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1464 // hardware does not reorder memory operations by the same wave with
1465 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1466 // to initiate writeback of any dirty cache lines of earlier writes by the
1467 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1468 // writeback has completed.
1469 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1470 // Set SC bits to indicate system scope.
1471 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1472 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1473 // vmcnt(0)" needed by the "BUFFER_WBL2".
1474 Changed = true;
1475 break;
1476 case SIAtomicScope::AGENT:
1477 case SIAtomicScope::WORKGROUP:
1478 case SIAtomicScope::WAVEFRONT:
1479 case SIAtomicScope::SINGLETHREAD:
1480 // Same as GFX7.
1481 break;
1482 default:
1483 llvm_unreachable("Unsupported synchronization scope");
1484 }
1485 }
1486
1487 if (Pos == Position::AFTER)
1488 --MI;
1489
1490 Changed |=
1491 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1492 IsCrossAddrSpaceOrdering, Pos);
1493
1494 return Changed;
1495}
1496
1497bool SIGfx940CacheControl::enableLoadCacheBypass(
1498 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1499 SIAtomicAddrSpace AddrSpace) const {
1500 assert(MI->mayLoad() && !MI->mayStore());
1501 bool Changed = false;
1502
1503 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1504 switch (Scope) {
1505 case SIAtomicScope::SYSTEM:
1506 // Set SC bits to indicate system scope.
1507 Changed |= enableSC0Bit(MI);
1508 Changed |= enableSC1Bit(MI);
1509 break;
1510 case SIAtomicScope::AGENT:
1511 // Set SC bits to indicate agent scope.
1512 Changed |= enableSC1Bit(MI);
1513 break;
1514 case SIAtomicScope::WORKGROUP:
1515 // In threadgroup split mode the waves of a work-group can be executing on
1516 // different CUs. Therefore need to bypass the L1 which is per CU.
1517 // Otherwise in non-threadgroup split mode all waves of a work-group are
1518 // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1519 // bits to indicate work-group scope will do this automatically.
1520 Changed |= enableSC0Bit(MI);
1521 break;
1522 case SIAtomicScope::WAVEFRONT:
1523 case SIAtomicScope::SINGLETHREAD:
1524 // Leave SC bits unset to indicate wavefront scope.
1525 break;
1526 default:
1527 llvm_unreachable("Unsupported synchronization scope");
1528 }
1529 }
1530
1531 /// The scratch address space does not need the global memory caches
1532 /// to be bypassed as all memory operations by the same thread are
1533 /// sequentially consistent, and no other thread can access scratch
1534 /// memory.
1535
1536 /// Other address spaces do not have a cache.
1537
1538 return Changed;
1539}
1540
1541bool SIGfx940CacheControl::enableStoreCacheBypass(
1542 const MachineBasicBlock::iterator &MI,
1543 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1544 assert(!MI->mayLoad() && MI->mayStore());
1545 bool Changed = false;
1546
1547 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1548 switch (Scope) {
1549 case SIAtomicScope::SYSTEM:
1550 // Set SC bits to indicate system scope.
1551 Changed |= enableSC0Bit(MI);
1552 Changed |= enableSC1Bit(MI);
1553 break;
1554 case SIAtomicScope::AGENT:
1555 // Set SC bits to indicate agent scope.
1556 Changed |= enableSC1Bit(MI);
1557 break;
1558 case SIAtomicScope::WORKGROUP:
1559 // Set SC bits to indicate workgroup scope.
1560 Changed |= enableSC0Bit(MI);
1561 break;
1562 case SIAtomicScope::WAVEFRONT:
1563 case SIAtomicScope::SINGLETHREAD:
1564 // Leave SC bits unset to indicate wavefront scope.
1565 break;
1566 default:
1567 llvm_unreachable("Unsupported synchronization scope");
1568 }
1569 }
1570
1571 /// The scratch address space does not need the global memory caches
1572 /// to be bypassed as all memory operations by the same thread are
1573 /// sequentially consistent, and no other thread can access scratch
1574 /// memory.
1575
1576 /// Other address spaces do not have a cache.
1577
1578 return Changed;
1579}
1580
1581bool SIGfx940CacheControl::enableRMWCacheBypass(
1582 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1583 SIAtomicAddrSpace AddrSpace) const {
1584 assert(MI->mayLoad() && MI->mayStore());
1585 bool Changed = false;
1586
1587 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1588 switch (Scope) {
1589 case SIAtomicScope::SYSTEM:
1590 // Set SC1 bit to indicate system scope.
1591 Changed |= enableSC1Bit(MI);
1592 break;
1593 case SIAtomicScope::AGENT:
1594 case SIAtomicScope::WORKGROUP:
1595 case SIAtomicScope::WAVEFRONT:
1596 case SIAtomicScope::SINGLETHREAD:
1597 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1598 // to indicate system or agent scope. The SC0 bit is used to indicate if
1599 // they are return or no-return. Leave SC1 bit unset to indicate agent
1600 // scope.
1601 break;
1602 default:
1603 llvm_unreachable("Unsupported synchronization scope");
1604 }
1605 }
1606
1607 return Changed;
1608}
1609
1610bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1611 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1612 bool IsVolatile, bool IsNonTemporal) const {
1613 // Only handle load and store, not atomic read-modify-write instructions. The
1614 // latter use glc to indicate if the atomic returns a result and so must not
1615 // be used for cache control.
1616 assert(MI->mayLoad() ^ MI->mayStore());
1617
1618 // Only update load and store, not LLVM IR atomic read-modify-write
1619 // instructions. The latter are always marked as volatile, so handling that
1620 // here would pessimize all atomics. Also they do not support
1621 // the nontemporal attribute.
1622 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1623
1624 bool Changed = false;
1625
1626 if (IsVolatile) {
1627 // Set SC bits to indicate system scope.
1628 Changed |= enableSC0Bit(MI);
1629 Changed |= enableSC1Bit(MI);
1630
1631 // Ensure operation has completed at system scope to cause all volatile
1632 // operations to be visible outside the program in a global order. Do not
1633 // request cross address space as only the global address space can be
1634 // observable outside the program, so no need to cause a waitcnt for LDS
1635 // address space operations.
1636 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1637 Position::AFTER);
1638
1639 return Changed;
1640 }
1641
1642 if (IsNonTemporal) {
1643 Changed |= enableNTBit(MI);
1644 return Changed;
1645 }
1646
1647 return Changed;
1648}
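// Illustrative note (not part of the pass): for a volatile global load on
// GFX940 the combined effect of the code above is roughly
//
//   global_load_dword ... sc0 sc1
//   s_waitcnt vmcnt(0)
//
// while a nontemporal access only gets the nt modifier. Exact mnemonics and
// operand syntax depend on the subtarget; this is a hedged sketch only.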
1649
1650bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1651 SIAtomicScope Scope,
1652 SIAtomicAddrSpace AddrSpace,
1653 Position Pos) const {
1654 if (!InsertCacheInv)
1655 return false;
1656
1657 bool Changed = false;
1658
1659 MachineBasicBlock &MBB = *MI->getParent();
1660 DebugLoc DL = MI->getDebugLoc();
1661
1662 if (Pos == Position::AFTER)
1663 ++MI;
1664
1665 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1666 switch (Scope) {
1667 case SIAtomicScope::SYSTEM:
1668 // Ensures that following loads will not see stale remote VMEM data or
1669 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1670 // CC will never be stale due to the local memory probes.
1671 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1672 // Set SC bits to indicate system scope.
1673 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1674 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1675 // hardware does not reorder memory operations by the same wave with
1676 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1677 // remove any cache lines of earlier writes by the same wave and ensures
1678 // later reads by the same wave will refetch the cache lines.
1679 Changed = true;
1680 break;
1681 case SIAtomicScope::AGENT:
1682 // Ensures that following loads will not see stale remote data or local
1683 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1684 // due to the memory probes.
1685 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1686 // Set SC bits to indicate agent scope.
1687 .addImm(AMDGPU::CPol::SC1);
1688 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1689 // does not reorder memory operations with respect to a preceding buffer
1690 // invalidate. The invalidate is guaranteed to remove any cache lines of
1691 // earlier writes and ensures later reads will refetch the cache lines.
1692 Changed = true;
1693 break;
1694 case SIAtomicScope::WORKGROUP:
1695 // In threadgroup split mode the waves of a work-group can be executing on
1696 // different CUs. Therefore need to invalidate the L1 which is per CU.
1697 // Otherwise in non-threadgroup split mode all waves of a work-group are
1698 // on the same CU, and so the L1 does not need to be invalidated.
1699 if (ST.isTgSplitEnabled()) {
1700 // Ensures L1 is invalidated if in threadgroup split mode. In
1701 // non-threadgroup split mode it is a NOP, but there is no point generating
1702 // it in that case since we know we are not in that mode.
1703 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1704 // Set SC bits to indicate work-group scope.
1705 .addImm(AMDGPU::CPol::SC0);
1706 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1707 // does not reorder memory operations with respect to a preceding buffer
1708 // invalidate. The invalidate is guaranteed to remove any cache lines of
1709 // earlier writes and ensures later reads will refetch the cache lines.
1710 Changed = true;
1711 }
1712 break;
1713 case SIAtomicScope::WAVEFRONT:
1714 case SIAtomicScope::SINGLETHREAD:
1715 // Could generate "BUFFER_INV" but it would do nothing as there are no
1716 // caches to invalidate.
1717 break;
1718 default:
1719 llvm_unreachable("Unsupported synchronization scope");
1720 }
1721 }
1722
1723 /// The scratch address space does not need the global memory cache
1724 /// to be flushed as all memory operations by the same thread are
1725 /// sequentially consistent, and no other thread can access scratch
1726 /// memory.
1727
1728 /// Other address spaces do not have a cache.
1729
1730 if (Pos == Position::AFTER)
1731 --MI;
1732
1733 return Changed;
1734}
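// Illustrative note (not part of the pass): an acquire at agent scope on
// GFX940 therefore becomes a single cache invalidate, roughly
//
//   buffer_inv sc1
//
// and at system scope "buffer_inv sc0 sc1". No trailing s_waitcnt is emitted
// here because, as noted above, the invalidate is not reordered with earlier
// accesses of the same wave. The assembly shown is a hedged sketch only.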
1735
1736bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1737 SIAtomicScope Scope,
1738 SIAtomicAddrSpace AddrSpace,
1739 bool IsCrossAddrSpaceOrdering,
1740 Position Pos) const {
1741 bool Changed = false;
1742
1743 MachineBasicBlock &MBB = *MI->getParent();
1744 DebugLoc DL = MI->getDebugLoc();
1745
1746 if (Pos == Position::AFTER)
1747 ++MI;
1748
1749 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1750 switch (Scope) {
1751 case SIAtomicScope::SYSTEM:
1752 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1753 // hardware does not reorder memory operations by the same wave with
1754 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1755 // to initiate writeback of any dirty cache lines of earlier writes by the
1756 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1757 // writeback has completed.
1758 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1759 // Set SC bits to indicate system scope.
1760 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1761 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1762 // SIAtomicScope::SYSTEM, the following insertWait will generate the
1763 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1764 Changed = true;
1765 break;
1766 case SIAtomicScope::AGENT:
1767 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1768 // Set SC bits to indicate agent scope.
1769 .addImm(AMDGPU::CPol::SC1);
1770
1771 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1772 // SIAtomicScope::AGENT, the following insertWait will generate the
1773 // required "S_WAITCNT vmcnt(0)".
1774 Changed = true;
1775 break;
1776 case SIAtomicScope::WORKGROUP:
1777 case SIAtomicScope::WAVEFRONT:
1778 case SIAtomicScope::SINGLETHREAD:
1779 // Do not generate "BUFFER_WBL2" as there are no caches it would
1780 // writeback, and would require an otherwise unnecessary
1781 // "S_WAITCNT vmcnt(0)".
1782 break;
1783 default:
1784 llvm_unreachable("Unsupported synchronization scope");
1785 }
1786 }
1787
1788 if (Pos == Position::AFTER)
1789 --MI;
1790
1791 // Insert the S_WAITCNT needed by any "BUFFER_WBL2" above, as well as any
1792 // other S_WAITCNT required for the release.
1793 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1794 IsCrossAddrSpaceOrdering, Pos);
1795
1796 return Changed;
1797}
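// Illustrative note (not part of the pass): a release at system scope on
// GFX940 is expected to expand to roughly
//
//   buffer_wbl2 sc0 sc1
//   s_waitcnt vmcnt(0)
//
// where the s_waitcnt comes from the insertWait call above rather than from
// the writeback itself. This is a sketch; the exact wait emitted also depends
// on the address spaces being ordered.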
1798
1799bool SIGfx10CacheControl::enableLoadCacheBypass(
1800 const MachineBasicBlock::iterator &MI,
1801 SIAtomicScope Scope,
1802 SIAtomicAddrSpace AddrSpace) const {
1803 assert(MI->mayLoad() && !MI->mayStore());
1804 bool Changed = false;
1805
1806 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1807 switch (Scope) {
1808 case SIAtomicScope::SYSTEM:
1809 case SIAtomicScope::AGENT:
1810 // Set the L0 and L1 cache policies to MISS_EVICT.
1811 // Note: there is no L2 cache coherent bypass control at the ISA level.
1812 Changed |= enableGLCBit(MI);
1813 Changed |= enableDLCBit(MI);
1814 break;
1815 case SIAtomicScope::WORKGROUP:
1816 // In WGP mode the waves of a work-group can be executing on either CU of
1817 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1818 // CU mode all waves of a work-group are on the same CU, and so the L0
1819 // does not need to be bypassed.
1820 if (!ST.isCuModeEnabled())
1821 Changed |= enableGLCBit(MI);
1822 break;
1823 case SIAtomicScope::WAVEFRONT:
1824 case SIAtomicScope::SINGLETHREAD:
1825 // No cache to bypass.
1826 break;
1827 default:
1828 llvm_unreachable("Unsupported synchronization scope");
1829 }
1830 }
1831
1832 /// The scratch address space does not need the global memory caches
1833 /// to be bypassed as all memory operations by the same thread are
1834 /// sequentially consistent, and no other thread can access scratch
1835 /// memory.
1836
1837 /// Other address spaces do not have a cache.
1838
1839 return Changed;
1840}
1841
1842bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1843 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1844 bool IsVolatile, bool IsNonTemporal) const {
1845
1846 // Only handle load and store, not atomic read-modify-write instructions. The
1847 // latter use glc to indicate if the atomic returns a result and so must not
1848 // be used for cache control.
1849 assert(MI->mayLoad() ^ MI->mayStore());
1850
1851 // Only update load and store, not LLVM IR atomic read-modify-write
1852 // instructions. The latter are always marked as volatile, so they cannot
1853 // sensibly be handled here without pessimizing all atomics. They also do not
1854 // support the nontemporal attribute.
1855 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1856
1857 bool Changed = false;
1858
1859 if (IsVolatile) {
1860 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1861 // and MISS_LRU for store instructions.
1862 // Note: there is no L2 cache coherent bypass control at the ISA level.
1863 if (Op == SIMemOp::LOAD) {
1864 Changed |= enableGLCBit(MI);
1865 Changed |= enableDLCBit(MI);
1866 }
1867
1868 // Ensure operation has completed at system scope to cause all volatile
1869 // operations to be visible outside the program in a global order. Do not
1870 // request cross address space as only the global address space can be
1871 // observable outside the program, so no need to cause a waitcnt for LDS
1872 // address space operations.
1873 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1874 Position::AFTER);
1875 return Changed;
1876 }
1877
1878 if (IsNonTemporal) {
1879 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1880 // and L2 cache policy to STREAM.
1881 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1882 // to MISS_EVICT and the L2 cache policy to STREAM.
1883 if (Op == SIMemOp::STORE)
1884 Changed |= enableGLCBit(MI);
1885 Changed |= enableSLCBit(MI);
1886
1887 return Changed;
1888 }
1889
1890 return Changed;
1891}
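// Illustrative note (not part of the pass): on GFX10 a volatile global load
// is expected to become roughly
//
//   global_load_dword ... glc dlc
//   s_waitcnt vmcnt(0)
//
// and a nontemporal store "global_store_dword ... glc slc". The mnemonics are
// a hedged sketch; the cache-policy names (MISS_EVICT, STREAM) follow the
// comments above.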
1892
1893bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1894 SIAtomicScope Scope,
1895 SIAtomicAddrSpace AddrSpace,
1896 SIMemOp Op,
1897 bool IsCrossAddrSpaceOrdering,
1898 Position Pos) const {
1899 bool Changed = false;
1900
1901 MachineBasicBlock &MBB = *MI->getParent();
1902 DebugLoc DL = MI->getDebugLoc();
1903
1904 if (Pos == Position::AFTER)
1905 ++MI;
1906
1907 bool VMCnt = false;
1908 bool VSCnt = false;
1909 bool LGKMCnt = false;
1910
1911 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1912 SIAtomicAddrSpace::NONE) {
1913 switch (Scope) {
1914 case SIAtomicScope::SYSTEM:
1915 case SIAtomicScope::AGENT:
1916 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1917 VMCnt |= true;
1918 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1919 VSCnt |= true;
1920 break;
1921 case SIAtomicScope::WORKGROUP:
1922 // In WGP mode the waves of a work-group can be executing on either CU of
1923 // the WGP. Therefore need to wait for operations to complete to ensure
1924 // they are visible to waves in the other CU as the L0 is per CU.
1925 // Otherwise, in CU mode, all waves of a work-group are on the same CU,
1926 // which shares the same L0.
1927 if (!ST.isCuModeEnabled()) {
1928 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1929 VMCnt |= true;
1930 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1931 VSCnt |= true;
1932 }
1933 break;
1934 case SIAtomicScope::WAVEFRONT:
1935 case SIAtomicScope::SINGLETHREAD:
1936 // The L0 cache keeps all memory operations in order for
1937 // work-items in the same wavefront.
1938 break;
1939 default:
1940 llvm_unreachable("Unsupported synchronization scope");
1941 }
1942 }
1943
1944 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1945 switch (Scope) {
1946 case SIAtomicScope::SYSTEM:
1947 case SIAtomicScope::AGENT:
1948 case SIAtomicScope::WORKGROUP:
1949 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1950 // not needed as LDS operations for all waves are executed in a total
1951 // global ordering as observed by all waves. Required if also
1952 // synchronizing with global/GDS memory as LDS operations could be
1953 // reordered with respect to later global/GDS memory operations of the
1954 // same wave.
1955 LGKMCnt |= IsCrossAddrSpaceOrdering;
1956 break;
1957 case SIAtomicScope::WAVEFRONT:
1958 case SIAtomicScope::SINGLETHREAD:
1959 // The LDS keeps all memory operations in order for
1960 // the same wavefront.
1961 break;
1962 default:
1963 llvm_unreachable("Unsupported synchronization scope");
1964 }
1965 }
1966
1967 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1968 switch (Scope) {
1969 case SIAtomicScope::SYSTEM:
1970 case SIAtomicScope::AGENT:
1971 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1972 // is not needed as GDS operations for all waves are executed in a total
1973 // global ordering as observed by all waves. Required if also
1974 // synchronizing with global/LDS memory as GDS operations could be
1975 // reordered with respect to later global/LDS memory operations of the
1976 // same wave.
1977 LGKMCnt |= IsCrossAddrSpaceOrdering;
1978 break;
1979 case SIAtomicScope::WORKGROUP:
1980 case SIAtomicScope::WAVEFRONT:
1981 case SIAtomicScope::SINGLETHREAD:
1982 // The GDS keeps all memory operations in order for
1983 // the same work-group.
1984 break;
1985 default:
1986 llvm_unreachable("Unsupported synchronization scope");
1987 }
1988 }
1989
1990 if (VMCnt || LGKMCnt) {
1991 unsigned WaitCntImmediate =
1992 AMDGPU::encodeWaitcnt(IV,
1993 VMCnt ? 0 : getVmcntBitMask(IV),
1994 getExpcntBitMask(IV),
1995 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1996 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1997 .addImm(WaitCntImmediate);
1998 Changed = true;
1999 }
2000
2001 if (VSCnt) {
2002 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2003 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2004 .addImm(0);
2005 Changed = true;
2006 }
2007
2008 if (Pos == Position::AFTER)
2009 --MI;
2010
2011 return Changed;
2012}
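// Illustrative note (not part of the pass): a full GFX10 wait produced here
// looks roughly like
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0x0
//
// emitted via the _soft pseudo opcodes so that SIInsertWaitcnts can later
// merge or relax them. The exact counters cleared depend on Scope, AddrSpace
// and Op as computed above; this is only a sketch.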
2013
2014bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2015 SIAtomicScope Scope,
2016 SIAtomicAddrSpace AddrSpace,
2017 Position Pos) const {
2018 if (!InsertCacheInv)
2019 return false;
2020
2021 bool Changed = false;
2022
2023 MachineBasicBlock &MBB = *MI->getParent();
2024 DebugLoc DL = MI->getDebugLoc();
2025
2026 if (Pos == Position::AFTER)
2027 ++MI;
2028
2029 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2030 switch (Scope) {
2031 case SIAtomicScope::SYSTEM:
2032 case SIAtomicScope::AGENT:
2033 // The order of invalidates matters here. We must invalidate "outer in"
2034 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2035 // invalidated.
2036 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2037 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2038 Changed = true;
2039 break;
2040 case SIAtomicScope::WORKGROUP:
2041 // In WGP mode the waves of a work-group can be executing on either CU of
2042 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2043 // in CU mode all waves of a work-group are on the same CU, and so the
2044 // L0 does not need to be invalidated.
2045 if (!ST.isCuModeEnabled()) {
2046 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2047 Changed = true;
2048 }
2049 break;
2050 case SIAtomicScope::WAVEFRONT:
2051 case SIAtomicScope::SINGLETHREAD:
2052 // No cache to invalidate.
2053 break;
2054 default:
2055 llvm_unreachable("Unsupported synchronization scope");
2056 }
2057 }
2058
2059 /// The scratch address space does not need the global memory cache
2060 /// to be flushed as all memory operations by the same thread are
2061 /// sequentially consistent, and no other thread can access scratch
2062 /// memory.
2063
2064 /// Other address spaces do not have a cache.
2065
2066 if (Pos == Position::AFTER)
2067 --MI;
2068
2069 return Changed;
2070}
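// Illustrative note (not part of the pass): an acquire at agent or system
// scope on GFX10 therefore expands to the ordered pair
//
//   buffer_gl1_inv
//   buffer_gl0_inv
//
// while a work-group acquire in WGP mode only needs "buffer_gl0_inv". This is
// a sketch of the emitted sequence, not additional functionality.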
2071
2072bool SIGfx11CacheControl::enableLoadCacheBypass(
2073 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2074 SIAtomicAddrSpace AddrSpace) const {
2075 assert(MI->mayLoad() && !MI->mayStore());
2076 bool Changed = false;
2077
2078 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2079 switch (Scope) {
2080 case SIAtomicScope::SYSTEM:
2081 case SIAtomicScope::AGENT:
2082 // Set the L0 and L1 cache policies to MISS_EVICT.
2083 // Note: there is no L2 cache coherent bypass control at the ISA level.
2084 Changed |= enableGLCBit(MI);
2085 break;
2086 case SIAtomicScope::WORKGROUP:
2087 // In WGP mode the waves of a work-group can be executing on either CU of
2088 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2089 // CU mode all waves of a work-group are on the same CU, and so the L0
2090 // does not need to be bypassed.
2091 if (!ST.isCuModeEnabled())
2092 Changed |= enableGLCBit(MI);
2093 break;
2094 case SIAtomicScope::WAVEFRONT:
2095 case SIAtomicScope::SINGLETHREAD:
2096 // No cache to bypass.
2097 break;
2098 default:
2099 llvm_unreachable("Unsupported synchronization scope");
2100 }
2101 }
2102
2103 /// The scratch address space does not need the global memory caches
2104 /// to be bypassed as all memory operations by the same thread are
2105 /// sequentially consistent, and no other thread can access scratch
2106 /// memory.
2107
2108 /// Other address spaces do not have a cache.
2109
2110 return Changed;
2111}
2112
2113bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2114 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2115 bool IsVolatile, bool IsNonTemporal) const {
2116
2117 // Only handle load and store, not atomic read-modify-write instructions. The
2118 // latter use glc to indicate if the atomic returns a result and so must not
2119 // be used for cache control.
2120 assert(MI->mayLoad() ^ MI->mayStore());
2121
2122 // Only update load and store, not LLVM IR atomic read-modify-write
2123 // instructions. The latter are always marked as volatile, so they cannot
2124 // sensibly be handled here without pessimizing all atomics. They also do not
2125 // support the nontemporal attribute.
2126 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2127
2128 bool Changed = false;
2129
2130 if (IsVolatile) {
2131 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2132 // and MISS_LRU for store instructions.
2133 // Note: there is no L2 cache coherent bypass control at the ISA level.
2134 if (Op == SIMemOp::LOAD)
2135 Changed |= enableGLCBit(MI);
2136
2137 // Set MALL NOALLOC for load and store instructions.
2138 Changed |= enableDLCBit(MI);
2139
2140 // Ensure operation has completed at system scope to cause all volatile
2141 // operations to be visible outside the program in a global order. Do not
2142 // request cross address space as only the global address space can be
2143 // observable outside the program, so no need to cause a waitcnt for LDS
2144 // address space operations.
2145 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2146 Position::AFTER);
2147 return Changed;
2148 }
2149
2150 if (IsNonTemporal) {
2151 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2152 // and L2 cache policy to STREAM.
2153 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2154 // to MISS_EVICT and the L2 cache policy to STREAM.
2155 if (Op == SIMemOp::STORE)
2156 Changed |= enableGLCBit(MI);
2157 Changed |= enableSLCBit(MI);
2158
2159 // Set MALL NOALLOC for load and store instructions.
2160 Changed |= enableDLCBit(MI);
2161 return Changed;
2162 }
2163
2164 return Changed;
2165}
2166
2167bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2168 AMDGPU::CPol::CPol Value) const {
2169 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2170 if (!CPol)
2171 return false;
2172
2173 uint64_t NewTH = Value & AMDGPU::CPol::TH;
2174 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2175 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2176 return true;
2177 }
2178
2179 return false;
2180}
2181
2182bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2183 AMDGPU::CPol::CPol Value) const {
2184 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2185 if (!CPol)
2186 return false;
2187
2188 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2189 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2190 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2191 return true;
2192 }
2193
2194 return false;
2195}
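// Illustrative note (not part of the pass): setTH and setScope above share the
// same read-modify-write pattern on the cpol immediate: mask out one
// bit-field, OR in the new value, and report a change only if the field
// actually differed. A minimal standalone sketch of that pattern (the field
// mask here is a placeholder, not the real CPol layout):
//
//   uint64_t updateField(uint64_t CPol, uint64_t FieldMask, uint64_t Value) {
//     return (CPol & ~FieldMask) | (Value & FieldMask);
//   }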
2196
2197bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2198 SIAtomicScope Scope,
2199 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2200 bool IsCrossAddrSpaceOrdering,
2201 Position Pos) const {
2202 bool Changed = false;
2203
2204 MachineBasicBlock &MBB = *MI->getParent();
2205 DebugLoc DL = MI->getDebugLoc();
2206
2207 bool LOADCnt = false;
2208 bool DSCnt = false;
2209 bool STORECnt = false;
2210
2211 if (Pos == Position::AFTER)
2212 ++MI;
2213
2214 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2215 SIAtomicAddrSpace::NONE) {
2216 switch (Scope) {
2217 case SIAtomicScope::SYSTEM:
2218 case SIAtomicScope::AGENT:
2219 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2220 LOADCnt |= true;
2221 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2222 STORECnt |= true;
2223 break;
2224 case SIAtomicScope::WORKGROUP:
2225 // In WGP mode the waves of a work-group can be executing on either CU of
2226 // the WGP. Therefore need to wait for operations to complete to ensure
2227 // they are visible to waves in the other CU as the L0 is per CU.
2228 // Otherwise, in CU mode, all waves of a work-group are on the same CU,
2229 // which shares the same L0.
2230 if (!ST.isCuModeEnabled()) {
2231 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2232 LOADCnt |= true;
2233 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2234 STORECnt |= true;
2235 }
2236 break;
2237 case SIAtomicScope::WAVEFRONT:
2238 case SIAtomicScope::SINGLETHREAD:
2239 // The L0 cache keeps all memory operations in order for
2240 // work-items in the same wavefront.
2241 break;
2242 default:
2243 llvm_unreachable("Unsupported synchronization scope");
2244 }
2245 }
2246
2247 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2248 switch (Scope) {
2249 case SIAtomicScope::SYSTEM:
2250 case SIAtomicScope::AGENT:
2251 case SIAtomicScope::WORKGROUP:
2252 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2253 // not needed as LDS operations for all waves are executed in a total
2254 // global ordering as observed by all waves. Required if also
2255 // synchronizing with global/GDS memory as LDS operations could be
2256 // reordered with respect to later global/GDS memory operations of the
2257 // same wave.
2258 DSCnt |= IsCrossAddrSpaceOrdering;
2259 break;
2260 case SIAtomicScope::WAVEFRONT:
2261 case SIAtomicScope::SINGLETHREAD:
2262 // The LDS keeps all memory operations in order for
2263 // the same wavefront.
2264 break;
2265 default:
2266 llvm_unreachable("Unsupported synchronization scope");
2267 }
2268 }
2269
2270 if (LOADCnt) {
2271 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2272 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2273 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2274 Changed = true;
2275 }
2276
2277 if (STORECnt) {
2278 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2279 Changed = true;
2280 }
2281
2282 if (DSCnt) {
2283 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2284 Changed = true;
2285 }
2286
2287 if (Pos == Position::AFTER)
2288 --MI;
2289
2290 return Changed;
2291}
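// Illustrative note (not part of the pass): GFX12 splits the legacy waitcnt
// into separate counters, so a system-scope wait for loads and stores emitted
// here looks roughly like
//
//   s_wait_bvhcnt 0x0
//   s_wait_samplecnt 0x0
//   s_wait_loadcnt 0x0
//   s_wait_storecnt 0x0
//
// with "s_wait_dscnt 0x0" added only for cross-address-space LDS ordering. The
// _soft forms let SIInsertWaitcnts tighten or drop them later; sketch only.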
2292
2293bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2294 SIAtomicScope Scope,
2295 SIAtomicAddrSpace AddrSpace,
2296 Position Pos) const {
2297 if (!InsertCacheInv)
2298 return false;
2299
2300 MachineBasicBlock &MBB = *MI->getParent();
2301 DebugLoc DL = MI->getDebugLoc();
2302
2303 /// The scratch address space does not need the global memory cache
2304 /// to be flushed as all memory operations by the same thread are
2305 /// sequentially consistent, and no other thread can access scratch
2306 /// memory.
2307
2308 /// Other address spaces do not have a cache.
2309 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2310 return false;
2311
2312 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2313 switch (Scope) {
2314 case SIAtomicScope::SYSTEM:
2315 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2316 break;
2317 case SIAtomicScope::AGENT:
2318 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2319 break;
2320 case SIAtomicScope::WORKGROUP:
2321 // In WGP mode the waves of a work-group can be executing on either CU of
2322 // the WGP. Therefore we need to invalidate the L0 which is per CU.
2323 // Otherwise in CU mode all waves of a work-group are on the same CU, and so
2324 // the L0 does not need to be invalidated.
2325 if (ST.isCuModeEnabled())
2326 return false;
2327
2328 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2329 break;
2330 case SIAtomicScope::WAVEFRONT:
2331 case SIAtomicScope::SINGLETHREAD:
2332 // No cache to invalidate.
2333 return false;
2334 default:
2335 llvm_unreachable("Unsupported synchronization scope");
2336 }
2337
2338 if (Pos == Position::AFTER)
2339 ++MI;
2340
2341 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2342
2343 if (Pos == Position::AFTER)
2344 --MI;
2345
2346 return true;
2347}
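// Illustrative note (not part of the pass): the GFX12 acquire is a single
// scoped invalidate, e.g. roughly "global_inv scope:SCOPE_DEV" for agent
// scope or "global_inv scope:SCOPE_SYS" for system scope. The assembler
// spelling of the scope modifier is shown here only as a hedged sketch.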
2348
2349bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2350 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2351 bool IsVolatile, bool IsNonTemporal) const {
2352
2353 // Only handle load and store, not atomic read-modify-write instructions.
2354 assert(MI->mayLoad() ^ MI->mayStore());
2355
2356 // Only update load and store, not LLVM IR atomic read-modify-write
2357 // instructions. The latter are always marked as volatile, so they cannot
2358 // sensibly be handled here without pessimizing all atomics. They also do not
2359 // support the nontemporal attribute.
2360 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2361
2362 bool Changed = false;
2363
2364 if (IsVolatile) {
2365 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2366
2367 // Ensure operation has completed at system scope to cause all volatile
2368 // operations to be visible outside the program in a global order. Do not
2369 // request cross address space as only the global address space can be
2370 // observable outside the program, so no need to cause a waitcnt for LDS
2371 // address space operations.
2372 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2373 Position::AFTER);
2374 }
2375
2376 if (IsNonTemporal) {
2377 // Set non-temporal hint for all cache levels.
2378 Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2379 }
2380
2381 return Changed;
2382}
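// Illustrative note (not part of the pass): on GFX12 a volatile access keeps
// its opcode and only widens the cache-policy scope, roughly
//
//   global_load_b32 ... scope:SCOPE_SYS
//   s_wait_loadcnt 0x0
//
// while a nontemporal access gets the temporal hint "th:TH_NT". Mnemonics and
// modifier spellings are a sketch; the mechanism is the cpol rewrite above.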
2383
2384bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2385 if (AtomicPseudoMIs.empty())
2386 return false;
2387
2388 for (auto &MI : AtomicPseudoMIs)
2389 MI->eraseFromParent();
2390
2391 AtomicPseudoMIs.clear();
2392 return true;
2393}
2394
2395bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2396 MachineBasicBlock::iterator &MI) {
2397 assert(MI->mayLoad() && !MI->mayStore());
2398
2399 bool Changed = false;
2400
2401 if (MOI.isAtomic()) {
2402 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2403 MOI.getOrdering() == AtomicOrdering::Acquire ||
2404 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2405 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2406 MOI.getOrderingAddrSpace());
2407 }
2408
2409 if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2410 Changed |= CC->insertWait(MI, MOI.getScope(),
2411 MOI.getOrderingAddrSpace(),
2412 SIMemOp::LOAD | SIMemOp::STORE,
2413 MOI.getIsCrossAddressSpaceOrdering(),
2414 Position::BEFORE);
2415
2416 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2417 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2418 Changed |= CC->insertWait(MI, MOI.getScope(),
2419 MOI.getInstrAddrSpace(),
2420 SIMemOp::LOAD,
2421 MOI.getIsCrossAddressSpaceOrdering(),
2422 Position::AFTER);
2423 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2424 MOI.getOrderingAddrSpace(),
2425 Position::AFTER);
2426 }
2427
2428 return Changed;
2429 }
2430
2431 // Atomic instructions already bypass caches to the scope specified by the
2432 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2433 // need additional treatment.
2434 Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2435 SIMemOp::LOAD, MOI.isVolatile(),
2436 MOI.isNonTemporal());
2437 return Changed;
2438}
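// Illustrative note (not part of the pass): putting the hooks together, a
// sequentially consistent agent-scope atomic load expands to roughly
//
//   <wait for all prior vmem/lds>      // insertWait BEFORE
//   <load with cache bypass bits set>  // enableLoadCacheBypass
//   <wait for the load itself>         // insertWait AFTER
//   <cache invalidate>                 // insertAcquire AFTER
//
// with each step specialized by the SICacheControl subclass for the target.
// This is a hedged summary of the calls above, not an exact listing.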
2439
2440bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2441 MachineBasicBlock::iterator &MI) {
2442 assert(!MI->mayLoad() && MI->mayStore());
2443
2444 bool Changed = false;
2445
2446 if (MOI.isAtomic()) {
2447 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2448 MOI.getOrdering() == AtomicOrdering::Release ||
2449 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2450 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2451 MOI.getOrderingAddrSpace());
2452 }
2453
2454 if (MOI.getOrdering() == AtomicOrdering::Release ||
2455 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2456 Changed |= CC->insertRelease(MI, MOI.getScope(),
2457 MOI.getOrderingAddrSpace(),
2458 MOI.getIsCrossAddressSpaceOrdering(),
2459 Position::BEFORE);
2460
2461 return Changed;
2462 }
2463
2464 // Atomic instructions already bypass caches to the scope specified by the
2465 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2466 // need additional treatment.
2467 Changed |= CC->enableVolatileAndOrNonTemporal(
2468 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2469 MOI.isNonTemporal());
2470 return Changed;
2471}
2472
2473bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2474 MachineBasicBlock::iterator &MI) {
2475 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2476
2477 AtomicPseudoMIs.push_back(MI);
2478 bool Changed = false;
2479
2480 if (MOI.isAtomic()) {
2481 if (MOI.getOrdering() == AtomicOrdering::Acquire)
2482 Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2483 SIMemOp::LOAD | SIMemOp::STORE,
2484 MOI.getIsCrossAddressSpaceOrdering(),
2485 Position::BEFORE);
2486
2487 if (MOI.getOrdering() == AtomicOrdering::Release ||
2488 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2489 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2490 /// TODO: This relies on a barrier always generating a waitcnt
2491 /// for LDS to ensure it is not reordered with the completion of
2492 /// the preceding LDS operations. If the barrier had a memory
2493 /// ordering and memory scope, then the library would not need to
2494 /// generate a fence. Could add support in this file for
2495 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2496 /// adding S_WAITCNT before a S_BARRIER.
2497 Changed |= CC->insertRelease(MI, MOI.getScope(),
2498 MOI.getOrderingAddrSpace(),
2499 MOI.getIsCrossAddressSpaceOrdering(),
2500 Position::BEFORE);
2501
2502 // TODO: If both release and invalidate are happening they could be combined
2503 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2504 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2505 // track cache invalidate and write back instructions.
2506
2507 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2508 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2509 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2510 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2511 MOI.getOrderingAddrSpace(),
2512 Position::BEFORE);
2513
2514 return Changed;
2515 }
2516
2517 return Changed;
2518}
2519
2520bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2521 MachineBasicBlock::iterator &MI) {
2522 assert(MI->mayLoad() && MI->mayStore());
2523
2524 bool Changed = false;
2525
2526 if (MOI.isAtomic()) {
2527 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2528 MOI.getOrdering() == AtomicOrdering::Acquire ||
2529 MOI.getOrdering() == AtomicOrdering::Release ||
2530 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2531 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2532 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2533 MOI.getInstrAddrSpace());
2534 }
2535
2536 if (MOI.getOrdering() == AtomicOrdering::Release ||
2537 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2538 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2539 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2540 Changed |= CC->insertRelease(MI, MOI.getScope(),
2541 MOI.getOrderingAddrSpace(),
2542 MOI.getIsCrossAddressSpaceOrdering(),
2543 Position::BEFORE);
2544
2545 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2546 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2547 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2548 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2549 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2550 Changed |= CC->insertWait(MI, MOI.getScope(),
2551 MOI.getInstrAddrSpace(),
2552 isAtomicRet(*MI) ? SIMemOp::LOAD :
2553 SIMemOp::STORE,
2554 MOI.getIsCrossAddressSpaceOrdering(),
2555 Position::AFTER);
2556 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2557 MOI.getOrderingAddrSpace(),
2558 Position::AFTER);
2559 }
2560
2561 return Changed;
2562 }
2563
2564 return Changed;
2565}
2566
2567bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2568 bool Changed = false;
2569
2570 SIMemOpAccess MOA(MF);
2571 CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2572
2573 for (auto &MBB : MF) {
2574 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2575
2576 // Unbundle instructions after the post-RA scheduler.
2577 if (MI->isBundle() && MI->mayLoadOrStore()) {
2578 MachineBasicBlock::instr_iterator II(MI->getIterator());
2579 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2580 I != E && I->isBundledWithPred(); ++I) {
2581 I->unbundleFromPred();
2582 for (MachineOperand &MO : I->operands())
2583 if (MO.isReg())
2584 MO.setIsInternalRead(false);
2585 }
2586
2587 MI->eraseFromParent();
2588 MI = II->getIterator();
2589 }
2590
2591 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2592 continue;
2593
2594 if (const auto &MOI = MOA.getLoadInfo(MI))
2595 Changed |= expandLoad(*MOI, MI);
2596 else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2597 Changed |= expandStore(*MOI, MI);
2598 Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
2599 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2600 Changed |= expandAtomicFence(*MOI, MI);
2601 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2602 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2603 }
2604 }
2605
2606 Changed |= removeAtomicPseudoMIs();
2607 return Changed;
2608}
2609
2610INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2611
2612char SIMemoryLegalizer::ID = 0;
2613char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2614
2615FunctionPass *llvm::createSIMemoryLegalizerPass() {
2616 return new SIMemoryLegalizer();
2617}