1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
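//
// A rough sketch of the transformation (illustrative only; the exact bits and
// instructions are chosen per subtarget by the SICacheControl subclasses
// defined below): an IR-level agent-scope acquire load such as
//
//   %v = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire, align 4
//
// is legalized at the MIR level into approximately (GFX7 flavour):
//
//   buffer_load_dword ... glc   ; bypass L1 up to agent scope
//   s_waitcnt vmcnt(0)          ; wait for the load to complete
//   buffer_wbinvl1_vol          ; invalidate L1 so later loads see fresh data
//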
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/Support/AtomicOrdering.h"
25 #include "llvm/TargetParser/TargetParser.h"
26 
27 using namespace llvm;
28 using namespace llvm::AMDGPU;
29 
30 #define DEBUG_TYPE "si-memory-legalizer"
31 #define PASS_NAME "SI Memory Legalizer"
32 
34  "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
35  cl::desc("Use this to skip inserting cache invalidating instructions."));
36 
37 namespace {
38 
39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
40 
41 /// Memory operation flags. Can be ORed together.
42 enum class SIMemOp {
43  NONE = 0u,
44  LOAD = 1u << 0,
45  STORE = 1u << 1,
46  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
47 };
48 
49 /// Position to insert a new instruction relative to an existing
50 /// instruction.
51 enum class Position {
52  BEFORE,
53  AFTER
54 };
55 
56 /// The atomic synchronization scopes supported by the AMDGPU target.
57 enum class SIAtomicScope {
58  NONE,
59  SINGLETHREAD,
60  WAVEFRONT,
61  WORKGROUP,
62  AGENT,
63  SYSTEM
64 };
65 
66 /// The distinct address spaces supported by the AMDGPU target for
67 /// atomic memory operation. Can be ORed together.
68 enum class SIAtomicAddrSpace {
69  NONE = 0u,
70  GLOBAL = 1u << 0,
71  LDS = 1u << 1,
72  SCRATCH = 1u << 2,
73  GDS = 1u << 3,
74  OTHER = 1u << 4,
75 
76  /// The address spaces that can be accessed by a FLAT instruction.
77  FLAT = GLOBAL | LDS | SCRATCH,
78 
79  /// The address spaces that support atomic instructions.
80  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
81 
82  /// All address spaces.
83  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
84 
85  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
86 };
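// Both enums above are tagged with LLVM_MARK_AS_BITMASK_ENUM, so their values
// compose with the ordinary bitwise operators. A typical query of this form
// (purely illustrative; the same pattern appears throughout this file) is:
//
//   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
//     // ... the operation touches globally visible memory ...
//   }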
87 
88 class SIMemOpInfo final {
89 private:
90 
91  friend class SIMemOpAccess;
92 
93  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
94  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
95  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
96  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
97  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
98  bool IsCrossAddressSpaceOrdering = false;
99  bool IsVolatile = false;
100  bool IsNonTemporal = false;
101 
102  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
103  SIAtomicScope Scope = SIAtomicScope::SYSTEM,
104  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
105  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
106  bool IsCrossAddressSpaceOrdering = true,
107  AtomicOrdering FailureOrdering =
108  AtomicOrdering::SequentiallyConsistent,
109  bool IsVolatile = false,
110  bool IsNonTemporal = false)
111  : Ordering(Ordering), FailureOrdering(FailureOrdering),
112  Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
113  InstrAddrSpace(InstrAddrSpace),
114  IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115  IsVolatile(IsVolatile),
116  IsNonTemporal(IsNonTemporal) {
117 
118  if (Ordering == AtomicOrdering::NotAtomic) {
119  assert(Scope == SIAtomicScope::NONE &&
120  OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
121  !IsCrossAddressSpaceOrdering &&
122  FailureOrdering == AtomicOrdering::NotAtomic);
123  return;
124  }
125 
126  assert(Scope != SIAtomicScope::NONE &&
127  (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
128  SIAtomicAddrSpace::NONE &&
129  (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130  SIAtomicAddrSpace::NONE);
131 
132  // There is also no cross address space ordering if the ordering
133  // address space is the same as the instruction address space and
134  // only contains a single address space.
135  if ((OrderingAddrSpace == InstrAddrSpace) &&
136  isPowerOf2_32(uint32_t(InstrAddrSpace)))
137  this->IsCrossAddressSpaceOrdering = false;
138 
139  // Limit the scope to the maximum supported by the instruction's address
140  // spaces.
141  if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142  SIAtomicAddrSpace::NONE) {
143  this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144  } else if ((InstrAddrSpace &
145  ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146  SIAtomicAddrSpace::NONE) {
147  this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148  } else if ((InstrAddrSpace &
149  ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151  this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152  }
153  }
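  // For example (illustrative): an atomic that only accesses LDS
  // (InstrAddrSpace == LDS) but requests agent scope is narrowed to work-group
  // scope here, because LDS is only visible within a single work-group.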
154 
155 public:
156  /// \returns Atomic synchronization scope of the machine instruction used to
157  /// create this SIMemOpInfo.
158  SIAtomicScope getScope() const {
159  return Scope;
160  }
161 
162  /// \returns Ordering constraint of the machine instruction used to
163  /// create this SIMemOpInfo.
164  AtomicOrdering getOrdering() const {
165  return Ordering;
166  }
167 
168  /// \returns Failure ordering constraint of the machine instruction used to
169  /// create this SIMemOpInfo.
170  AtomicOrdering getFailureOrdering() const {
171  return FailureOrdering;
172  }
173 
174  /// \returns The address spaces accessed by the machine
175  /// instruction used to create this SIMemOpInfo.
176  SIAtomicAddrSpace getInstrAddrSpace() const {
177  return InstrAddrSpace;
178  }
179 
180  /// \returns The address spaces that must be ordered by the machine
181  /// instruction used to create this SIMemOpInfo.
182  SIAtomicAddrSpace getOrderingAddrSpace() const {
183  return OrderingAddrSpace;
184  }
185 
186  /// \returns True iff memory ordering of operations on
187  /// different address spaces is required.
188  bool getIsCrossAddressSpaceOrdering() const {
189  return IsCrossAddressSpaceOrdering;
190  }
191 
192  /// \returns True if memory access of the machine instruction used to
193  /// create this SIMemOpInfo is volatile, false otherwise.
194  bool isVolatile() const {
195  return IsVolatile;
196  }
197 
198  /// \returns True if memory access of the machine instruction used to
199  /// create this SIMemOpInfo is nontemporal, false otherwise.
200  bool isNonTemporal() const {
201  return IsNonTemporal;
202  }
203 
204  /// \returns True if ordering constraint of the machine instruction used to
205  /// create this SIMemOpInfo is unordered or higher, false otherwise.
206  bool isAtomic() const {
207  return Ordering != AtomicOrdering::NotAtomic;
208  }
209 
210 };
211 
212 class SIMemOpAccess final {
213 private:
214  AMDGPUMachineModuleInfo *MMI = nullptr;
215 
216  /// Reports unsupported message \p Msg for \p MI to LLVM context.
217  void reportUnsupported(const MachineBasicBlock::iterator &MI,
218  const char *Msg) const;
219 
220  /// Inspects the target synchronization scope \p SSID and determines
221  /// the SI atomic scope it corresponds to, the address spaces it
222  /// covers, and whether the memory ordering applies between address
223  /// spaces.
224  std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
225  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
226 
227  /// \returns A bit set of the SI atomic address spaces corresponding to \p AS.
228  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
229 
230  /// \returns Info constructed from \p MI, which has at least machine memory
231  /// operand.
232  std::optional<SIMemOpInfo>
233  constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
234 
235 public:
236  /// Construct class to support accessing the machine memory operands
237  /// of instructions in the machine function \p MF.
238  SIMemOpAccess(MachineFunction &MF);
239 
240  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
241  std::optional<SIMemOpInfo>
242  getLoadInfo(const MachineBasicBlock::iterator &MI) const;
243 
244  /// \returns Store info if \p MI is a store operation, "std::nullopt"
245  /// otherwise.
246  std::optional<SIMemOpInfo>
247  getStoreInfo(const MachineBasicBlock::iterator &MI) const;
248 
249  /// \returns Atomic fence info if \p MI is an atomic fence operation,
250  /// "std::nullopt" otherwise.
251  std::optional<SIMemOpInfo>
252  getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
253 
254  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
255  /// rmw operation, "std::nullopt" otherwise.
256  std::optional<SIMemOpInfo>
257  getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
258 };
259 
260 class SICacheControl {
261 protected:
262 
263  /// AMDGPU subtarget info.
264  const GCNSubtarget &ST;
265 
266  /// Instruction info.
267  const SIInstrInfo *TII = nullptr;
268 
269  IsaVersion IV;
270 
271  /// Whether to insert cache invalidating instructions.
272  bool InsertCacheInv;
273 
274  SICacheControl(const GCNSubtarget &ST);
275 
276  /// Sets the cache policy bit \p Bit to "true" if present in instruction \p MI.
277  /// \returns Returns true if \p MI is modified, false otherwise.
278  bool enableNamedBit(const MachineBasicBlock::iterator MI,
279  AMDGPU::CPol::CPol Bit) const;
280 
281 public:
282 
283  /// Create a cache control for the subtarget \p ST.
284  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
285 
286  /// Update \p MI memory load instruction to bypass any caches up to
287  /// the \p Scope memory scope for address spaces \p
288  /// AddrSpace. Return true iff the instruction was modified.
289  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
290  SIAtomicScope Scope,
291  SIAtomicAddrSpace AddrSpace) const = 0;
292 
293  /// Update \p MI memory store instruction to bypass any caches up to
294  /// the \p Scope memory scope for address spaces \p
295  /// AddrSpace. Return true iff the instruction was modified.
296  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
297  SIAtomicScope Scope,
298  SIAtomicAddrSpace AddrSpace) const = 0;
299 
300  /// Update \p MI memory read-modify-write instruction to bypass any caches up
301  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
302  /// iff the instruction was modified.
303  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
304  SIAtomicScope Scope,
305  SIAtomicAddrSpace AddrSpace) const = 0;
306 
307  /// Update \p MI memory instruction of kind \p Op associated with address
308  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
309  /// true iff the instruction was modified.
310  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
311  SIAtomicAddrSpace AddrSpace,
312  SIMemOp Op, bool IsVolatile,
313  bool IsNonTemporal) const = 0;
314 
315  /// Inserts any necessary instructions at position \p Pos relative
316  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
317  /// \p Op associated with address spaces \p AddrSpace have completed. Used
318  /// between memory instructions to enforce the order they become visible as
319  /// observed by other memory instructions executing in memory scope \p Scope.
320  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
321  /// address spaces. Returns true iff any instructions inserted.
322  virtual bool insertWait(MachineBasicBlock::iterator &MI,
323  SIAtomicScope Scope,
324  SIAtomicAddrSpace AddrSpace,
325  SIMemOp Op,
326  bool IsCrossAddrSpaceOrdering,
327  Position Pos) const = 0;
328 
329  /// Inserts any necessary instructions at position \p Pos relative to
330  /// instruction \p MI to ensure any subsequent memory instructions of this
331  /// thread with address spaces \p AddrSpace will observe the previous memory
332  /// operations by any thread for memory scopes up to memory scope \p Scope .
333  /// Returns true iff any instructions inserted.
334  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
335  SIAtomicScope Scope,
336  SIAtomicAddrSpace AddrSpace,
337  Position Pos) const = 0;
338 
339  /// Inserts any necessary instructions at position \p Pos relative to
340  /// instruction \p MI to ensure previous memory instructions by this thread
341  /// with address spaces \p AddrSpace have completed and can be observed by
342  /// subsequent memory instructions by any thread executing in memory scope \p
343  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
344  /// between address spaces. Returns true iff any instructions inserted.
345  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
346  SIAtomicScope Scope,
347  SIAtomicAddrSpace AddrSpace,
348  bool IsCrossAddrSpaceOrdering,
349  Position Pos) const = 0;
350 
351  /// Virtual destructor to allow derivations to be deleted.
352  virtual ~SICacheControl() = default;
353 
354 };
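// How these hooks fit together (a sketch only; the exact sequencing is
// implemented by the SIMemoryLegalizer::expand* methods later in this file):
// an acquire operation gets insertWait followed by insertAcquire after the
// access, a release operation gets insertRelease before the access, and an
// acq_rel or seq_cst read-modify-write gets both treatments.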
355 
356 class SIGfx6CacheControl : public SICacheControl {
357 protected:
358 
359  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
360  /// is modified, false otherwise.
361  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
362  return enableNamedBit(MI, AMDGPU::CPol::GLC);
363  }
364 
365  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
366  /// is modified, false otherwise.
367  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
368  return enableNamedBit(MI, AMDGPU::CPol::SLC);
369  }
370 
371 public:
372 
373  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
374 
375  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
376  SIAtomicScope Scope,
377  SIAtomicAddrSpace AddrSpace) const override;
378 
379  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
380  SIAtomicScope Scope,
381  SIAtomicAddrSpace AddrSpace) const override;
382 
383  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
384  SIAtomicScope Scope,
385  SIAtomicAddrSpace AddrSpace) const override;
386 
387  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
388  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
389  bool IsVolatile,
390  bool IsNonTemporal) const override;
391 
392  bool insertWait(MachineBasicBlock::iterator &MI,
393  SIAtomicScope Scope,
394  SIAtomicAddrSpace AddrSpace,
395  SIMemOp Op,
396  bool IsCrossAddrSpaceOrdering,
397  Position Pos) const override;
398 
399  bool insertAcquire(MachineBasicBlock::iterator &MI,
400  SIAtomicScope Scope,
401  SIAtomicAddrSpace AddrSpace,
402  Position Pos) const override;
403 
404  bool insertRelease(MachineBasicBlock::iterator &MI,
405  SIAtomicScope Scope,
406  SIAtomicAddrSpace AddrSpace,
407  bool IsCrossAddrSpaceOrdering,
408  Position Pos) const override;
409 };
410 
411 class SIGfx7CacheControl : public SIGfx6CacheControl {
412 public:
413 
414  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
415 
416  bool insertAcquire(MachineBasicBlock::iterator &MI,
417  SIAtomicScope Scope,
418  SIAtomicAddrSpace AddrSpace,
419  Position Pos) const override;
420 
421 };
422 
423 class SIGfx90ACacheControl : public SIGfx7CacheControl {
424 public:
425 
426  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
427 
428  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
429  SIAtomicScope Scope,
430  SIAtomicAddrSpace AddrSpace) const override;
431 
432  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
433  SIAtomicScope Scope,
434  SIAtomicAddrSpace AddrSpace) const override;
435 
436  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
437  SIAtomicScope Scope,
438  SIAtomicAddrSpace AddrSpace) const override;
439 
440  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
441  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
442  bool IsVolatile,
443  bool IsNonTemporal) const override;
444 
445  bool insertWait(MachineBasicBlock::iterator &MI,
446  SIAtomicScope Scope,
447  SIAtomicAddrSpace AddrSpace,
448  SIMemOp Op,
449  bool IsCrossAddrSpaceOrdering,
450  Position Pos) const override;
451 
452  bool insertAcquire(MachineBasicBlock::iterator &MI,
453  SIAtomicScope Scope,
454  SIAtomicAddrSpace AddrSpace,
455  Position Pos) const override;
456 
457  bool insertRelease(MachineBasicBlock::iterator &MI,
458  SIAtomicScope Scope,
459  SIAtomicAddrSpace AddrSpace,
460  bool IsCrossAddrSpaceOrdering,
461  Position Pos) const override;
462 };
463 
464 class SIGfx940CacheControl : public SIGfx90ACacheControl {
465 protected:
466 
467  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
468  /// is modified, false otherwise.
469  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
470  return enableNamedBit(MI, AMDGPU::CPol::SC0);
471  }
472 
473  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
474  /// is modified, false otherwise.
475  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
476  return enableNamedBit(MI, AMDGPU::CPol::SC1);
477  }
478 
479  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
480  /// is modified, false otherwise.
481  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
482  return enableNamedBit(MI, AMDGPU::CPol::NT);
483  }
484 
485 public:
486 
487  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
488 
489  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
490  SIAtomicScope Scope,
491  SIAtomicAddrSpace AddrSpace) const override;
492 
493  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
494  SIAtomicScope Scope,
495  SIAtomicAddrSpace AddrSpace) const override;
496 
497  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
498  SIAtomicScope Scope,
499  SIAtomicAddrSpace AddrSpace) const override;
500 
501  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
502  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
503  bool IsVolatile,
504  bool IsNonTemporal) const override;
505 
506  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
507  SIAtomicAddrSpace AddrSpace, Position Pos) const override;
508 
509  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
510  SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
511  Position Pos) const override;
512 };
513 
514 class SIGfx10CacheControl : public SIGfx7CacheControl {
515 protected:
516 
517  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
518  /// is modified, false otherwise.
519  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
520  return enableNamedBit(MI, AMDGPU::CPol::DLC);
521  }
522 
523 public:
524 
525  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
526 
527  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
528  SIAtomicScope Scope,
529  SIAtomicAddrSpace AddrSpace) const override;
530 
531  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
532  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
533  bool IsVolatile,
534  bool IsNonTemporal) const override;
535 
536  bool insertWait(MachineBasicBlock::iterator &MI,
537  SIAtomicScope Scope,
538  SIAtomicAddrSpace AddrSpace,
539  SIMemOp Op,
540  bool IsCrossAddrSpaceOrdering,
541  Position Pos) const override;
542 
543  bool insertAcquire(MachineBasicBlock::iterator &MI,
544  SIAtomicScope Scope,
545  SIAtomicAddrSpace AddrSpace,
546  Position Pos) const override;
547 };
548 
549 class SIGfx11CacheControl : public SIGfx10CacheControl {
550 public:
551  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
552 
553  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
554  SIAtomicScope Scope,
555  SIAtomicAddrSpace AddrSpace) const override;
556 
557  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
558  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
559  bool IsVolatile,
560  bool IsNonTemporal) const override;
561 };
562 
563 class SIMemoryLegalizer final : public MachineFunctionPass {
564 private:
565 
566  /// Cache Control.
567  std::unique_ptr<SICacheControl> CC = nullptr;
568 
569  /// List of atomic pseudo instructions.
570  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
571 
572  /// Return true iff instruction \p MI is an atomic instruction that
573  /// returns a result.
574  bool isAtomicRet(const MachineInstr &MI) const {
575  return SIInstrInfo::isAtomicRet(MI);
576  }
577 
578  /// Removes all processed atomic pseudo instructions from the current
579  /// function. Returns true if current function is modified, false otherwise.
580  bool removeAtomicPseudoMIs();
581 
582  /// Expands load operation \p MI. Returns true if instructions are
583  /// added/deleted or \p MI is modified, false otherwise.
584  bool expandLoad(const SIMemOpInfo &MOI,
585  MachineBasicBlock::iterator &MI);
586  /// Expands store operation \p MI. Returns true if instructions are
587  /// added/deleted or \p MI is modified, false otherwise.
588  bool expandStore(const SIMemOpInfo &MOI,
589  MachineBasicBlock::iterator &MI);
590  /// Expands atomic fence operation \p MI. Returns true if
591  /// instructions are added/deleted or \p MI is modified, false otherwise.
592  bool expandAtomicFence(const SIMemOpInfo &MOI,
593  MachineBasicBlock::iterator &MI);
594  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
595  /// instructions are added/deleted or \p MI is modified, false otherwise.
596  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
597  MachineBasicBlock::iterator &MI);
598 
599 public:
600  static char ID;
601 
602  SIMemoryLegalizer() : MachineFunctionPass(ID) {}
603 
604  void getAnalysisUsage(AnalysisUsage &AU) const override {
605  AU.setPreservesCFG();
606  MachineFunctionPass::getAnalysisUsage(AU);
607  }
608 
609  StringRef getPassName() const override {
610  return PASS_NAME;
611  }
612 
613  bool runOnMachineFunction(MachineFunction &MF) override;
614 };
615 
616 } // end namespace anonymous
617 
618 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
619  const char *Msg) const {
620  const Function &Func = MI->getParent()->getParent()->getFunction();
621  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
622  Func.getContext().diagnose(Diag);
623 }
624 
625 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
626 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
627  SIAtomicAddrSpace InstrAddrSpace) const {
628  if (SSID == SyncScope::System)
629  return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
630  if (SSID == MMI->getAgentSSID())
631  return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
632  if (SSID == MMI->getWorkgroupSSID())
633  return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
634  true);
635  if (SSID == MMI->getWavefrontSSID())
636  return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
637  true);
638  if (SSID == SyncScope::SingleThread)
639  return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
640  true);
641  if (SSID == MMI->getSystemOneAddressSpaceSSID())
642  return std::tuple(SIAtomicScope::SYSTEM,
643  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
644  if (SSID == MMI->getAgentOneAddressSpaceSSID())
645  return std::tuple(SIAtomicScope::AGENT,
646  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
647  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
648  return std::tuple(SIAtomicScope::WORKGROUP,
649  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
650  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
651  return std::tuple(SIAtomicScope::WAVEFRONT,
652  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
653  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
654  return std::tuple(SIAtomicScope::SINGLETHREAD,
655  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
656  return std::nullopt;
657 }
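// Example (illustrative): an operand carrying syncscope("workgroup-one-as")
// maps to {SIAtomicScope::WORKGROUP, ATOMIC & InstrAddrSpace, false}; the
// "one address space" scopes only order the address spaces the instruction
// itself accesses and never require cross-address-space ordering.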
658 
659 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
660  if (AS == AMDGPUAS::FLAT_ADDRESS)
661  return SIAtomicAddrSpace::FLAT;
662  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
663  return SIAtomicAddrSpace::GLOBAL;
664  if (AS == AMDGPUAS::LOCAL_ADDRESS)
665  return SIAtomicAddrSpace::LDS;
666  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
667  return SIAtomicAddrSpace::SCRATCH;
668  if (AS == AMDGPUAS::REGION_ADDRESS)
669  return SIAtomicAddrSpace::GDS;
670 
671  return SIAtomicAddrSpace::OTHER;
672 }
673 
674 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
675  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
676 }
677 
678 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
679  const MachineBasicBlock::iterator &MI) const {
680  assert(MI->getNumMemOperands() > 0);
681 
682  SyncScope::ID SSID = SyncScope::SingleThread;
683  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
684  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
685  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
686  bool IsNonTemporal = true;
687  bool IsVolatile = false;
688 
689  // Validator should check whether or not MMOs cover the entire set of
690  // locations accessed by the memory instruction.
691  for (const auto &MMO : MI->memoperands()) {
692  IsNonTemporal &= MMO->isNonTemporal();
693  IsVolatile |= MMO->isVolatile();
694  InstrAddrSpace |=
695  toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
696  AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
697  if (OpOrdering != AtomicOrdering::NotAtomic) {
698  const auto &IsSyncScopeInclusion =
699  MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
700  if (!IsSyncScopeInclusion) {
701  reportUnsupported(MI,
702  "Unsupported non-inclusive atomic synchronization scope");
703  return std::nullopt;
704  }
705 
706  SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
707  Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
708  assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
709  MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
710  FailureOrdering =
711  getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
712  }
713  }
714 
715  SIAtomicScope Scope = SIAtomicScope::NONE;
716  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
717  bool IsCrossAddressSpaceOrdering = false;
718  if (Ordering != AtomicOrdering::NotAtomic) {
719  auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
720  if (!ScopeOrNone) {
721  reportUnsupported(MI, "Unsupported atomic synchronization scope");
722  return std::nullopt;
723  }
724  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
725  *ScopeOrNone;
726  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
727  ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
728  ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
729  reportUnsupported(MI, "Unsupported atomic address space");
730  return std::nullopt;
731  }
732  }
733  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
734  IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
735  IsNonTemporal);
736 }
737 
738 std::optional<SIMemOpInfo>
739 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
740  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
741 
742  if (!(MI->mayLoad() && !MI->mayStore()))
743  return std::nullopt;
744 
745  // Be conservative if there are no memory operands.
746  if (MI->getNumMemOperands() == 0)
747  return SIMemOpInfo();
748 
749  return constructFromMIWithMMO(MI);
750 }
751 
752 std::optional<SIMemOpInfo>
753 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
754  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
755 
756  if (!(!MI->mayLoad() && MI->mayStore()))
757  return std::nullopt;
758 
759  // Be conservative if there are no memory operands.
760  if (MI->getNumMemOperands() == 0)
761  return SIMemOpInfo();
762 
763  return constructFromMIWithMMO(MI);
764 }
765 
766 std::optional<SIMemOpInfo>
767 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
768  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
769 
770  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
771  return std::nullopt;
772 
773  AtomicOrdering Ordering =
774  static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
775 
776  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
777  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
778  if (!ScopeOrNone) {
779  reportUnsupported(MI, "Unsupported atomic synchronization scope");
780  return std::nullopt;
781  }
782 
783  SIAtomicScope Scope = SIAtomicScope::NONE;
784  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
785  bool IsCrossAddressSpaceOrdering = false;
786  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
787  *ScopeOrNone;
788 
789  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
790  ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
791  reportUnsupported(MI, "Unsupported atomic address space");
792  return std::nullopt;
793  }
794 
795  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
796  IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
797 }
798 
799 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
800  const MachineBasicBlock::iterator &MI) const {
801  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
802 
803  if (!(MI->mayLoad() && MI->mayStore()))
804  return std::nullopt;
805 
806  // Be conservative if there are no memory operands.
807  if (MI->getNumMemOperands() == 0)
808  return SIMemOpInfo();
809 
810  return constructFromMIWithMMO(MI);
811 }
812 
813 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
814  TII = ST.getInstrInfo();
815  IV = getIsaVersion(ST.getCPU());
816  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
817 }
818 
819 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
820  AMDGPU::CPol::CPol Bit) const {
821  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
822  if (!CPol)
823  return false;
824 
825  CPol->setImm(CPol->getImm() | Bit);
826  return true;
827 }
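// Illustrative effect: enableNamedBit(MI, AMDGPU::CPol::GLC) on a MUBUF load
// whose cpol operand is currently 0 turns, e.g., "buffer_load_dword v0, ..."
// into "buffer_load_dword v0, ... glc"; an instruction without a cpol operand
// is left untouched and the call returns false.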
828 
829 /* static */
830 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
831  GCNSubtarget::Generation Generation = ST.getGeneration();
832  if (ST.hasGFX940Insts())
833  return std::make_unique<SIGfx940CacheControl>(ST);
834  if (ST.hasGFX90AInsts())
835  return std::make_unique<SIGfx90ACacheControl>(ST);
836  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
837  return std::make_unique<SIGfx6CacheControl>(ST);
838  if (Generation < AMDGPUSubtarget::GFX10)
839  return std::make_unique<SIGfx7CacheControl>(ST);
840  if (Generation < AMDGPUSubtarget::GFX11)
841  return std::make_unique<SIGfx10CacheControl>(ST);
842  return std::make_unique<SIGfx11CacheControl>(ST);
843 }
844 
845 bool SIGfx6CacheControl::enableLoadCacheBypass(
846  const MachineBasicBlock::iterator &MI,
847  SIAtomicScope Scope,
848  SIAtomicAddrSpace AddrSpace) const {
849  assert(MI->mayLoad() && !MI->mayStore());
850  bool Changed = false;
851 
852  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
853  switch (Scope) {
854  case SIAtomicScope::SYSTEM:
855  case SIAtomicScope::AGENT:
856  // Set L1 cache policy to MISS_EVICT.
857  // Note: there is no L2 cache bypass policy at the ISA level.
858  Changed |= enableGLCBit(MI);
859  break;
860  case SIAtomicScope::WORKGROUP:
861  case SIAtomicScope::WAVEFRONT:
862  case SIAtomicScope::SINGLETHREAD:
863  // No cache to bypass.
864  break;
865  default:
866  llvm_unreachable("Unsupported synchronization scope");
867  }
868  }
869 
870  /// The scratch address space does not need the global memory caches
871  /// to be bypassed as all memory operations by the same thread are
872  /// sequentially consistent, and no other thread can access scratch
873  /// memory.
874 
875  /// Other address spaces do not have a cache.
876 
877  return Changed;
878 }
879 
880 bool SIGfx6CacheControl::enableStoreCacheBypass(
881  const MachineBasicBlock::iterator &MI,
882  SIAtomicScope Scope,
883  SIAtomicAddrSpace AddrSpace) const {
884  assert(!MI->mayLoad() && MI->mayStore());
885  bool Changed = false;
886 
887  /// The L1 cache is write through, so it does not need to be bypassed. There
888  /// is no bypass control for the L2 cache at the ISA level.
889 
890  return Changed;
891 }
892 
893 bool SIGfx6CacheControl::enableRMWCacheBypass(
894  const MachineBasicBlock::iterator &MI,
895  SIAtomicScope Scope,
896  SIAtomicAddrSpace AddrSpace) const {
897  assert(MI->mayLoad() && MI->mayStore());
898  bool Changed = false;
899 
900  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
901  /// bypassed, and the GLC bit is instead used to indicate if they are
902  /// return or no-return.
903  /// Note: there is no L2 cache coherent bypass control at the ISA level.
904 
905  return Changed;
906 }
907 
908 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
909  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
910  bool IsVolatile, bool IsNonTemporal) const {
911  // Only handle load and store, not atomic read-modify-write instructions. The
912  // latter use glc to indicate if the atomic returns a result and so must not
913  // be used for cache control.
914  assert(MI->mayLoad() ^ MI->mayStore());
915 
916  // Only update load and store, not LLVM IR atomic read-modify-write
917  // instructions. The latter are always marked as volatile, so they cannot
918  // sensibly be handled here without pessimizing all atomics. They also do
919  // not support the nontemporal attribute.
920  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
921 
922  bool Changed = false;
923 
924  if (IsVolatile) {
925  // Set L1 cache policy to be MISS_EVICT for load instructions
926  // and MISS_LRU for store instructions.
927  // Note: there is no L2 cache bypass policy at the ISA level.
928  if (Op == SIMemOp::LOAD)
929  Changed |= enableGLCBit(MI);
930 
931  // Ensure operation has completed at system scope to cause all volatile
932  // operations to be visible outside the program in a global order. Do not
933  // request cross address space as only the global address space can be
934  // observable outside the program, so no need to cause a waitcnt for LDS
935  // address space operations.
936  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
937  Position::AFTER);
938 
939  return Changed;
940  }
941 
942  if (IsNonTemporal) {
943  // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
944  // for both loads and stores, and the L2 cache policy to STREAM.
945  Changed |= enableGLCBit(MI);
946  Changed |= enableSLCBit(MI);
947  return Changed;
948  }
949 
950  return Changed;
951 }
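// Illustrative outcome on this subtarget: a volatile global load becomes
// "buffer_load_dword ... glc" followed by "s_waitcnt vmcnt(0)", while a
// nontemporal global store becomes "buffer_store_dword ... glc slc" with no
// extra wait.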
952 
953 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
954  SIAtomicScope Scope,
955  SIAtomicAddrSpace AddrSpace,
956  SIMemOp Op,
957  bool IsCrossAddrSpaceOrdering,
958  Position Pos) const {
959  bool Changed = false;
960 
961  MachineBasicBlock &MBB = *MI->getParent();
962  DebugLoc DL = MI->getDebugLoc();
963 
964  if (Pos == Position::AFTER)
965  ++MI;
966 
967  bool VMCnt = false;
968  bool LGKMCnt = false;
969 
970  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
971  SIAtomicAddrSpace::NONE) {
972  switch (Scope) {
973  case SIAtomicScope::SYSTEM:
974  case SIAtomicScope::AGENT:
975  VMCnt |= true;
976  break;
977  case SIAtomicScope::WORKGROUP:
978  case SIAtomicScope::WAVEFRONT:
979  case SIAtomicScope::SINGLETHREAD:
980  // The L1 cache keeps all memory operations in order for
981  // wavefronts in the same work-group.
982  break;
983  default:
984  llvm_unreachable("Unsupported synchronization scope");
985  }
986  }
987 
988  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
989  switch (Scope) {
990  case SIAtomicScope::SYSTEM:
991  case SIAtomicScope::AGENT:
992  case SIAtomicScope::WORKGROUP:
993  // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
994  // not needed as LDS operations for all waves are executed in a total
995  // global ordering as observed by all waves. Required if also
996  // synchronizing with global/GDS memory as LDS operations could be
997  // reordered with respect to later global/GDS memory operations of the
998  // same wave.
999  LGKMCnt |= IsCrossAddrSpaceOrdering;
1000  break;
1001  case SIAtomicScope::WAVEFRONT:
1002  case SIAtomicScope::SINGLETHREAD:
1003  // The LDS keeps all memory operations in order for
1004  // the same wavefront.
1005  break;
1006  default:
1007  llvm_unreachable("Unsupported synchronization scope");
1008  }
1009  }
1010 
1011  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1012  switch (Scope) {
1013  case SIAtomicScope::SYSTEM:
1014  case SIAtomicScope::AGENT:
1015  // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1016  // is not needed as GDS operations for all waves are executed in a total
1017  // global ordering as observed by all waves. Required if also
1018  // synchronizing with global/LDS memory as GDS operations could be
1019  // reordered with respect to later global/LDS memory operations of the
1020  // same wave.
1021  LGKMCnt |= IsCrossAddrSpaceOrdering;
1022  break;
1023  case SIAtomicScope::WORKGROUP:
1024  case SIAtomicScope::WAVEFRONT:
1025  case SIAtomicScope::SINGLETHREAD:
1026  // The GDS keeps all memory operations in order for
1027  // the same work-group.
1028  break;
1029  default:
1030  llvm_unreachable("Unsupported synchronization scope");
1031  }
1032  }
1033 
1034  if (VMCnt || LGKMCnt) {
1035  unsigned WaitCntImmediate =
1036  AMDGPU::encodeWaitcnt(IV,
1037  VMCnt ? 0 : getVmcntBitMask(IV),
1038  getExpcntBitMask(IV),
1039  LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1040  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1041  Changed = true;
1042  }
1043 
1044  if (Pos == Position::AFTER)
1045  --MI;
1046 
1047  return Changed;
1048 }
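// Worked example (illustrative): for Scope == AGENT, AddrSpace == GLOBAL | LDS,
// Op == LOAD | STORE and IsCrossAddrSpaceOrdering == true, both VMCnt and
// LGKMCnt are set, so a single "s_waitcnt vmcnt(0) lgkmcnt(0)" is emitted;
// expcnt is left at its "no wait" maximum via getExpcntBitMask(IV).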
1049 
1050 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1051  SIAtomicScope Scope,
1052  SIAtomicAddrSpace AddrSpace,
1053  Position Pos) const {
1054  if (!InsertCacheInv)
1055  return false;
1056 
1057  bool Changed = false;
1058 
1059  MachineBasicBlock &MBB = *MI->getParent();
1060  DebugLoc DL = MI->getDebugLoc();
1061 
1062  if (Pos == Position::AFTER)
1063  ++MI;
1064 
1065  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1066  switch (Scope) {
1067  case SIAtomicScope::SYSTEM:
1068  case SIAtomicScope::AGENT:
1069  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1070  Changed = true;
1071  break;
1072  case SIAtomicScope::WORKGROUP:
1073  case SIAtomicScope::WAVEFRONT:
1074  case SIAtomicScope::SINGLETHREAD:
1075  // No cache to invalidate.
1076  break;
1077  default:
1078  llvm_unreachable("Unsupported synchronization scope");
1079  }
1080  }
1081 
1082  /// The scratch address space does not need the global memory cache
1083  /// to be flushed as all memory operations by the same thread are
1084  /// sequentially consistent, and no other thread can access scratch
1085  /// memory.
1086 
1087  /// Other address spaces do not have a cache.
1088 
1089  if (Pos == Position::AFTER)
1090  --MI;
1091 
1092  return Changed;
1093 }
1094 
1095 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1096  SIAtomicScope Scope,
1097  SIAtomicAddrSpace AddrSpace,
1098  bool IsCrossAddrSpaceOrdering,
1099  Position Pos) const {
1100  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1101  IsCrossAddrSpaceOrdering, Pos);
1102 }
1103 
1104 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1105  SIAtomicScope Scope,
1106  SIAtomicAddrSpace AddrSpace,
1107  Position Pos) const {
1108  if (!InsertCacheInv)
1109  return false;
1110 
1111  bool Changed = false;
1112 
1113  MachineBasicBlock &MBB = *MI->getParent();
1114  DebugLoc DL = MI->getDebugLoc();
1115 
1116  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1117 
1118  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1119  ? AMDGPU::BUFFER_WBINVL1
1120  : AMDGPU::BUFFER_WBINVL1_VOL;
1121 
1122  if (Pos == Position::AFTER)
1123  ++MI;
1124 
1125  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1126  switch (Scope) {
1127  case SIAtomicScope::SYSTEM:
1128  case SIAtomicScope::AGENT:
1129  BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1130  Changed = true;
1131  break;
1132  case SIAtomicScope::WORKGROUP:
1133  case SIAtomicScope::WAVEFRONT:
1134  case SIAtomicScope::SINGLETHREAD:
1135  // No cache to invalidate.
1136  break;
1137  default:
1138  llvm_unreachable("Unsupported synchronization scope");
1139  }
1140  }
1141 
1142  /// The scratch address space does not need the global memory cache
1143  /// to be flushed as all memory operations by the same thread are
1144  /// sequentially consistent, and no other thread can access scratch
1145  /// memory.
1146 
1147  /// Other address spaces do not have a cache.
1148 
1149  if (Pos == Position::AFTER)
1150  --MI;
1151 
1152  return Changed;
1153 }
1154 
1155 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1156  const MachineBasicBlock::iterator &MI,
1157  SIAtomicScope Scope,
1158  SIAtomicAddrSpace AddrSpace) const {
1159  assert(MI->mayLoad() && !MI->mayStore());
1160  bool Changed = false;
1161 
1162  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1163  switch (Scope) {
1164  case SIAtomicScope::SYSTEM:
1165  case SIAtomicScope::AGENT:
1166  // Set the L1 cache policy to MISS_LRU.
1167  // Note: there is no L2 cache bypass policy at the ISA level.
1168  Changed |= enableGLCBit(MI);
1169  break;
1170  case SIAtomicScope::WORKGROUP:
1171  // In threadgroup split mode the waves of a work-group can be executing on
1172  // different CUs. Therefore need to bypass the L1 which is per CU.
1173  // Otherwise in non-threadgroup split mode all waves of a work-group are
1174  // on the same CU, and so the L1 does not need to be bypassed.
1175  if (ST.isTgSplitEnabled())
1176  Changed |= enableGLCBit(MI);
1177  break;
1178  case SIAtomicScope::WAVEFRONT:
1179  case SIAtomicScope::SINGLETHREAD:
1180  // No cache to bypass.
1181  break;
1182  default:
1183  llvm_unreachable("Unsupported synchronization scope");
1184  }
1185  }
1186 
1187  /// The scratch address space does not need the global memory caches
1188  /// to be bypassed as all memory operations by the same thread are
1189  /// sequentially consistent, and no other thread can access scratch
1190  /// memory.
1191 
1192  /// Other address spaces do not have a cache.
1193 
1194  return Changed;
1195 }
1196 
1197 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1198  const MachineBasicBlock::iterator &MI,
1199  SIAtomicScope Scope,
1200  SIAtomicAddrSpace AddrSpace) const {
1201  assert(!MI->mayLoad() && MI->mayStore());
1202  bool Changed = false;
1203 
1204  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1205  switch (Scope) {
1206  case SIAtomicScope::SYSTEM:
1207  case SIAtomicScope::AGENT:
1208  /// Do not set glc for store atomic operations as they implicitly write
1209  /// through the L1 cache.
1210  break;
1211  case SIAtomicScope::WORKGROUP:
1212  case SIAtomicScope::WAVEFRONT:
1213  case SIAtomicScope::SINGLETHREAD:
1214  // No cache to bypass. Store atomics implicitly write through the L1
1215  // cache.
1216  break;
1217  default:
1218  llvm_unreachable("Unsupported synchronization scope");
1219  }
1220  }
1221 
1222  /// The scratch address space does not need the global memory caches
1223  /// to be bypassed as all memory operations by the same thread are
1224  /// sequentially consistent, and no other thread can access scratch
1225  /// memory.
1226 
1227  /// Other address spaces do not have a cache.
1228 
1229  return Changed;
1230 }
1231 
1232 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1233  const MachineBasicBlock::iterator &MI,
1234  SIAtomicScope Scope,
1235  SIAtomicAddrSpace AddrSpace) const {
1236  assert(MI->mayLoad() && MI->mayStore());
1237  bool Changed = false;
1238 
1239  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1240  switch (Scope) {
1241  case SIAtomicScope::SYSTEM:
1242  case SIAtomicScope::AGENT:
1243  /// Do not set glc for RMW atomic operations as they implicitly bypass
1244  /// the L1 cache, and the glc bit is instead used to indicate if they are
1245  /// return or no-return.
1246  break;
1247  case SIAtomicScope::WORKGROUP:
1248  case SIAtomicScope::WAVEFRONT:
1249  case SIAtomicScope::SINGLETHREAD:
1250  // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1251  break;
1252  default:
1253  llvm_unreachable("Unsupported synchronization scope");
1254  }
1255  }
1256 
1257  return Changed;
1258 }
1259 
1260 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1261  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1262  bool IsVolatile, bool IsNonTemporal) const {
1263  // Only handle load and store, not atomic read-modify-write instructions. The
1264  // latter use glc to indicate if the atomic returns a result and so must not
1265  // be used for cache control.
1266  assert(MI->mayLoad() ^ MI->mayStore());
1267 
1268  // Only update load and store, not LLVM IR atomic read-modify-write
1269  // instructions. The latter are always marked as volatile, so they cannot
1270  // sensibly be handled here without pessimizing all atomics. They also do
1271  // not support the nontemporal attribute.
1272  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1273 
1274  bool Changed = false;
1275 
1276  if (IsVolatile) {
1277  // Set L1 cache policy to be MISS_EVICT for load instructions
1278  // and MISS_LRU for store instructions.
1279  // Note: there is no L2 cache bypass policy at the ISA level.
1280  if (Op == SIMemOp::LOAD)
1281  Changed |= enableGLCBit(MI);
1282 
1283  // Ensure operation has completed at system scope to cause all volatile
1284  // operations to be visible outside the program in a global order. Do not
1285  // request cross address space as only the global address space can be
1286  // observable outside the program, so no need to cause a waitcnt for LDS
1287  // address space operations.
1288  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1289  Position::AFTER);
1290 
1291  return Changed;
1292  }
1293 
1294  if (IsNonTemporal) {
1295  // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1296  // for both loads and stores, and the L2 cache policy to STREAM.
1297  Changed |= enableGLCBit(MI);
1298  Changed |= enableSLCBit(MI);
1299  return Changed;
1300  }
1301 
1302  return Changed;
1303 }
1304 
1305 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1306  SIAtomicScope Scope,
1307  SIAtomicAddrSpace AddrSpace,
1308  SIMemOp Op,
1309  bool IsCrossAddrSpaceOrdering,
1310  Position Pos) const {
1311  if (ST.isTgSplitEnabled()) {
1312  // In threadgroup split mode the waves of a work-group can be executing on
1313  // different CUs. Therefore need to wait for global or GDS memory operations
1314  // to complete to ensure they are visible to waves in the other CUs.
1315  // Otherwise in non-threadgroup split mode all waves of a work-group are on
1316  // the same CU, so no need to wait for global memory as all waves in the
1317  // work-group access the same L1, nor wait for GDS as accesses are ordered
1318  // on a CU.
1319  if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1320  SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1321  (Scope == SIAtomicScope::WORKGROUP)) {
1322  // Same as GFX7 using agent scope.
1323  Scope = SIAtomicScope::AGENT;
1324  }
1325  // In threadgroup split mode LDS cannot be allocated so no need to wait for
1326  // LDS memory operations.
1327  AddrSpace &= ~SIAtomicAddrSpace::LDS;
1328  }
1329  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1330  IsCrossAddrSpaceOrdering, Pos);
1331 }
1332 
1333 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1334  SIAtomicScope Scope,
1335  SIAtomicAddrSpace AddrSpace,
1336  Position Pos) const {
1337  if (!InsertCacheInv)
1338  return false;
1339 
1340  bool Changed = false;
1341 
1342  MachineBasicBlock &MBB = *MI->getParent();
1343  DebugLoc DL = MI->getDebugLoc();
1344 
1345  if (Pos == Position::AFTER)
1346  ++MI;
1347 
1348  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1349  switch (Scope) {
1350  case SIAtomicScope::SYSTEM:
1351  // Ensures that following loads will not see stale remote VMEM data or
1352  // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1353  // CC will never be stale due to the local memory probes.
1354  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1355  // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1356  // hardware does not reorder memory operations by the same wave with
1357  // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1358  // remove any cache lines of earlier writes by the same wave and ensures
1359  // later reads by the same wave will refetch the cache lines.
1360  Changed = true;
1361  break;
1362  case SIAtomicScope::AGENT:
1363  // Same as GFX7.
1364  break;
1365  case SIAtomicScope::WORKGROUP:
1366  // In threadgroup split mode the waves of a work-group can be executing on
1367  // different CUs. Therefore need to invalidate the L1 which is per CU.
1368  // Otherwise in non-threadgroup split mode all waves of a work-group are
1369  // on the same CU, and so the L1 does not need to be invalidated.
1370  if (ST.isTgSplitEnabled()) {
1371  // Same as GFX7 using agent scope.
1372  Scope = SIAtomicScope::AGENT;
1373  }
1374  break;
1375  case SIAtomicScope::WAVEFRONT:
1376  case SIAtomicScope::SINGLETHREAD:
1377  // Same as GFX7.
1378  break;
1379  default:
1380  llvm_unreachable("Unsupported synchronization scope");
1381  }
1382  }
1383 
1384  /// The scratch address space does not need the global memory cache
1385  /// to be flushed as all memory operations by the same thread are
1386  /// sequentially consistent, and no other thread can access scratch
1387  /// memory.
1388 
1389  /// Other address spaces do not have a cache.
1390 
1391  if (Pos == Position::AFTER)
1392  --MI;
1393 
1394  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1395 
1396  return Changed;
1397 }
1398 
1399 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1400  SIAtomicScope Scope,
1401  SIAtomicAddrSpace AddrSpace,
1402  bool IsCrossAddrSpaceOrdering,
1403  Position Pos) const {
1404  bool Changed = false;
1405 
1406  MachineBasicBlock &MBB = *MI->getParent();
1407  DebugLoc DL = MI->getDebugLoc();
1408 
1409  if (Pos == Position::AFTER)
1410  ++MI;
1411 
1412  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1413  switch (Scope) {
1414  case SIAtomicScope::SYSTEM:
1415  // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1416  // hardware does not reorder memory operations by the same wave with
1417  // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1418  // to initiate writeback of any dirty cache lines of earlier writes by the
1419  // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1420  // writeback has completed.
1421  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1422  // Set SC bits to indicate system scope.
1423  .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1424  // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1425  // vmcnt(0)" needed by the "BUFFER_WBL2".
1426  Changed = true;
1427  break;
1428  case SIAtomicScope::AGENT:
1429  case SIAtomicScope::WORKGROUP:
1430  case SIAtomicScope::WAVEFRONT:
1431  case SIAtomicScope::SINGLETHREAD:
1432  // Same as GFX7.
1433  break;
1434  default:
1435  llvm_unreachable("Unsupported synchronization scope");
1436  }
1437  }
1438 
1439  if (Pos == Position::AFTER)
1440  --MI;
1441 
1442  Changed |=
1443  SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1444  IsCrossAddrSpaceOrdering, Pos);
1445 
1446  return Changed;
1447 }
1448 
1449 bool SIGfx940CacheControl::enableLoadCacheBypass(
1450  const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1451  SIAtomicAddrSpace AddrSpace) const {
1452  assert(MI->mayLoad() && !MI->mayStore());
1453  bool Changed = false;
1454 
1455  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1456  switch (Scope) {
1457  case SIAtomicScope::SYSTEM:
1458  // Set SC bits to indicate system scope.
1459  Changed |= enableSC0Bit(MI);
1460  Changed |= enableSC1Bit(MI);
1461  break;
1462  case SIAtomicScope::AGENT:
1463  // Set SC bits to indicate agent scope.
1464  Changed |= enableSC1Bit(MI);
1465  break;
1466  case SIAtomicScope::WORKGROUP:
1467  // In threadgroup split mode the waves of a work-group can be executing on
1468  // different CUs. Therefore need to bypass the L1 which is per CU.
1469  // Otherwise in non-threadgroup split mode all waves of a work-group are
1470  // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1471  // bits to indicate work-group scope will do this automatically.
1472  Changed |= enableSC0Bit(MI);
1473  break;
1474  case SIAtomicScope::WAVEFRONT:
1475  case SIAtomicScope::SINGLETHREAD:
1476  // Leave SC bits unset to indicate wavefront scope.
1477  break;
1478  default:
1479  llvm_unreachable("Unsupported synchronization scope");
1480  }
1481  }
1482 
1483  /// The scratch address space does not need the global memory caches
1484  /// to be bypassed as all memory operations by the same thread are
1485  /// sequentially consistent, and no other thread can access scratch
1486  /// memory.
1487 
1488  /// Other address spaces do not have a cache.
1489 
1490  return Changed;
1491 }
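// Resulting SC0/SC1 encoding for loads, as implied by the switch above:
//   system     -> sc0 sc1
//   agent      -> sc1
//   work-group -> sc0
//   wavefront / single thread -> neither bit set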
1492 
1493 bool SIGfx940CacheControl::enableStoreCacheBypass(
1494  const MachineBasicBlock::iterator &MI,
1495  SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1496  assert(!MI->mayLoad() && MI->mayStore());
1497  bool Changed = false;
1498 
1499  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1500  switch (Scope) {
1501  case SIAtomicScope::SYSTEM:
1502  // Set SC bits to indicate system scope.
1503  Changed |= enableSC0Bit(MI);
1504  Changed |= enableSC1Bit(MI);
1505  break;
1506  case SIAtomicScope::AGENT:
1507  // Set SC bits to indicate agent scope.
1508  Changed |= enableSC1Bit(MI);
1509  break;
1510  case SIAtomicScope::WORKGROUP:
1511  // Set SC bits to indicate workgroup scope.
1512  Changed |= enableSC0Bit(MI);
1513  break;
1514  case SIAtomicScope::WAVEFRONT:
1515  case SIAtomicScope::SINGLETHREAD:
1516  // Leave SC bits unset to indicate wavefront scope.
1517  break;
1518  default:
1519  llvm_unreachable("Unsupported synchronization scope");
1520  }
1521  }
1522 
1523  /// The scratch address space does not need the global memory caches
1524  /// to be bypassed as all memory operations by the same thread are
1525  /// sequentially consistent, and no other thread can access scratch
1526  /// memory.
1527 
1528  /// Other address spaces do not have a cache.
1529 
1530  return Changed;
1531 }
1532 
1533 bool SIGfx940CacheControl::enableRMWCacheBypass(
1534  const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1535  SIAtomicAddrSpace AddrSpace) const {
1536  assert(MI->mayLoad() && MI->mayStore());
1537  bool Changed = false;
1538 
1539  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1540  switch (Scope) {
1541  case SIAtomicScope::SYSTEM:
1542  // Set SC1 bit to indicate system scope.
1543  Changed |= enableSC1Bit(MI);
1544  break;
1545  case SIAtomicScope::AGENT:
1546  case SIAtomicScope::WORKGROUP:
1547  case SIAtomicScope::WAVEFRONT:
1548  case SIAtomicScope::SINGLETHREAD:
1549  // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1550  // to indicate system or agent scope. The SC0 bit is used to indicate if
1551  // they are return or no-return. Leave SC1 bit unset to indicate agent
1552  // scope.
1553  break;
1554  default:
1555  llvm_unreachable("Unsupported synchronization scope");
1556  }
1557  }
1558 
1559  return Changed;
1560 }
1561 
1562 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1563  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1564  bool IsVolatile, bool IsNonTemporal) const {
1565  // Only handle load and store, not atomic read-modify-write instructions. The
1566  // latter use glc to indicate if the atomic returns a result and so must not
1567  // be used for cache control.
1568  assert(MI->mayLoad() ^ MI->mayStore());
1569 
1570  // Only update load and store, not LLVM IR atomic read-modify-write
1571  // instructions. The latter are always marked as volatile, so they cannot
1572  // sensibly be handled here without pessimizing all atomics. They also do
1573  // not support the nontemporal attribute.
1574  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1575 
1576  bool Changed = false;
1577 
1578  if (IsVolatile) {
1579  // Set SC bits to indicate system scope.
1580  Changed |= enableSC0Bit(MI);
1581  Changed |= enableSC1Bit(MI);
1582 
1583  // Ensure operation has completed at system scope to cause all volatile
1584  // operations to be visible outside the program in a global order. Do not
1585  // request cross address space as only the global address space can be
1586  // observable outside the program, so no need to cause a waitcnt for LDS
1587  // address space operations.
1588  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1589  Position::AFTER);
1590 
1591  return Changed;
1592  }
1593 
1594  if (IsNonTemporal) {
1595  Changed |= enableNTBit(MI);
1596  return Changed;
1597  }
1598 
1599  return Changed;
1600 }
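// For illustration only: given the bits set above, a volatile global load on a
// gfx940-style target would come out roughly as
//   global_load_dword v0, v[0:1], off sc0 sc1
//   s_waitcnt vmcnt(0)
// while a nontemporal access would instead carry the nt bit. The exact
// mnemonics and operand spellings are assumptions about the assembler output,
// not something this function prints itself.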
1601 
1602 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1603  SIAtomicScope Scope,
1604  SIAtomicAddrSpace AddrSpace,
1605  Position Pos) const {
1606  if (!InsertCacheInv)
1607  return false;
1608 
1609  bool Changed = false;
1610 
1611  MachineBasicBlock &MBB = *MI->getParent();
1612  DebugLoc DL = MI->getDebugLoc();
1613 
1614  if (Pos == Position::AFTER)
1615  ++MI;
1616 
1617  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1618  switch (Scope) {
1619  case SIAtomicScope::SYSTEM:
1620  // Ensures that following loads will not see stale remote VMEM data or
1621  // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1622  // CC will never be stale due to the local memory probes.
1623  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1624  // Set SC bits to indicate system scope.
 1625  .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
 1626  // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1627  // hardware does not reorder memory operations by the same wave with
1628  // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1629  // remove any cache lines of earlier writes by the same wave and ensures
1630  // later reads by the same wave will refetch the cache lines.
1631  Changed = true;
1632  break;
1633  case SIAtomicScope::AGENT:
 1634  // Ensures that following loads will not see stale remote data or local
1635  // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1636  // due to the memory probes.
1637  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1638  // Set SC bits to indicate agent scope.
 1639  .addImm(AMDGPU::CPol::SC1);
 1640  // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
 1641  // does not reorder memory operations with respect to the preceding buffer
 1642  // invalidate. The invalidate is guaranteed to remove any cache lines of
 1643  // earlier writes and ensures later reads will refetch the cache lines.
1644  Changed = true;
1645  break;
1646  case SIAtomicScope::WORKGROUP:
1647  // In threadgroup split mode the waves of a work-group can be executing on
1648  // different CUs. Therefore need to invalidate the L1 which is per CU.
1649  // Otherwise in non-threadgroup split mode all waves of a work-group are
1650  // on the same CU, and so the L1 does not need to be invalidated.
1651  if (ST.isTgSplitEnabled()) {
1652  // Ensures L1 is invalidated if in threadgroup split mode. In
 1653  // non-threadgroup split mode it is a NOP, but there is no point generating
 1654  // it in that case if it is known not to be in that mode.
 1655  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
 1656  // Set SC bits to indicate work-group scope.
 1657  .addImm(AMDGPU::CPol::SC0);
 1658  // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
 1659  // does not reorder memory operations with respect to the preceding buffer
 1660  // invalidate. The invalidate is guaranteed to remove any cache lines of
 1661  // earlier writes and ensures later reads will refetch the cache lines.
1662  Changed = true;
1663  }
1664  break;
1665  case SIAtomicScope::WAVEFRONT:
1666  case SIAtomicScope::SINGLETHREAD:
1667  // Could generate "BUFFER_INV" but it would do nothing as there are no
1668  // caches to invalidate.
1669  break;
1670  default:
1671  llvm_unreachable("Unsupported synchronization scope");
1672  }
1673  }
1674 
1675  /// The scratch address space does not need the global memory cache
1676  /// to be flushed as all memory operations by the same thread are
1677  /// sequentially consistent, and no other thread can access scratch
1678  /// memory.
1679 
1680  /// Other address spaces do not have a cache.
1681 
1682  if (Pos == Position::AFTER)
1683  --MI;
1684 
1685  return Changed;
1686 }
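// For illustration only: the acquire sequences built above amount to roughly
//   buffer_inv sc0 sc1   ; system scope
//   buffer_inv sc1       ; agent scope
//   buffer_inv sc0       ; workgroup scope, only when tgsplit is enabled
// with no trailing s_waitcnt, for the reasons given in the comments above.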
1687 
1688 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1689  SIAtomicScope Scope,
1690  SIAtomicAddrSpace AddrSpace,
1691  bool IsCrossAddrSpaceOrdering,
1692  Position Pos) const {
1693  bool Changed = false;
1694 
1695  MachineBasicBlock &MBB = *MI->getParent();
1696  DebugLoc DL = MI->getDebugLoc();
1697 
1698  if (Pos == Position::AFTER)
1699  ++MI;
1700 
1701  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1702  switch (Scope) {
1703  case SIAtomicScope::SYSTEM:
1704  // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1705  // hardware does not reorder memory operations by the same wave with
1706  // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1707  // to initiate writeback of any dirty cache lines of earlier writes by the
1708  // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1709  // writeback has completed.
1710  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1711  // Set SC bits to indicate system scope.
 1712  .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
 1713  // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1714  // SIAtomicScope::SYSTEM, the following insertWait will generate the
1715  // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1716  Changed = true;
1717  break;
1718  case SIAtomicScope::AGENT:
1719  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1720  // Set SC bits to indicate agent scope.
 1721  .addImm(AMDGPU::CPol::SC1);
 1722 
1723  // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1724  // SIAtomicScope::AGENT, the following insertWait will generate the
1725  // required "S_WAITCNT vmcnt(0)".
1726  Changed = true;
1727  break;
1728  case SIAtomicScope::WORKGROUP:
1729  case SIAtomicScope::WAVEFRONT:
1730  case SIAtomicScope::SINGLETHREAD:
1731  // Do not generate "BUFFER_WBL2" as there are no caches it would
1732  // writeback, and would require an otherwise unnecessary
1733  // "S_WAITCNT vmcnt(0)".
1734  break;
1735  default:
1736  llvm_unreachable("Unsupported synchronization scope");
1737  }
1738  }
1739 
1740  if (Pos == Position::AFTER)
1741  --MI;
1742 
 1743  // Insert the S_WAITCNT required by any "BUFFER_WBL2" above, as well as any
 1744  // other S_WAITCNT that is needed.
1745  Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1746  IsCrossAddrSpaceOrdering, Pos);
1747 
1748  return Changed;
1749 }
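// For illustration only: an agent-scope release built above is roughly
//   buffer_wbl2 sc1
//   s_waitcnt vmcnt(0)
// where the s_waitcnt comes from the insertWait call at the end of this
// function, not from the BUFFER_WBL2 itself.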
1750 
1751 bool SIGfx10CacheControl::enableLoadCacheBypass(
 1752  const MachineBasicBlock::iterator &MI,
 1753  SIAtomicScope Scope,
1754  SIAtomicAddrSpace AddrSpace) const {
1755  assert(MI->mayLoad() && !MI->mayStore());
1756  bool Changed = false;
1757 
1758  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1759  switch (Scope) {
1760  case SIAtomicScope::SYSTEM:
1761  case SIAtomicScope::AGENT:
1762  // Set the L0 and L1 cache policies to MISS_EVICT.
1763  // Note: there is no L2 cache coherent bypass control at the ISA level.
1764  Changed |= enableGLCBit(MI);
1765  Changed |= enableDLCBit(MI);
1766  break;
1767  case SIAtomicScope::WORKGROUP:
1768  // In WGP mode the waves of a work-group can be executing on either CU of
1769  // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1770  // CU mode all waves of a work-group are on the same CU, and so the L0
1771  // does not need to be bypassed.
1772  if (!ST.isCuModeEnabled())
1773  Changed |= enableGLCBit(MI);
1774  break;
1775  case SIAtomicScope::WAVEFRONT:
1776  case SIAtomicScope::SINGLETHREAD:
1777  // No cache to bypass.
1778  break;
1779  default:
1780  llvm_unreachable("Unsupported synchronization scope");
1781  }
1782  }
1783 
1784  /// The scratch address space does not need the global memory caches
1785  /// to be bypassed as all memory operations by the same thread are
1786  /// sequentially consistent, and no other thread can access scratch
1787  /// memory.
1788 
1789  /// Other address spaces do not have a cache.
1790 
1791  return Changed;
1792 }
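// For illustration only: an agent- or system-scope atomic load on gfx10 would
// end up with both bits set, e.g. roughly
//   global_load_dword v0, v[0:1], off glc dlc
// whereas a workgroup-scope load sets only glc, and only in WGP mode.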
1793 
1794 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1795  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1796  bool IsVolatile, bool IsNonTemporal) const {
1797 
 1798  // Only handle load and store, not atomic read-modify-write instructions. The
1799  // latter use glc to indicate if the atomic returns a result and so must not
1800  // be used for cache control.
1801  assert(MI->mayLoad() ^ MI->mayStore());
1802 
 1803  // Only update load and store, not LLVM IR atomic read-modify-write
 1804  // instructions. The latter are always marked as volatile, so volatile cannot
 1805  // be handled sensibly for them without pessimizing all atomics. They also do
 1806  // not support the nontemporal attribute.
 1807  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
 1808 
1809  bool Changed = false;
1810 
1811  if (IsVolatile) {
1812  // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1813  // and MISS_LRU for store instructions.
1814  // Note: there is no L2 cache coherent bypass control at the ISA level.
1815  if (Op == SIMemOp::LOAD) {
1816  Changed |= enableGLCBit(MI);
1817  Changed |= enableDLCBit(MI);
1818  }
1819 
1820  // Ensure operation has completed at system scope to cause all volatile
1821  // operations to be visible outside the program in a global order. Do not
1822  // request cross address space as only the global address space can be
1823  // observable outside the program, so no need to cause a waitcnt for LDS
1824  // address space operations.
1825  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1826  Position::AFTER);
1827  return Changed;
1828  }
1829 
1830  if (IsNonTemporal) {
1831  // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1832  // and L2 cache policy to STREAM.
1833  // For stores setting both GLC and SLC configures L0 and L1 cache policy
1834  // to MISS_EVICT and the L2 cache policy to STREAM.
1835  if (Op == SIMemOp::STORE)
1836  Changed |= enableGLCBit(MI);
1837  Changed |= enableSLCBit(MI);
1838 
1839  return Changed;
1840  }
1841 
1842  return Changed;
1843 }
1844 
1845 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1846  SIAtomicScope Scope,
1847  SIAtomicAddrSpace AddrSpace,
1848  SIMemOp Op,
1849  bool IsCrossAddrSpaceOrdering,
1850  Position Pos) const {
1851  bool Changed = false;
1852 
1853  MachineBasicBlock &MBB = *MI->getParent();
1854  DebugLoc DL = MI->getDebugLoc();
1855 
1856  if (Pos == Position::AFTER)
1857  ++MI;
1858 
1859  bool VMCnt = false;
1860  bool VSCnt = false;
1861  bool LGKMCnt = false;
1862 
1863  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
 1864  SIAtomicAddrSpace::NONE) {
 1865  switch (Scope) {
1866  case SIAtomicScope::SYSTEM:
1867  case SIAtomicScope::AGENT:
1868  if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1869  VMCnt |= true;
1870  if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1871  VSCnt |= true;
1872  break;
1873  case SIAtomicScope::WORKGROUP:
1874  // In WGP mode the waves of a work-group can be executing on either CU of
1875  // the WGP. Therefore need to wait for operations to complete to ensure
1876  // they are visible to waves in the other CU as the L0 is per CU.
 1877  // Otherwise, in CU mode, all waves of a work-group are on the same CU
 1878  // which shares the same L0.
1879  if (!ST.isCuModeEnabled()) {
1880  if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1881  VMCnt |= true;
1882  if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1883  VSCnt |= true;
1884  }
1885  break;
1886  case SIAtomicScope::WAVEFRONT:
1887  case SIAtomicScope::SINGLETHREAD:
1888  // The L0 cache keeps all memory operations in order for
1889  // work-items in the same wavefront.
1890  break;
1891  default:
1892  llvm_unreachable("Unsupported synchronization scope");
1893  }
1894  }
1895 
1896  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1897  switch (Scope) {
1898  case SIAtomicScope::SYSTEM:
1899  case SIAtomicScope::AGENT:
1900  case SIAtomicScope::WORKGROUP:
1901  // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1902  // not needed as LDS operations for all waves are executed in a total
1903  // global ordering as observed by all waves. Required if also
1904  // synchronizing with global/GDS memory as LDS operations could be
1905  // reordered with respect to later global/GDS memory operations of the
1906  // same wave.
1907  LGKMCnt |= IsCrossAddrSpaceOrdering;
1908  break;
1909  case SIAtomicScope::WAVEFRONT:
1910  case SIAtomicScope::SINGLETHREAD:
1911  // The LDS keeps all memory operations in order for
1912  // the same wavefront.
1913  break;
1914  default:
1915  llvm_unreachable("Unsupported synchronization scope");
1916  }
1917  }
1918 
1919  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1920  switch (Scope) {
1921  case SIAtomicScope::SYSTEM:
1922  case SIAtomicScope::AGENT:
 1923  // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1924  // is not needed as GDS operations for all waves are executed in a total
1925  // global ordering as observed by all waves. Required if also
1926  // synchronizing with global/LDS memory as GDS operations could be
1927  // reordered with respect to later global/LDS memory operations of the
1928  // same wave.
1929  LGKMCnt |= IsCrossAddrSpaceOrdering;
1930  break;
1931  case SIAtomicScope::WORKGROUP:
1932  case SIAtomicScope::WAVEFRONT:
1933  case SIAtomicScope::SINGLETHREAD:
1934  // The GDS keeps all memory operations in order for
1935  // the same work-group.
1936  break;
1937  default:
1938  llvm_unreachable("Unsupported synchronization scope");
1939  }
1940  }
1941 
1942  if (VMCnt || LGKMCnt) {
 1943  unsigned WaitCntImmediate =
 1944  AMDGPU::encodeWaitcnt(IV,
 1945  VMCnt ? 0 : getVmcntBitMask(IV),
 1946  getExpcntBitMask(IV),
 1947  LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1948  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1949  Changed = true;
1950  }
1951 
1952  if (VSCnt) {
1953  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1954  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1955  .addImm(0);
1956  Changed = true;
1957  }
1958 
1959  if (Pos == Position::AFTER)
1960  --MI;
1961 
1962  return Changed;
1963 }
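// For illustration only: gfx10 splits the VMEM counters, so a wait covering
// both loads and stores at agent scope is emitted here roughly as
//   s_waitcnt vmcnt(0)
//   s_waitcnt_vscnt null, 0
// with lgkmcnt(0) folded into the first instruction when LDS/GDS ordering is
// also required.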
1964 
1965 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1966  SIAtomicScope Scope,
1967  SIAtomicAddrSpace AddrSpace,
1968  Position Pos) const {
1969  if (!InsertCacheInv)
1970  return false;
1971 
1972  bool Changed = false;
1973 
1974  MachineBasicBlock &MBB = *MI->getParent();
1975  DebugLoc DL = MI->getDebugLoc();
1976 
1977  if (Pos == Position::AFTER)
1978  ++MI;
1979 
1980  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1981  switch (Scope) {
1982  case SIAtomicScope::SYSTEM:
1983  case SIAtomicScope::AGENT:
1984  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1985  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1986  Changed = true;
1987  break;
1988  case SIAtomicScope::WORKGROUP:
1989  // In WGP mode the waves of a work-group can be executing on either CU of
1990  // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
 1991  // in CU mode all waves of a work-group are on the same CU, and so the
1992  // L0 does not need to be invalidated.
1993  if (!ST.isCuModeEnabled()) {
1994  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1995  Changed = true;
1996  }
1997  break;
1998  case SIAtomicScope::WAVEFRONT:
1999  case SIAtomicScope::SINGLETHREAD:
2000  // No cache to invalidate.
2001  break;
2002  default:
2003  llvm_unreachable("Unsupported synchronization scope");
2004  }
2005  }
2006 
2007  /// The scratch address space does not need the global memory cache
2008  /// to be flushed as all memory operations by the same thread are
2009  /// sequentially consistent, and no other thread can access scratch
2010  /// memory.
2011 
2012  /// Other address spaces do not have a cache.
2013 
2014  if (Pos == Position::AFTER)
2015  --MI;
2016 
2017  return Changed;
2018 }
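// For illustration only: an agent- or system-scope acquire on gfx10 inserts
// roughly
//   buffer_gl0_inv
//   buffer_gl1_inv
// while a workgroup-scope acquire in WGP mode needs only buffer_gl0_inv.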
2019 
2020 bool SIGfx11CacheControl::enableLoadCacheBypass(
2021  const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2022  SIAtomicAddrSpace AddrSpace) const {
2023  assert(MI->mayLoad() && !MI->mayStore());
2024  bool Changed = false;
2025 
2026  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2027  switch (Scope) {
2028  case SIAtomicScope::SYSTEM:
2029  case SIAtomicScope::AGENT:
2030  // Set the L0 and L1 cache policies to MISS_EVICT.
2031  // Note: there is no L2 cache coherent bypass control at the ISA level.
2032  Changed |= enableGLCBit(MI);
2033  break;
2034  case SIAtomicScope::WORKGROUP:
2035  // In WGP mode the waves of a work-group can be executing on either CU of
2036  // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2037  // CU mode all waves of a work-group are on the same CU, and so the L0
2038  // does not need to be bypassed.
2039  if (!ST.isCuModeEnabled())
2040  Changed |= enableGLCBit(MI);
2041  break;
2042  case SIAtomicScope::WAVEFRONT:
2043  case SIAtomicScope::SINGLETHREAD:
2044  // No cache to bypass.
2045  break;
2046  default:
2047  llvm_unreachable("Unsupported synchronization scope");
2048  }
2049  }
2050 
2051  /// The scratch address space does not need the global memory caches
2052  /// to be bypassed as all memory operations by the same thread are
2053  /// sequentially consistent, and no other thread can access scratch
2054  /// memory.
2055 
2056  /// Other address spaces do not have a cache.
2057 
2058  return Changed;
2059 }
2060 
2061 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2062  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2063  bool IsVolatile, bool IsNonTemporal) const {
2064 
 2065  // Only handle load and store, not atomic read-modify-write instructions. The
2066  // latter use glc to indicate if the atomic returns a result and so must not
2067  // be used for cache control.
2068  assert(MI->mayLoad() ^ MI->mayStore());
2069 
 2070  // Only update load and store, not LLVM IR atomic read-modify-write
 2071  // instructions. The latter are always marked as volatile, so volatile cannot
 2072  // be handled sensibly for them without pessimizing all atomics. They also do
 2073  // not support the nontemporal attribute.
 2074  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
 2075 
2076  bool Changed = false;
2077 
2078  if (IsVolatile) {
2079  // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2080  // and MISS_LRU for store instructions.
2081  // Note: there is no L2 cache coherent bypass control at the ISA level.
2082  if (Op == SIMemOp::LOAD)
2083  Changed |= enableGLCBit(MI);
2084 
2085  // Set MALL NOALLOC for load and store instructions.
2086  Changed |= enableDLCBit(MI);
2087 
2088  // Ensure operation has completed at system scope to cause all volatile
2089  // operations to be visible outside the program in a global order. Do not
2090  // request cross address space as only the global address space can be
2091  // observable outside the program, so no need to cause a waitcnt for LDS
2092  // address space operations.
2093  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2094  Position::AFTER);
2095  return Changed;
2096  }
2097 
2098  if (IsNonTemporal) {
2099  // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2100  // and L2 cache policy to STREAM.
2101  // For stores setting both GLC and SLC configures L0 and L1 cache policy
2102  // to MISS_EVICT and the L2 cache policy to STREAM.
2103  if (Op == SIMemOp::STORE)
2104  Changed |= enableGLCBit(MI);
2105  Changed |= enableSLCBit(MI);
2106 
2107  // Set MALL NOALLOC for load and store instructions.
2108  Changed |= enableDLCBit(MI);
2109  return Changed;
2110  }
2111 
2112  return Changed;
2113 }
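// For illustration only: on gfx11 the dlc bit selects MALL NOALLOC, so a
// volatile global load handled above becomes roughly
//   global_load_b32 v0, v[0:1], off glc dlc
//   s_waitcnt vmcnt(0)
// and a nontemporal store roughly carries glc, slc and dlc together.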
2114 
2115 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2116  if (AtomicPseudoMIs.empty())
2117  return false;
2118 
2119  for (auto &MI : AtomicPseudoMIs)
2120  MI->eraseFromParent();
2121 
2122  AtomicPseudoMIs.clear();
2123  return true;
2124 }
2125 
2126 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
 2127  MachineBasicBlock::iterator &MI) {
 2128  assert(MI->mayLoad() && !MI->mayStore());
2129 
2130  bool Changed = false;
2131 
2132  if (MOI.isAtomic()) {
2133  if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2134  MOI.getOrdering() == AtomicOrdering::Acquire ||
2135  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2136  Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2137  MOI.getOrderingAddrSpace());
2138  }
2139 
2140  if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2141  Changed |= CC->insertWait(MI, MOI.getScope(),
2142  MOI.getOrderingAddrSpace(),
 2143  SIMemOp::LOAD | SIMemOp::STORE,
 2144  MOI.getIsCrossAddressSpaceOrdering(),
2145  Position::BEFORE);
2146 
2147  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2148  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2149  Changed |= CC->insertWait(MI, MOI.getScope(),
2150  MOI.getInstrAddrSpace(),
2151  SIMemOp::LOAD,
2152  MOI.getIsCrossAddressSpaceOrdering(),
2153  Position::AFTER);
2154  Changed |= CC->insertAcquire(MI, MOI.getScope(),
2155  MOI.getOrderingAddrSpace(),
2156  Position::AFTER);
2157  }
2158 
2159  return Changed;
2160  }
2161 
2162  // Atomic instructions already bypass caches to the scope specified by the
2163  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2164  // need additional treatment.
2165  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2166  SIMemOp::LOAD, MOI.isVolatile(),
2167  MOI.isNonTemporal());
2168  return Changed;
2169 }
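// For illustration only: an IR load such as
//   %v = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire, align 4
// takes the atomic path above: cache bypass is enabled for agent scope, a wait
// for the load is inserted after it, and insertAcquire then adds the required
// invalidate.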
2170 
2171 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
 2172  MachineBasicBlock::iterator &MI) {
 2173  assert(!MI->mayLoad() && MI->mayStore());
2174 
2175  bool Changed = false;
2176 
2177  if (MOI.isAtomic()) {
2178  if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2179  MOI.getOrdering() == AtomicOrdering::Release ||
2180  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2181  Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2182  MOI.getOrderingAddrSpace());
2183  }
2184 
2185  if (MOI.getOrdering() == AtomicOrdering::Release ||
2186  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2187  Changed |= CC->insertRelease(MI, MOI.getScope(),
2188  MOI.getOrderingAddrSpace(),
2189  MOI.getIsCrossAddressSpaceOrdering(),
2190  Position::BEFORE);
2191 
2192  return Changed;
2193  }
2194 
2195  // Atomic instructions already bypass caches to the scope specified by the
2196  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2197  // need additional treatment.
2198  Changed |= CC->enableVolatileAndOrNonTemporal(
2199  MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2200  MOI.isNonTemporal());
2201  return Changed;
2202 }
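// For illustration only: an IR store such as
//   store atomic i32 %v, ptr addrspace(1) %p syncscope("agent") release, align 4
// takes the atomic path above: any store cache bypass is enabled and the
// release (wait plus writeback, where the target has one) is inserted before
// the store itself.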
2203 
2204 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
 2205  MachineBasicBlock::iterator &MI) {
 2206  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2207 
2208  AtomicPseudoMIs.push_back(MI);
2209  bool Changed = false;
2210 
2211  if (MOI.isAtomic()) {
2212  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2213  MOI.getOrdering() == AtomicOrdering::Release ||
2214  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2215  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2216  /// TODO: This relies on a barrier always generating a waitcnt
2217  /// for LDS to ensure it is not reordered with the completion of
 2218  /// the preceding LDS operations. If the barrier had a memory
 2219  /// ordering and memory scope, then the library would not need to
 2220  /// generate a fence. Support for barrier could be added in this
 2221  /// file. SIInsertWaitcnt.cpp could then stop unconditionally
2222  /// adding S_WAITCNT before a S_BARRIER.
2223  Changed |= CC->insertRelease(MI, MOI.getScope(),
2224  MOI.getOrderingAddrSpace(),
2225  MOI.getIsCrossAddressSpaceOrdering(),
2226  Position::BEFORE);
2227 
2228  // TODO: If both release and invalidate are happening they could be combined
2229  // to use the single "BUFFER_WBINV*" instruction. This could be done by
2230  // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2231  // track cache invalidate and write back instructions.
2232 
2233  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2234  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2235  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2236  Changed |= CC->insertAcquire(MI, MOI.getScope(),
2237  MOI.getOrderingAddrSpace(),
2238  Position::BEFORE);
2239 
2240  return Changed;
2241  }
2242 
2243  return Changed;
2244 }
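// For illustration only: a fence such as
//   fence syncscope("agent") acq_rel
// reaching this function on a gfx940-style target expands, via the hooks
// above, to roughly
//   buffer_wbl2 sc1      ; from insertRelease
//   s_waitcnt vmcnt(0)
//   buffer_inv sc1       ; from insertAcquire
// while the ATOMIC_FENCE pseudo itself is queued for deletion in
// removeAtomicPseudoMIs().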
2245 
2246 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
 2247  MachineBasicBlock::iterator &MI) {
 2248  assert(MI->mayLoad() && MI->mayStore());
2249 
2250  bool Changed = false;
2251 
2252  if (MOI.isAtomic()) {
2253  if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2254  MOI.getOrdering() == AtomicOrdering::Acquire ||
2255  MOI.getOrdering() == AtomicOrdering::Release ||
2256  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2257  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2258  Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2259  MOI.getInstrAddrSpace());
2260  }
2261 
2262  if (MOI.getOrdering() == AtomicOrdering::Release ||
2263  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2264  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2265  MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2266  Changed |= CC->insertRelease(MI, MOI.getScope(),
2267  MOI.getOrderingAddrSpace(),
2268  MOI.getIsCrossAddressSpaceOrdering(),
2269  Position::BEFORE);
2270 
2271  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2272  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2273  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2274  MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2275  MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2276  Changed |= CC->insertWait(MI, MOI.getScope(),
2277  MOI.getInstrAddrSpace(),
2278  isAtomicRet(*MI) ? SIMemOp::LOAD :
 2279  SIMemOp::STORE,
 2280  MOI.getIsCrossAddressSpaceOrdering(),
2281  Position::AFTER);
2282  Changed |= CC->insertAcquire(MI, MOI.getScope(),
2283  MOI.getOrderingAddrSpace(),
2284  Position::AFTER);
2285  }
2286 
2287  return Changed;
2288  }
2289 
2290  return Changed;
2291 }
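// For illustration only: an IR operation such as
//   %old = atomicrmw add ptr addrspace(1) %p, i32 1 syncscope("agent") seq_cst, align 4
// takes all three branches above: RMW cache bypass for agent scope, a release
// before the instruction, and a wait plus acquire invalidate after it, with
// isAtomicRet() choosing whether the wait covers the load or the store side.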
2292 
2293 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2294  bool Changed = false;
2295 
2296  SIMemOpAccess MOA(MF);
2297  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2298 
2299  for (auto &MBB : MF) {
2300  for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2301 
2302  // Unbundle instructions after the post-RA scheduler.
2303  if (MI->isBundle() && MI->mayLoadOrStore()) {
2304  MachineBasicBlock::instr_iterator II(MI->getIterator());
 2305  for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
 2306  I != E && I->isBundledWithPred(); ++I) {
2307  I->unbundleFromPred();
2308  for (MachineOperand &MO : I->operands())
2309  if (MO.isReg())
2310  MO.setIsInternalRead(false);
2311  }
2312 
2313  MI->eraseFromParent();
2314  MI = II->getIterator();
2315  }
2316 
2317  if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2318  continue;
2319 
2320  if (const auto &MOI = MOA.getLoadInfo(MI))
2321  Changed |= expandLoad(*MOI, MI);
2322  else if (const auto &MOI = MOA.getStoreInfo(MI))
2323  Changed |= expandStore(*MOI, MI);
2324  else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2325  Changed |= expandAtomicFence(*MOI, MI);
2326  else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2327  Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2328  }
2329  }
2330 
2331  Changed |= removeAtomicPseudoMIs();
2332  return Changed;
2333 }
2334 
2335 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2336 
2337 char SIMemoryLegalizer::ID = 0;
2338 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2339 
 2340  FunctionPass *llvm::createSIMemoryLegalizerPass() {
 2341  return new SIMemoryLegalizer();
2342 }
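// Usage sketch (an assumption about the surrounding pipeline, not defined in
// this file): the pass is created through the factory above, e.g.
//   addPass(createSIMemoryLegalizerPass());
// from the AMDGPU target's pass configuration.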