SIMemoryLegalizer.cpp
1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
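// Overview example (an informal sketch, not normative; the exact expansion
// depends on the subtarget): for an IR instruction such as
//   %v = load atomic i32, i32 addrspace(1)* %p syncscope("agent") acquire, align 4
// this pass leaves the load in place but marks it to bypass the per-CU caches
// (e.g. the glc bit on GFX6-GFX9) and inserts a wait (s_waitcnt vmcnt(0))
// followed by a cache invalidate (e.g. buffer_wbinvl1_vol on GFX7) after it,
// as required by the memory model referenced above.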
 16 #include "AMDGPU.h"
 17 #include "AMDGPUMachineModuleInfo.h"
 18 #include "GCNSubtarget.h"
 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 20 #include "llvm/ADT/BitmaskEnum.h"
 21 #include "llvm/CodeGen/MachineBasicBlock.h"
 22 #include "llvm/IR/DiagnosticInfo.h"
 23 #include "llvm/Support/AtomicOrdering.h"
 24 #include "llvm/Support/TargetParser.h"
 25 
26 using namespace llvm;
27 using namespace llvm::AMDGPU;
28 
29 #define DEBUG_TYPE "si-memory-legalizer"
30 #define PASS_NAME "SI Memory Legalizer"
31 
 32 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
 33  "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
 34  cl::desc("Use this to skip inserting cache invalidating instructions."));
35 
36 namespace {
37 
 38 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
 39 
40 /// Memory operation flags. Can be ORed together.
41 enum class SIMemOp {
42  NONE = 0u,
43  LOAD = 1u << 0,
44  STORE = 1u << 1,
45  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
46 };
47 
48 /// Position to insert a new instruction relative to an existing
49 /// instruction.
50 enum class Position {
51  BEFORE,
52  AFTER
53 };
54 
55 /// The atomic synchronization scopes supported by the AMDGPU target.
56 enum class SIAtomicScope {
57  NONE,
58  SINGLETHREAD,
59  WAVEFRONT,
60  WORKGROUP,
61  AGENT,
62  SYSTEM
63 };
64 
65 /// The distinct address spaces supported by the AMDGPU target for
 66 /// atomic memory operations. Can be ORed together.
67 enum class SIAtomicAddrSpace {
68  NONE = 0u,
69  GLOBAL = 1u << 0,
70  LDS = 1u << 1,
71  SCRATCH = 1u << 2,
72  GDS = 1u << 3,
73  OTHER = 1u << 4,
74 
75  /// The address spaces that can be accessed by a FLAT instruction.
76  FLAT = GLOBAL | LDS | SCRATCH,
77 
78  /// The address spaces that support atomic instructions.
79  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
80 
81  /// All address spaces.
82  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
83 
84  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
85 };
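// Example of the bitmask queries these enums enable throughout this file
// (illustrative only):
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS;
//   bool HasGlobal =
//       (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE; // true
// LLVM_MARK_AS_BITMASK_ENUM is what makes |, & and ~ well-formed on these
// scoped enums.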
86 
87 class SIMemOpInfo final {
88 private:
89 
90  friend class SIMemOpAccess;
91 
 92  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
 93  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
94  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
95  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
96  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
97  bool IsCrossAddressSpaceOrdering = false;
98  bool IsVolatile = false;
99  bool IsNonTemporal = false;
100 
 101  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
 102  SIAtomicScope Scope = SIAtomicScope::SYSTEM,
103  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
104  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
105  bool IsCrossAddressSpaceOrdering = true,
106  AtomicOrdering FailureOrdering =
 107  AtomicOrdering::SequentiallyConsistent,
 108  bool IsVolatile = false,
109  bool IsNonTemporal = false)
110  : Ordering(Ordering), FailureOrdering(FailureOrdering),
111  Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
112  InstrAddrSpace(InstrAddrSpace),
113  IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
 114  IsVolatile(IsVolatile),
 115  IsNonTemporal(IsNonTemporal) {
116 
117  if (Ordering == AtomicOrdering::NotAtomic) {
 118  assert(Scope == SIAtomicScope::NONE &&
 119  OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
120  !IsCrossAddressSpaceOrdering &&
121  FailureOrdering == AtomicOrdering::NotAtomic);
122  return;
123  }
124 
 125  assert(Scope != SIAtomicScope::NONE &&
 126  (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
 127  SIAtomicAddrSpace::NONE &&
 128  (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
 129  SIAtomicAddrSpace::NONE);
 130 
131  // There is also no cross address space ordering if the ordering
132  // address space is the same as the instruction address space and
133  // only contains a single address space.
134  if ((OrderingAddrSpace == InstrAddrSpace) &&
135  isPowerOf2_32(uint32_t(InstrAddrSpace)))
136  this->IsCrossAddressSpaceOrdering = false;
137 
138  // Limit the scope to the maximum supported by the instruction's address
139  // spaces.
140  if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
 141  SIAtomicAddrSpace::NONE) {
 142  this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
143  } else if ((InstrAddrSpace &
144  ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
 145  SIAtomicAddrSpace::NONE) {
 146  this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
147  } else if ((InstrAddrSpace &
148  ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
149  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
150  this->Scope = std::min(Scope, SIAtomicScope::AGENT);
151  }
152  }
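
  // Worked example (illustrative): for an LDS-only atomic, InstrAddrSpace is
  // exactly SIAtomicAddrSpace::LDS, so the first test above
  // ((InstrAddrSpace & ~SCRATCH) == NONE) fails and the second
  // ((InstrAddrSpace & ~(SCRATCH | LDS)) == NONE) matches; a requested SYSTEM
  // scope is then clamped to std::min(SYSTEM, WORKGROUP) == WORKGROUP, since
  // LDS is only shared within a work-group.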
153 
154 public:
155  /// \returns Atomic synchronization scope of the machine instruction used to
156  /// create this SIMemOpInfo.
157  SIAtomicScope getScope() const {
158  return Scope;
159  }
160 
161  /// \returns Ordering constraint of the machine instruction used to
162  /// create this SIMemOpInfo.
163  AtomicOrdering getOrdering() const {
164  return Ordering;
165  }
166 
167  /// \returns Failure ordering constraint of the machine instruction used to
168  /// create this SIMemOpInfo.
169  AtomicOrdering getFailureOrdering() const {
170  return FailureOrdering;
171  }
172 
 173  /// \returns The address spaces accessed by the machine
 174  /// instruction used to create this SIMemOpInfo.
175  SIAtomicAddrSpace getInstrAddrSpace() const {
176  return InstrAddrSpace;
177  }
178 
179  /// \returns The address spaces that must be ordered by the machine
 180  /// instruction used to create this SIMemOpInfo.
181  SIAtomicAddrSpace getOrderingAddrSpace() const {
182  return OrderingAddrSpace;
183  }
184 
 185  /// \returns True iff memory ordering of operations on
186  /// different address spaces is required.
187  bool getIsCrossAddressSpaceOrdering() const {
188  return IsCrossAddressSpaceOrdering;
189  }
190 
191  /// \returns True if memory access of the machine instruction used to
192  /// create this SIMemOpInfo is volatile, false otherwise.
193  bool isVolatile() const {
194  return IsVolatile;
195  }
196 
197  /// \returns True if memory access of the machine instruction used to
198  /// create this SIMemOpInfo is nontemporal, false otherwise.
199  bool isNonTemporal() const {
200  return IsNonTemporal;
201  }
202 
203  /// \returns True if ordering constraint of the machine instruction used to
204  /// create this SIMemOpInfo is unordered or higher, false otherwise.
205  bool isAtomic() const {
206  return Ordering != AtomicOrdering::NotAtomic;
207  }
208 
209 };
210 
211 class SIMemOpAccess final {
212 private:
213  AMDGPUMachineModuleInfo *MMI = nullptr;
214 
215  /// Reports unsupported message \p Msg for \p MI to LLVM context.
216  void reportUnsupported(const MachineBasicBlock::iterator &MI,
217  const char *Msg) const;
218 
219  /// Inspects the target synchronization scope \p SSID and determines
220  /// the SI atomic scope it corresponds to, the address spaces it
221  /// covers, and whether the memory ordering applies between address
222  /// spaces.
 223  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
 224  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
225 
226  /// \return Return a bit set of the address spaces accessed by \p AS.
227  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
228 
 229  /// \returns Info constructed from \p MI, which has at least one machine memory
230  /// operand.
231  Optional<SIMemOpInfo> constructFromMIWithMMO(
232  const MachineBasicBlock::iterator &MI) const;
233 
234 public:
235  /// Construct class to support accessing the machine memory operands
236  /// of instructions in the machine function \p MF.
237  SIMemOpAccess(MachineFunction &MF);
238 
239  /// \returns Load info if \p MI is a load operation, "None" otherwise.
 240  Optional<SIMemOpInfo> getLoadInfo(
 241  const MachineBasicBlock::iterator &MI) const;
242 
243  /// \returns Store info if \p MI is a store operation, "None" otherwise.
244  Optional<SIMemOpInfo> getStoreInfo(
245  const MachineBasicBlock::iterator &MI) const;
246 
247  /// \returns Atomic fence info if \p MI is an atomic fence operation,
248  /// "None" otherwise.
249  Optional<SIMemOpInfo> getAtomicFenceInfo(
250  const MachineBasicBlock::iterator &MI) const;
251 
252  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
253  /// rmw operation, "None" otherwise.
254  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
255  const MachineBasicBlock::iterator &MI) const;
256 };
257 
258 class SICacheControl {
259 protected:
260 
261  /// AMDGPU subtarget info.
262  const GCNSubtarget &ST;
263 
264  /// Instruction info.
265  const SIInstrInfo *TII = nullptr;
266 
267  IsaVersion IV;
268 
269  /// Whether to insert cache invalidating instructions.
270  bool InsertCacheInv;
271 
272  SICacheControl(const GCNSubtarget &ST);
273 
 274  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
275  /// \returns Returns true if \p MI is modified, false otherwise.
276  bool enableNamedBit(const MachineBasicBlock::iterator MI,
277  AMDGPU::CPol::CPol Bit) const;
278 
279 public:
280 
281  /// Create a cache control for the subtarget \p ST.
282  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
283 
284  /// Update \p MI memory load instruction to bypass any caches up to
285  /// the \p Scope memory scope for address spaces \p
286  /// AddrSpace. Return true iff the instruction was modified.
287  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
288  SIAtomicScope Scope,
289  SIAtomicAddrSpace AddrSpace) const = 0;
290 
291  /// Update \p MI memory store instruction to bypass any caches up to
292  /// the \p Scope memory scope for address spaces \p
293  /// AddrSpace. Return true iff the instruction was modified.
294  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
295  SIAtomicScope Scope,
296  SIAtomicAddrSpace AddrSpace) const = 0;
297 
298  /// Update \p MI memory read-modify-write instruction to bypass any caches up
299  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
300  /// iff the instruction was modified.
301  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
302  SIAtomicScope Scope,
303  SIAtomicAddrSpace AddrSpace) const = 0;
304 
305  /// Update \p MI memory instruction of kind \p Op associated with address
306  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
307  /// true iff the instruction was modified.
308  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
309  SIAtomicAddrSpace AddrSpace,
310  SIMemOp Op, bool IsVolatile,
311  bool IsNonTemporal) const = 0;
312 
313  /// Inserts any necessary instructions at position \p Pos relative
314  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
315  /// \p Op associated with address spaces \p AddrSpace have completed. Used
316  /// between memory instructions to enforce the order they become visible as
317  /// observed by other memory instructions executing in memory scope \p Scope.
318  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
319  /// address spaces. Returns true iff any instructions inserted.
320  virtual bool insertWait(MachineBasicBlock::iterator &MI,
321  SIAtomicScope Scope,
322  SIAtomicAddrSpace AddrSpace,
323  SIMemOp Op,
324  bool IsCrossAddrSpaceOrdering,
325  Position Pos) const = 0;
326 
327  /// Inserts any necessary instructions at position \p Pos relative to
328  /// instruction \p MI to ensure any subsequent memory instructions of this
329  /// thread with address spaces \p AddrSpace will observe the previous memory
 330  /// operations by any thread for memory scopes up to memory scope \p Scope.
331  /// Returns true iff any instructions inserted.
332  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
333  SIAtomicScope Scope,
334  SIAtomicAddrSpace AddrSpace,
335  Position Pos) const = 0;
336 
337  /// Inserts any necessary instructions at position \p Pos relative to
338  /// instruction \p MI to ensure previous memory instructions by this thread
339  /// with address spaces \p AddrSpace have completed and can be observed by
340  /// subsequent memory instructions by any thread executing in memory scope \p
341  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
342  /// between address spaces. Returns true iff any instructions inserted.
343  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
344  SIAtomicScope Scope,
345  SIAtomicAddrSpace AddrSpace,
346  bool IsCrossAddrSpaceOrdering,
347  Position Pos) const = 0;
348 
349  /// Virtual destructor to allow derivations to be deleted.
350  virtual ~SICacheControl() = default;
351 
352 };
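// How these hooks are combined (informal sketch; see SIMemoryLegalizer below):
// for an acquire atomic load the legalizer calls enableLoadCacheBypass on the
// load itself and then insertWait and insertAcquire with Position::AFTER; for
// a release store it calls insertRelease with Position::BEFORE so that prior
// writes become visible before the store itself is performed.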
353 
354 class SIGfx6CacheControl : public SICacheControl {
355 protected:
356 
357  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
358  /// is modified, false otherwise.
359  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
360  return enableNamedBit(MI, AMDGPU::CPol::GLC);
361  }
362 
363  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
364  /// is modified, false otherwise.
365  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
366  return enableNamedBit(MI, AMDGPU::CPol::SLC);
367  }
368 
369 public:
370 
371  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
372 
373  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
374  SIAtomicScope Scope,
375  SIAtomicAddrSpace AddrSpace) const override;
376 
377  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
378  SIAtomicScope Scope,
379  SIAtomicAddrSpace AddrSpace) const override;
380 
381  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
382  SIAtomicScope Scope,
383  SIAtomicAddrSpace AddrSpace) const override;
384 
385  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
386  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
387  bool IsVolatile,
388  bool IsNonTemporal) const override;
389 
390  bool insertWait(MachineBasicBlock::iterator &MI,
391  SIAtomicScope Scope,
392  SIAtomicAddrSpace AddrSpace,
393  SIMemOp Op,
394  bool IsCrossAddrSpaceOrdering,
395  Position Pos) const override;
396 
397  bool insertAcquire(MachineBasicBlock::iterator &MI,
398  SIAtomicScope Scope,
399  SIAtomicAddrSpace AddrSpace,
400  Position Pos) const override;
401 
402  bool insertRelease(MachineBasicBlock::iterator &MI,
403  SIAtomicScope Scope,
404  SIAtomicAddrSpace AddrSpace,
405  bool IsCrossAddrSpaceOrdering,
406  Position Pos) const override;
407 };
408 
409 class SIGfx7CacheControl : public SIGfx6CacheControl {
410 public:
411 
412  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
413 
414  bool insertAcquire(MachineBasicBlock::iterator &MI,
415  SIAtomicScope Scope,
416  SIAtomicAddrSpace AddrSpace,
417  Position Pos) const override;
418 
419 };
420 
421 class SIGfx90ACacheControl : public SIGfx7CacheControl {
422 public:
423 
424  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
425 
426  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
427  SIAtomicScope Scope,
428  SIAtomicAddrSpace AddrSpace) const override;
429 
430  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
431  SIAtomicScope Scope,
432  SIAtomicAddrSpace AddrSpace) const override;
433 
434  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
435  SIAtomicScope Scope,
436  SIAtomicAddrSpace AddrSpace) const override;
437 
438  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
439  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
440  bool IsVolatile,
441  bool IsNonTemporal) const override;
442 
443  bool insertWait(MachineBasicBlock::iterator &MI,
444  SIAtomicScope Scope,
445  SIAtomicAddrSpace AddrSpace,
446  SIMemOp Op,
447  bool IsCrossAddrSpaceOrdering,
448  Position Pos) const override;
449 
450  bool insertAcquire(MachineBasicBlock::iterator &MI,
451  SIAtomicScope Scope,
452  SIAtomicAddrSpace AddrSpace,
453  Position Pos) const override;
454 
455  bool insertRelease(MachineBasicBlock::iterator &MI,
456  SIAtomicScope Scope,
457  SIAtomicAddrSpace AddrSpace,
458  bool IsCrossAddrSpaceOrdering,
459  Position Pos) const override;
460 };
461 
462 class SIGfx10CacheControl : public SIGfx7CacheControl {
463 protected:
464 
465  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
466  /// is modified, false otherwise.
467  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
468  return enableNamedBit(MI, AMDGPU::CPol::DLC);
469  }
470 
471 public:
472 
473  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
474 
475  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
476  SIAtomicScope Scope,
477  SIAtomicAddrSpace AddrSpace) const override;
478 
479  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
480  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
481  bool IsVolatile,
482  bool IsNonTemporal) const override;
483 
484  bool insertWait(MachineBasicBlock::iterator &MI,
485  SIAtomicScope Scope,
486  SIAtomicAddrSpace AddrSpace,
487  SIMemOp Op,
488  bool IsCrossAddrSpaceOrdering,
489  Position Pos) const override;
490 
491  bool insertAcquire(MachineBasicBlock::iterator &MI,
492  SIAtomicScope Scope,
493  SIAtomicAddrSpace AddrSpace,
494  Position Pos) const override;
495 };
496 
497 class SIMemoryLegalizer final : public MachineFunctionPass {
498 private:
499 
500  /// Cache Control.
501  std::unique_ptr<SICacheControl> CC = nullptr;
502 
503  /// List of atomic pseudo instructions.
504  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
505 
 506  /// Return true iff instruction \p MI is an atomic instruction that
507  /// returns a result.
508  bool isAtomicRet(const MachineInstr &MI) const {
 509  return SIInstrInfo::isAtomicRet(MI);
 510  }
511 
512  /// Removes all processed atomic pseudo instructions from the current
513  /// function. Returns true if current function is modified, false otherwise.
514  bool removeAtomicPseudoMIs();
515 
516  /// Expands load operation \p MI. Returns true if instructions are
517  /// added/deleted or \p MI is modified, false otherwise.
 518  bool expandLoad(const SIMemOpInfo &MOI,
 519  MachineBasicBlock::iterator &MI);
520  /// Expands store operation \p MI. Returns true if instructions are
521  /// added/deleted or \p MI is modified, false otherwise.
 522  bool expandStore(const SIMemOpInfo &MOI,
 523  MachineBasicBlock::iterator &MI);
524  /// Expands atomic fence operation \p MI. Returns true if
525  /// instructions are added/deleted or \p MI is modified, false otherwise.
 526  bool expandAtomicFence(const SIMemOpInfo &MOI,
 527  MachineBasicBlock::iterator &MI);
528  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
529  /// instructions are added/deleted or \p MI is modified, false otherwise.
 530  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
 531  MachineBasicBlock::iterator &MI);
532 
533 public:
534  static char ID;
535 
536  SIMemoryLegalizer() : MachineFunctionPass(ID) {}
537 
538  void getAnalysisUsage(AnalysisUsage &AU) const override {
539  AU.setPreservesCFG();
 540  MachineFunctionPass::getAnalysisUsage(AU);
 541  }
542 
543  StringRef getPassName() const override {
544  return PASS_NAME;
545  }
546 
547  bool runOnMachineFunction(MachineFunction &MF) override;
548 };
549 
550 } // end namespace anonymous
551 
552 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
553  const char *Msg) const {
554  const Function &Func = MI->getParent()->getParent()->getFunction();
555  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
556  Func.getContext().diagnose(Diag);
557 }
558 
 559 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
 560 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
561  SIAtomicAddrSpace InstrAddrSpace) const {
562  if (SSID == SyncScope::System)
563  return std::make_tuple(SIAtomicScope::SYSTEM,
564  SIAtomicAddrSpace::ATOMIC,
565  true);
566  if (SSID == MMI->getAgentSSID())
567  return std::make_tuple(SIAtomicScope::AGENT,
568  SIAtomicAddrSpace::ATOMIC,
569  true);
570  if (SSID == MMI->getWorkgroupSSID())
571  return std::make_tuple(SIAtomicScope::WORKGROUP,
572  SIAtomicAddrSpace::ATOMIC,
573  true);
574  if (SSID == MMI->getWavefrontSSID())
575  return std::make_tuple(SIAtomicScope::WAVEFRONT,
576  SIAtomicAddrSpace::ATOMIC,
577  true);
578  if (SSID == SyncScope::SingleThread)
579  return std::make_tuple(SIAtomicScope::SINGLETHREAD,
580  SIAtomicAddrSpace::ATOMIC,
581  true);
582  if (SSID == MMI->getSystemOneAddressSpaceSSID())
583  return std::make_tuple(SIAtomicScope::SYSTEM,
584  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
585  false);
586  if (SSID == MMI->getAgentOneAddressSpaceSSID())
587  return std::make_tuple(SIAtomicScope::AGENT,
588  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
589  false);
590  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
591  return std::make_tuple(SIAtomicScope::WORKGROUP,
592  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
593  false);
594  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
595  return std::make_tuple(SIAtomicScope::WAVEFRONT,
596  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
597  false);
598  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
599  return std::make_tuple(SIAtomicScope::SINGLETHREAD,
600  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
601  false);
602  return None;
603 }
604 
605 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
606  if (AS == AMDGPUAS::FLAT_ADDRESS)
 607  return SIAtomicAddrSpace::FLAT;
 608  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
 609  return SIAtomicAddrSpace::GLOBAL;
610  if (AS == AMDGPUAS::LOCAL_ADDRESS)
611  return SIAtomicAddrSpace::LDS;
612  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
613  return SIAtomicAddrSpace::SCRATCH;
614  if (AS == AMDGPUAS::REGION_ADDRESS)
615  return SIAtomicAddrSpace::GDS;
616 
617  return SIAtomicAddrSpace::OTHER;
618 }
619 
620 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
 621  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
 622 }
623 
624 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
625  const MachineBasicBlock::iterator &MI) const {
626  assert(MI->getNumMemOperands() > 0);
627 
 628  SyncScope::ID SSID = SyncScope::SingleThread;
 629  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
 630  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
631  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
632  bool IsNonTemporal = true;
633  bool IsVolatile = false;
634 
635  // Validator should check whether or not MMOs cover the entire set of
636  // locations accessed by the memory instruction.
637  for (const auto &MMO : MI->memoperands()) {
638  IsNonTemporal &= MMO->isNonTemporal();
639  IsVolatile |= MMO->isVolatile();
640  InstrAddrSpace |=
641  toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
642  AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
643  if (OpOrdering != AtomicOrdering::NotAtomic) {
644  const auto &IsSyncScopeInclusion =
645  MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
646  if (!IsSyncScopeInclusion) {
647  reportUnsupported(MI,
648  "Unsupported non-inclusive atomic synchronization scope");
649  return None;
650  }
651 
652  SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
653  Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
654  assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
655  MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
656  FailureOrdering =
657  getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
658  }
659  }
660 
661  SIAtomicScope Scope = SIAtomicScope::NONE;
662  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
663  bool IsCrossAddressSpaceOrdering = false;
664  if (Ordering != AtomicOrdering::NotAtomic) {
665  auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
666  if (!ScopeOrNone) {
667  reportUnsupported(MI, "Unsupported atomic synchronization scope");
668  return None;
669  }
670  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
671  ScopeOrNone.getValue();
672  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
673  ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
674  ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
675  reportUnsupported(MI, "Unsupported atomic address space");
676  return None;
677  }
678  }
679  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
680  IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
681  IsNonTemporal);
682 }
683 
 684 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
 685  const MachineBasicBlock::iterator &MI) const {
686  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
687 
688  if (!(MI->mayLoad() && !MI->mayStore()))
689  return None;
690 
691  // Be conservative if there are no memory operands.
692  if (MI->getNumMemOperands() == 0)
693  return SIMemOpInfo();
694 
695  return constructFromMIWithMMO(MI);
696 }
697 
698 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
699  const MachineBasicBlock::iterator &MI) const {
700  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
701 
702  if (!(!MI->mayLoad() && MI->mayStore()))
703  return None;
704 
705  // Be conservative if there are no memory operands.
706  if (MI->getNumMemOperands() == 0)
707  return SIMemOpInfo();
708 
709  return constructFromMIWithMMO(MI);
710 }
711 
712 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
713  const MachineBasicBlock::iterator &MI) const {
714  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
715 
716  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
717  return None;
718 
719  AtomicOrdering Ordering =
720  static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
721 
722  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
723  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
724  if (!ScopeOrNone) {
725  reportUnsupported(MI, "Unsupported atomic synchronization scope");
726  return None;
727  }
728 
729  SIAtomicScope Scope = SIAtomicScope::NONE;
730  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
731  bool IsCrossAddressSpaceOrdering = false;
732  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
733  ScopeOrNone.getValue();
734 
735  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
736  ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
737  reportUnsupported(MI, "Unsupported atomic address space");
738  return None;
739  }
740 
741  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
742  IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
743 }
744 
745 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
746  const MachineBasicBlock::iterator &MI) const {
747  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
748 
749  if (!(MI->mayLoad() && MI->mayStore()))
750  return None;
751 
752  // Be conservative if there are no memory operands.
753  if (MI->getNumMemOperands() == 0)
754  return SIMemOpInfo();
755 
756  return constructFromMIWithMMO(MI);
757 }
758 
759 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
760  TII = ST.getInstrInfo();
761  IV = getIsaVersion(ST.getCPU());
762  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
763 }
764 
765 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
766  AMDGPU::CPol::CPol Bit) const {
767  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
768  if (!CPol)
769  return false;
770 
771  CPol->setImm(CPol->getImm() | Bit);
772  return true;
773 }
774 
775 /* static */
776 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
777  GCNSubtarget::Generation Generation = ST.getGeneration();
778  if (ST.hasGFX90AInsts())
779  return std::make_unique<SIGfx90ACacheControl>(ST);
780  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
781  return std::make_unique<SIGfx6CacheControl>(ST);
782  if (Generation < AMDGPUSubtarget::GFX10)
783  return std::make_unique<SIGfx7CacheControl>(ST);
784  return std::make_unique<SIGfx10CacheControl>(ST);
785 }
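// Typical use (a sketch, assuming the pass below drives it): a single
// SICacheControl is created per machine function, e.g.
//   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
// and every memory instruction is then rewritten through the returned
// subclass for the current generation.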
786 
787 bool SIGfx6CacheControl::enableLoadCacheBypass(
 788  const MachineBasicBlock::iterator &MI,
 789  SIAtomicScope Scope,
790  SIAtomicAddrSpace AddrSpace) const {
791  assert(MI->mayLoad() && !MI->mayStore());
792  bool Changed = false;
793 
794  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
795  switch (Scope) {
796  case SIAtomicScope::SYSTEM:
797  case SIAtomicScope::AGENT:
798  // Set L1 cache policy to MISS_EVICT.
799  // Note: there is no L2 cache bypass policy at the ISA level.
800  Changed |= enableGLCBit(MI);
801  break;
802  case SIAtomicScope::WORKGROUP:
803  case SIAtomicScope::WAVEFRONT:
804  case SIAtomicScope::SINGLETHREAD:
805  // No cache to bypass.
806  break;
807  default:
808  llvm_unreachable("Unsupported synchronization scope");
809  }
810  }
811 
812  /// The scratch address space does not need the global memory caches
813  /// to be bypassed as all memory operations by the same thread are
814  /// sequentially consistent, and no other thread can access scratch
815  /// memory.
816 
817  /// Other address spaces do not have a cache.
818 
819  return Changed;
820 }
821 
822 bool SIGfx6CacheControl::enableStoreCacheBypass(
 823  const MachineBasicBlock::iterator &MI,
 824  SIAtomicScope Scope,
825  SIAtomicAddrSpace AddrSpace) const {
826  assert(!MI->mayLoad() && MI->mayStore());
827  bool Changed = false;
828 
829  /// The L1 cache is write through so does not need to be bypassed. There is no
830  /// bypass control for the L2 cache at the isa level.
831 
832  return Changed;
833 }
834 
835 bool SIGfx6CacheControl::enableRMWCacheBypass(
 836  const MachineBasicBlock::iterator &MI,
 837  SIAtomicScope Scope,
838  SIAtomicAddrSpace AddrSpace) const {
839  assert(MI->mayLoad() && MI->mayStore());
840  bool Changed = false;
841 
842  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
843  /// bypassed, and the GLC bit is instead used to indicate if they are
844  /// return or no-return.
845  /// Note: there is no L2 cache coherent bypass control at the ISA level.
846 
847  return Changed;
848 }
849 
850 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
851  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
852  bool IsVolatile, bool IsNonTemporal) const {
 853  // Only handle load and store, not atomic read-modify-write instructions. The
854  // latter use glc to indicate if the atomic returns a result and so must not
855  // be used for cache control.
856  assert(MI->mayLoad() ^ MI->mayStore());
857 
858  // Only update load and store, not LLVM IR atomic read-modify-write
 859  // instructions. The latter are always marked as volatile, so they cannot be
 860  // sensibly handled here without pessimizing all atomics, and they do not
 861  // support the nontemporal attribute.
 862  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
 863 
864  bool Changed = false;
865 
866  if (IsVolatile) {
867  // Set L1 cache policy to be MISS_EVICT for load instructions
868  // and MISS_LRU for store instructions.
869  // Note: there is no L2 cache bypass policy at the ISA level.
870  if (Op == SIMemOp::LOAD)
871  Changed |= enableGLCBit(MI);
872 
873  // Ensure operation has completed at system scope to cause all volatile
874  // operations to be visible outside the program in a global order. Do not
875  // request cross address space as only the global address space can be
876  // observable outside the program, so no need to cause a waitcnt for LDS
877  // address space operations.
878  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
879  Position::AFTER);
880 
881  return Changed;
882  }
883 
884  if (IsNonTemporal) {
885  // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
886  // for both loads and stores, and the L2 cache policy to STREAM.
887  Changed |= enableGLCBit(MI);
888  Changed |= enableSLCBit(MI);
889  return Changed;
890  }
891 
892  return Changed;
893 }
894 
895 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
896  SIAtomicScope Scope,
897  SIAtomicAddrSpace AddrSpace,
898  SIMemOp Op,
899  bool IsCrossAddrSpaceOrdering,
900  Position Pos) const {
901  bool Changed = false;
902 
903  MachineBasicBlock &MBB = *MI->getParent();
904  DebugLoc DL = MI->getDebugLoc();
905 
906  if (Pos == Position::AFTER)
907  ++MI;
908 
909  bool VMCnt = false;
910  bool LGKMCnt = false;
911 
912  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
 913  SIAtomicAddrSpace::NONE) {
 914  switch (Scope) {
915  case SIAtomicScope::SYSTEM:
916  case SIAtomicScope::AGENT:
917  VMCnt |= true;
918  break;
919  case SIAtomicScope::WORKGROUP:
920  case SIAtomicScope::WAVEFRONT:
921  case SIAtomicScope::SINGLETHREAD:
922  // The L1 cache keeps all memory operations in order for
923  // wavefronts in the same work-group.
924  break;
925  default:
926  llvm_unreachable("Unsupported synchronization scope");
927  }
928  }
929 
930  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
931  switch (Scope) {
932  case SIAtomicScope::SYSTEM:
933  case SIAtomicScope::AGENT:
934  case SIAtomicScope::WORKGROUP:
935  // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
936  // not needed as LDS operations for all waves are executed in a total
937  // global ordering as observed by all waves. Required if also
938  // synchronizing with global/GDS memory as LDS operations could be
939  // reordered with respect to later global/GDS memory operations of the
940  // same wave.
941  LGKMCnt |= IsCrossAddrSpaceOrdering;
942  break;
943  case SIAtomicScope::WAVEFRONT:
944  case SIAtomicScope::SINGLETHREAD:
945  // The LDS keeps all memory operations in order for
 946  // the same wavefront.
947  break;
948  default:
949  llvm_unreachable("Unsupported synchronization scope");
950  }
951  }
952 
953  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
954  switch (Scope) {
955  case SIAtomicScope::SYSTEM:
956  case SIAtomicScope::AGENT:
957  // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
958  // is not needed as GDS operations for all waves are executed in a total
959  // global ordering as observed by all waves. Required if also
960  // synchronizing with global/LDS memory as GDS operations could be
961  // reordered with respect to later global/LDS memory operations of the
962  // same wave.
963  LGKMCnt |= IsCrossAddrSpaceOrdering;
964  break;
965  case SIAtomicScope::WORKGROUP:
966  case SIAtomicScope::WAVEFRONT:
967  case SIAtomicScope::SINGLETHREAD:
968  // The GDS keeps all memory operations in order for
969  // the same work-group.
970  break;
971  default:
972  llvm_unreachable("Unsupported synchronization scope");
973  }
974  }
975 
976  if (VMCnt || LGKMCnt) {
977  unsigned WaitCntImmediate =
 978  AMDGPU::encodeWaitcnt(IV,
 979  VMCnt ? 0 : getVmcntBitMask(IV),
980  getExpcntBitMask(IV),
981  LGKMCnt ? 0 : getLgkmcntBitMask(IV));
982  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
983  Changed = true;
984  }
985 
986  if (Pos == Position::AFTER)
987  --MI;
988 
989  return Changed;
990 }
991 
992 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
993  SIAtomicScope Scope,
994  SIAtomicAddrSpace AddrSpace,
995  Position Pos) const {
996  if (!InsertCacheInv)
997  return false;
998 
999  bool Changed = false;
1000 
1001  MachineBasicBlock &MBB = *MI->getParent();
1002  DebugLoc DL = MI->getDebugLoc();
1003 
1004  if (Pos == Position::AFTER)
1005  ++MI;
1006 
1007  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1008  switch (Scope) {
1009  case SIAtomicScope::SYSTEM:
1010  case SIAtomicScope::AGENT:
1011  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1012  Changed = true;
1013  break;
1014  case SIAtomicScope::WORKGROUP:
1015  case SIAtomicScope::WAVEFRONT:
1016  case SIAtomicScope::SINGLETHREAD:
1017  // No cache to invalidate.
1018  break;
1019  default:
1020  llvm_unreachable("Unsupported synchronization scope");
1021  }
1022  }
1023 
1024  /// The scratch address space does not need the global memory cache
1025  /// to be flushed as all memory operations by the same thread are
1026  /// sequentially consistent, and no other thread can access scratch
1027  /// memory.
1028 
1029  /// Other address spaces do not have a cache.
1030 
1031  if (Pos == Position::AFTER)
1032  --MI;
1033 
1034  return Changed;
1035 }
1036 
1037 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1038  SIAtomicScope Scope,
1039  SIAtomicAddrSpace AddrSpace,
1040  bool IsCrossAddrSpaceOrdering,
1041  Position Pos) const {
1042  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1043  IsCrossAddrSpaceOrdering, Pos);
1044 }
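// Example (informal): a release fence or release store at agent scope over
// only the global address space therefore expands to "s_waitcnt vmcnt(0)" on
// GFX6, because insertWait requests a VM count wait for SYSTEM/AGENT scope
// and this generation has no explicit cache writeback instruction to issue.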
1045 
1046 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1047  SIAtomicScope Scope,
1048  SIAtomicAddrSpace AddrSpace,
1049  Position Pos) const {
1050  if (!InsertCacheInv)
1051  return false;
1052 
1053  bool Changed = false;
1054 
1055  MachineBasicBlock &MBB = *MI->getParent();
1056  DebugLoc DL = MI->getDebugLoc();
1057 
1058  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1059 
1060  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1061  ? AMDGPU::BUFFER_WBINVL1
1062  : AMDGPU::BUFFER_WBINVL1_VOL;
1063 
1064  if (Pos == Position::AFTER)
1065  ++MI;
1066 
1067  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1068  switch (Scope) {
1069  case SIAtomicScope::SYSTEM:
1070  case SIAtomicScope::AGENT:
1071  BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1072  Changed = true;
1073  break;
1074  case SIAtomicScope::WORKGROUP:
1075  case SIAtomicScope::WAVEFRONT:
1076  case SIAtomicScope::SINGLETHREAD:
1077  // No cache to invalidate.
1078  break;
1079  default:
1080  llvm_unreachable("Unsupported synchronization scope");
1081  }
1082  }
1083 
1084  /// The scratch address space does not need the global memory cache
1085  /// to be flushed as all memory operations by the same thread are
1086  /// sequentially consistent, and no other thread can access scratch
1087  /// memory.
1088 
1089  /// Other address spaces do not have a cache.
1090 
1091  if (Pos == Position::AFTER)
1092  --MI;
1093 
1094  return Changed;
1095 }
1096 
1097 bool SIGfx90ACacheControl::enableLoadCacheBypass(
 1098  const MachineBasicBlock::iterator &MI,
 1099  SIAtomicScope Scope,
1100  SIAtomicAddrSpace AddrSpace) const {
1101  assert(MI->mayLoad() && !MI->mayStore());
1102  bool Changed = false;
1103 
1104  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1105  switch (Scope) {
1106  case SIAtomicScope::SYSTEM:
1107  case SIAtomicScope::AGENT:
1108  // Set the L1 cache policy to MISS_LRU.
1109  // Note: there is no L2 cache bypass policy at the ISA level.
1110  Changed |= enableGLCBit(MI);
1111  break;
1112  case SIAtomicScope::WORKGROUP:
1113  // In threadgroup split mode the waves of a work-group can be executing on
1114  // different CUs. Therefore need to bypass the L1 which is per CU.
1115  // Otherwise in non-threadgroup split mode all waves of a work-group are
1116  // on the same CU, and so the L1 does not need to be bypassed.
1117  if (ST.isTgSplitEnabled())
1118  Changed |= enableGLCBit(MI);
1119  break;
1120  case SIAtomicScope::WAVEFRONT:
1121  case SIAtomicScope::SINGLETHREAD:
1122  // No cache to bypass.
1123  break;
1124  default:
1125  llvm_unreachable("Unsupported synchronization scope");
1126  }
1127  }
1128 
1129  /// The scratch address space does not need the global memory caches
1130  /// to be bypassed as all memory operations by the same thread are
1131  /// sequentially consistent, and no other thread can access scratch
1132  /// memory.
1133 
1134  /// Other address spaces do not have a cache.
1135 
1136  return Changed;
1137 }
1138 
1139 bool SIGfx90ACacheControl::enableStoreCacheBypass(
 1140  const MachineBasicBlock::iterator &MI,
 1141  SIAtomicScope Scope,
1142  SIAtomicAddrSpace AddrSpace) const {
1143  assert(!MI->mayLoad() && MI->mayStore());
1144  bool Changed = false;
1145 
1146  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1147  switch (Scope) {
1148  case SIAtomicScope::SYSTEM:
1149  case SIAtomicScope::AGENT:
1150  /// Do not set glc for store atomic operations as they implicitly write
1151  /// through the L1 cache.
1152  break;
1153  case SIAtomicScope::WORKGROUP:
1154  case SIAtomicScope::WAVEFRONT:
1155  case SIAtomicScope::SINGLETHREAD:
1156  // No cache to bypass. Store atomics implicitly write through the L1
1157  // cache.
1158  break;
1159  default:
1160  llvm_unreachable("Unsupported synchronization scope");
1161  }
1162  }
1163 
1164  /// The scratch address space does not need the global memory caches
1165  /// to be bypassed as all memory operations by the same thread are
1166  /// sequentially consistent, and no other thread can access scratch
1167  /// memory.
1168 
1169  /// Other address spaces do not have a cache.
1170 
1171  return Changed;
1172 }
1173 
1174 bool SIGfx90ACacheControl::enableRMWCacheBypass(
 1175  const MachineBasicBlock::iterator &MI,
 1176  SIAtomicScope Scope,
1177  SIAtomicAddrSpace AddrSpace) const {
1178  assert(MI->mayLoad() && MI->mayStore());
1179  bool Changed = false;
1180 
1181  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1182  switch (Scope) {
1183  case SIAtomicScope::SYSTEM:
1184  case SIAtomicScope::AGENT:
1185  /// Do not set glc for RMW atomic operations as they implicitly bypass
1186  /// the L1 cache, and the glc bit is instead used to indicate if they are
1187  /// return or no-return.
1188  break;
1189  case SIAtomicScope::WORKGROUP:
1190  case SIAtomicScope::WAVEFRONT:
1191  case SIAtomicScope::SINGLETHREAD:
1192  // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1193  break;
1194  default:
1195  llvm_unreachable("Unsupported synchronization scope");
1196  }
1197  }
1198 
1199  return Changed;
1200 }
1201 
1202 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1203  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1204  bool IsVolatile, bool IsNonTemporal) const {
 1205  // Only handle load and store, not atomic read-modify-write instructions. The
1206  // latter use glc to indicate if the atomic returns a result and so must not
1207  // be used for cache control.
1208  assert(MI->mayLoad() ^ MI->mayStore());
1209 
1210  // Only update load and store, not LLVM IR atomic read-modify-write
 1211  // instructions. The latter are always marked as volatile, so they cannot be
 1212  // sensibly handled here without pessimizing all atomics, and they do not
 1213  // support the nontemporal attribute.
 1214  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
 1215 
1216  bool Changed = false;
1217 
1218  if (IsVolatile) {
1219  // Set L1 cache policy to be MISS_EVICT for load instructions
1220  // and MISS_LRU for store instructions.
1221  // Note: there is no L2 cache bypass policy at the ISA level.
1222  if (Op == SIMemOp::LOAD)
1223  Changed |= enableGLCBit(MI);
1224 
1225  // Ensure operation has completed at system scope to cause all volatile
1226  // operations to be visible outside the program in a global order. Do not
1227  // request cross address space as only the global address space can be
1228  // observable outside the program, so no need to cause a waitcnt for LDS
1229  // address space operations.
1230  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1231  Position::AFTER);
1232 
1233  return Changed;
1234  }
1235 
1236  if (IsNonTemporal) {
1237  // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1238  // for both loads and stores, and the L2 cache policy to STREAM.
1239  Changed |= enableGLCBit(MI);
1240  Changed |= enableSLCBit(MI);
1241  return Changed;
1242  }
1243 
1244  return Changed;
1245 }
1246 
1247 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1248  SIAtomicScope Scope,
1249  SIAtomicAddrSpace AddrSpace,
1250  SIMemOp Op,
1251  bool IsCrossAddrSpaceOrdering,
1252  Position Pos) const {
1253  if (ST.isTgSplitEnabled()) {
1254  // In threadgroup split mode the waves of a work-group can be executing on
1255  // different CUs. Therefore need to wait for global or GDS memory operations
1256  // to complete to ensure they are visible to waves in the other CUs.
1257  // Otherwise in non-threadgroup split mode all waves of a work-group are on
1258  // the same CU, so no need to wait for global memory as all waves in the
 1259  // work-group access the same L1, nor wait for GDS as accesses are ordered
1260  // on a CU.
1261  if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1262  SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1263  (Scope == SIAtomicScope::WORKGROUP)) {
1264  // Same as GFX7 using agent scope.
1265  Scope = SIAtomicScope::AGENT;
1266  }
1267  // In threadgroup split mode LDS cannot be allocated so no need to wait for
1268  // LDS memory operations.
1269  AddrSpace &= ~SIAtomicAddrSpace::LDS;
1270  }
1271  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1272  IsCrossAddrSpaceOrdering, Pos);
1273 }
1274 
1275 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1276  SIAtomicScope Scope,
1277  SIAtomicAddrSpace AddrSpace,
1278  Position Pos) const {
1279  if (!InsertCacheInv)
1280  return false;
1281 
1282  bool Changed = false;
1283 
1284  MachineBasicBlock &MBB = *MI->getParent();
1285  DebugLoc DL = MI->getDebugLoc();
1286 
1287  if (Pos == Position::AFTER)
1288  ++MI;
1289 
1290  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1291  switch (Scope) {
1292  case SIAtomicScope::SYSTEM:
1293  // Ensures that following loads will not see stale remote VMEM data or
1294  // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1295  // CC will never be stale due to the local memory probes.
1296  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1297  // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1298  // hardware does not reorder memory operations by the same wave with
1299  // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1300  // remove any cache lines of earlier writes by the same wave and ensures
1301  // later reads by the same wave will refetch the cache lines.
1302  Changed = true;
1303  break;
1304  case SIAtomicScope::AGENT:
1305  // Same as GFX7.
1306  break;
1307  case SIAtomicScope::WORKGROUP:
1308  // In threadgroup split mode the waves of a work-group can be executing on
1309  // different CUs. Therefore need to invalidate the L1 which is per CU.
1310  // Otherwise in non-threadgroup split mode all waves of a work-group are
1311  // on the same CU, and so the L1 does not need to be invalidated.
1312  if (ST.isTgSplitEnabled()) {
1313  // Same as GFX7 using agent scope.
1314  Scope = SIAtomicScope::AGENT;
1315  }
1316  break;
1317  case SIAtomicScope::WAVEFRONT:
1318  case SIAtomicScope::SINGLETHREAD:
1319  // Same as GFX7.
1320  break;
1321  default:
1322  llvm_unreachable("Unsupported synchronization scope");
1323  }
1324  }
1325 
1326  /// The scratch address space does not need the global memory cache
1327  /// to be flushed as all memory operations by the same thread are
1328  /// sequentially consistent, and no other thread can access scratch
1329  /// memory.
1330 
1331  /// Other address spaces do not have a cache.
1332 
1333  if (Pos == Position::AFTER)
1334  --MI;
1335 
1336  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1337 
1338  return Changed;
1339 }
1340 
1341 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1342  SIAtomicScope Scope,
1343  SIAtomicAddrSpace AddrSpace,
1344  bool IsCrossAddrSpaceOrdering,
1345  Position Pos) const {
1346  bool Changed = false;
1347 
1348  MachineBasicBlock &MBB = *MI->getParent();
1349  DebugLoc DL = MI->getDebugLoc();
1350 
1351  if (Pos == Position::AFTER)
1352  ++MI;
1353 
1354  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1355  switch (Scope) {
1356  case SIAtomicScope::SYSTEM:
1357  // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1358  // hardware does not reorder memory operations by the same wave with
1359  // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1360  // to initiate writeback of any dirty cache lines of earlier writes by the
1361  // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1362  // writeback has completed.
1363  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
1364  // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1365  // vmcnt(0)" needed by the "BUFFER_WBL2".
1366  Changed = true;
1367  break;
1368  case SIAtomicScope::AGENT:
1369  case SIAtomicScope::WORKGROUP:
1370  case SIAtomicScope::WAVEFRONT:
1371  case SIAtomicScope::SINGLETHREAD:
1372  // Same as GFX7.
1373  break;
1374  default:
1375  llvm_unreachable("Unsupported synchronization scope");
1376  }
1377  }
1378 
1379  if (Pos == Position::AFTER)
1380  --MI;
1381 
1382  Changed |=
1383  SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1384  IsCrossAddrSpaceOrdering, Pos);
1385 
1386  return Changed;
1387 }
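// Example (informal): a system-scope release over the global address space on
// GFX90A therefore becomes "buffer_wbl2" followed by the inherited GFX7
// sequence, whose "s_waitcnt vmcnt(0)" also waits for the L2 writeback to
// complete.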
1388 
1389 bool SIGfx10CacheControl::enableLoadCacheBypass(
 1390  const MachineBasicBlock::iterator &MI,
 1391  SIAtomicScope Scope,
1392  SIAtomicAddrSpace AddrSpace) const {
1393  assert(MI->mayLoad() && !MI->mayStore());
1394  bool Changed = false;
1395 
1396  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1397  switch (Scope) {
1398  case SIAtomicScope::SYSTEM:
1399  case SIAtomicScope::AGENT:
1400  // Set the L0 and L1 cache policies to MISS_EVICT.
1401  // Note: there is no L2 cache coherent bypass control at the ISA level.
1402  Changed |= enableGLCBit(MI);
1403  Changed |= enableDLCBit(MI);
1404  break;
1405  case SIAtomicScope::WORKGROUP:
1406  // In WGP mode the waves of a work-group can be executing on either CU of
1407  // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1408  // CU mode all waves of a work-group are on the same CU, and so the L0
1409  // does not need to be bypassed.
1410  if (!ST.isCuModeEnabled())
1411  Changed |= enableGLCBit(MI);
1412  break;
1413  case SIAtomicScope::WAVEFRONT:
1414  case SIAtomicScope::SINGLETHREAD:
1415  // No cache to bypass.
1416  break;
1417  default:
1418  llvm_unreachable("Unsupported synchronization scope");
1419  }
1420  }
1421 
1422  /// The scratch address space does not need the global memory caches
1423  /// to be bypassed as all memory operations by the same thread are
1424  /// sequentially consistent, and no other thread can access scratch
1425  /// memory.
1426 
1427  /// Other address spaces do not have a cache.
1428 
1429  return Changed;
1430 }
1431 
1432 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1433  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1434  bool IsVolatile, bool IsNonTemporal) const {
1435 
 1436  // Only handle load and store, not atomic read-modify-write instructions. The
1437  // latter use glc to indicate if the atomic returns a result and so must not
1438  // be used for cache control.
1439  assert(MI->mayLoad() ^ MI->mayStore());
1440 
1441  // Only update load and store, not LLVM IR atomic read-modify-write
 1442  // instructions. The latter are always marked as volatile, so they cannot be
 1443  // sensibly handled here without pessimizing all atomics, and they do not
 1444  // support the nontemporal attribute.
 1445  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
 1446 
1447  bool Changed = false;
1448 
1449  if (IsVolatile) {
1450  // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1451  // and MISS_LRU for store instructions.
1452  // Note: there is no L2 cache coherent bypass control at the ISA level.
1453  if (Op == SIMemOp::LOAD) {
1454  Changed |= enableGLCBit(MI);
1455  Changed |= enableDLCBit(MI);
1456  }
1457 
1458  // Ensure operation has completed at system scope to cause all volatile
1459  // operations to be visible outside the program in a global order. Do not
1460  // request cross address space as only the global address space can be
1461  // observable outside the program, so no need to cause a waitcnt for LDS
1462  // address space operations.
1463  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1464  Position::AFTER);
1465  return Changed;
1466  }
1467 
1468  if (IsNonTemporal) {
1469  // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1470  // and L2 cache policy to STREAM.
1471  // For stores setting both GLC and SLC configures L0 and L1 cache policy
1472  // to MISS_EVICT and the L2 cache policy to STREAM.
1473  if (Op == SIMemOp::STORE)
1474  Changed |= enableGLCBit(MI);
1475  Changed |= enableSLCBit(MI);
1476 
1477  return Changed;
1478  }
1479 
1480  return Changed;
1481 }
1482 
1483 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1484  SIAtomicScope Scope,
1485  SIAtomicAddrSpace AddrSpace,
1486  SIMemOp Op,
1487  bool IsCrossAddrSpaceOrdering,
1488  Position Pos) const {
1489  bool Changed = false;
1490 
1491  MachineBasicBlock &MBB = *MI->getParent();
1492  DebugLoc DL = MI->getDebugLoc();
1493 
1494  if (Pos == Position::AFTER)
1495  ++MI;
1496 
1497  bool VMCnt = false;
1498  bool VSCnt = false;
1499  bool LGKMCnt = false;
1500 
1501  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
 1502  SIAtomicAddrSpace::NONE) {
 1503  switch (Scope) {
1504  case SIAtomicScope::SYSTEM:
1505  case SIAtomicScope::AGENT:
1506  if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1507  VMCnt |= true;
1508  if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1509  VSCnt |= true;
1510  break;
1511  case SIAtomicScope::WORKGROUP:
1512  // In WGP mode the waves of a work-group can be executing on either CU of
1513  // the WGP. Therefore need to wait for operations to complete to ensure
1514  // they are visible to waves in the other CU as the L0 is per CU.
 1515  // Otherwise in CU mode all waves of a work-group are on the same CU
1516  // which shares the same L0.
1517  if (!ST.isCuModeEnabled()) {
1518  if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1519  VMCnt |= true;
1520  if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1521  VSCnt |= true;
1522  }
1523  break;
1524  case SIAtomicScope::WAVEFRONT:
1525  case SIAtomicScope::SINGLETHREAD:
1526  // The L0 cache keeps all memory operations in order for
1527  // work-items in the same wavefront.
1528  break;
1529  default:
1530  llvm_unreachable("Unsupported synchronization scope");
1531  }
1532  }
1533 
1534  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1535  switch (Scope) {
1536  case SIAtomicScope::SYSTEM:
1537  case SIAtomicScope::AGENT:
1538  case SIAtomicScope::WORKGROUP:
1539  // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1540  // not needed as LDS operations for all waves are executed in a total
1541  // global ordering as observed by all waves. Required if also
1542  // synchronizing with global/GDS memory as LDS operations could be
1543  // reordered with respect to later global/GDS memory operations of the
1544  // same wave.
1545  LGKMCnt |= IsCrossAddrSpaceOrdering;
1546  break;
1547  case SIAtomicScope::WAVEFRONT:
1548  case SIAtomicScope::SINGLETHREAD:
1549  // The LDS keeps all memory operations in order for
 1550  // the same wavefront.
1551  break;
1552  default:
1553  llvm_unreachable("Unsupported synchronization scope");
1554  }
1555  }
1556 
1557  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1558  switch (Scope) {
1559  case SIAtomicScope::SYSTEM:
1560  case SIAtomicScope::AGENT:
1561  // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1562  // is not needed as GDS operations for all waves are executed in a total
1563  // global ordering as observed by all waves. Required if also
1564  // synchronizing with global/LDS memory as GDS operations could be
1565  // reordered with respect to later global/LDS memory operations of the
1566  // same wave.
1567  LGKMCnt |= IsCrossAddrSpaceOrdering;
1568  break;
1569  case SIAtomicScope::WORKGROUP:
1570  case SIAtomicScope::WAVEFRONT:
1571  case SIAtomicScope::SINGLETHREAD:
1572  // The GDS keeps all memory operations in order for
1573  // the same work-group.
1574  break;
1575  default:
1576  llvm_unreachable("Unsupported synchronization scope");
1577  }
1578  }
1579 
1580  if (VMCnt || LGKMCnt) {
1581  unsigned WaitCntImmediate =
 1582  AMDGPU::encodeWaitcnt(IV,
 1583  VMCnt ? 0 : getVmcntBitMask(IV),
1584  getExpcntBitMask(IV),
1585  LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1586  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1587  Changed = true;
1588  }
1589 
1590  if (VSCnt) {
1591  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1592  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1593  .addImm(0);
1594  Changed = true;
1595  }
1596 
1597  if (Pos == Position::AFTER)
1598  --MI;
1599 
1600  return Changed;
1601 }
1602 
1603 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1604  SIAtomicScope Scope,
1605  SIAtomicAddrSpace AddrSpace,
1606  Position Pos) const {
1607  if (!InsertCacheInv)
1608  return false;
1609 
1610  bool Changed = false;
1611 
1612  MachineBasicBlock &MBB = *MI->getParent();
1613  DebugLoc DL = MI->getDebugLoc();
1614 
1615  if (Pos == Position::AFTER)
1616  ++MI;
1617 
1618  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1619  switch (Scope) {
1620  case SIAtomicScope::SYSTEM:
1621  case SIAtomicScope::AGENT:
1622  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1623  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1624  Changed = true;
1625  break;
1626  case SIAtomicScope::WORKGROUP:
1627  // In WGP mode the waves of a work-group can be executing on either CU of
 1628  // the WGP. Therefore the L0, which is per CU, needs to be invalidated.
 1629  // Otherwise, in CU mode, all waves of a work-group are on the same CU,
 1630  // and so the L0 does not need to be invalidated.
1631  if (!ST.isCuModeEnabled()) {
1632  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1633  Changed = true;
1634  }
1635  break;
1636  case SIAtomicScope::WAVEFRONT:
1637  case SIAtomicScope::SINGLETHREAD:
1638  // No cache to invalidate.
1639  break;
1640  default:
1641  llvm_unreachable("Unsupported synchronization scope");
1642  }
1643  }
1644 
1645  /// The scratch address space does not need the global memory cache
1646  /// to be flushed as all memory operations by the same thread are
1647  /// sequentially consistent, and no other thread can access scratch
1648  /// memory.
1649 
1650  /// Other address spaces do not have a cache.
1651 
1652  if (Pos == Position::AFTER)
1653  --MI;
1654 
1655  return Changed;
1656 }
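
// For illustration (a sketch of the result, not additional behavior): after
// this function runs for an agent-scope acquire, the MIR contains both
// BUFFER_GL0_INV and BUFFER_GL1_INV; a workgroup-scope acquire in WGP mode
// gets only BUFFER_GL0_INV, and in CU mode no invalidation is inserted.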
1657 
1658 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
1659  if (AtomicPseudoMIs.empty())
1660  return false;
1661 
1662  for (auto &MI : AtomicPseudoMIs)
1663  MI->eraseFromParent();
1664 
1665  AtomicPseudoMIs.clear();
1666  return true;
1667 }
1668 
 1669 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
 1670  MachineBasicBlock::iterator &MI) {
 1671  assert(MI->mayLoad() && !MI->mayStore());
1672 
1673  bool Changed = false;
1674 
1675  if (MOI.isAtomic()) {
1676  if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1677  MOI.getOrdering() == AtomicOrdering::Acquire ||
1678  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1679  Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
1680  MOI.getOrderingAddrSpace());
1681  }
1682 
1683  if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1684  Changed |= CC->insertWait(MI, MOI.getScope(),
 1685  MOI.getOrderingAddrSpace(),
 1686  SIMemOp::LOAD | SIMemOp::STORE,
 1687  MOI.getIsCrossAddressSpaceOrdering(),
1688  Position::BEFORE);
1689 
1690  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1691  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1692  Changed |= CC->insertWait(MI, MOI.getScope(),
1693  MOI.getInstrAddrSpace(),
1694  SIMemOp::LOAD,
1695  MOI.getIsCrossAddressSpaceOrdering(),
1696  Position::AFTER);
1697  Changed |= CC->insertAcquire(MI, MOI.getScope(),
1698  MOI.getOrderingAddrSpace(),
1699  Position::AFTER);
1700  }
1701 
1702  return Changed;
1703  }
1704 
1705  // Atomic instructions already bypass caches to the scope specified by the
1706  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1707  // need additional treatment.
1708  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
1709  SIMemOp::LOAD, MOI.isVolatile(),
1710  MOI.isNonTemporal());
1711  return Changed;
1712 }
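
// Illustrative end-to-end sketch (exact encoding depends on the subtarget):
// on gfx10, a global "load atomic ... syncscope("agent") acquire" is expanded
// by expandLoad() into roughly:
//
//   global_load_dword v1, v[0:1], off glc dlc   ; cache bypass to agent scope
//   s_waitcnt vmcnt(0)                          ; acquire: wait for the load
//   buffer_gl0_inv                              ; invalidate L0
//   buffer_gl1_inv                              ; invalidate L1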
1713 
 1714 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
 1715  MachineBasicBlock::iterator &MI) {
 1716  assert(!MI->mayLoad() && MI->mayStore());
1717 
1718  bool Changed = false;
1719 
1720  if (MOI.isAtomic()) {
1721  if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1722  MOI.getOrdering() == AtomicOrdering::Release ||
1723  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1724  Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
1725  MOI.getOrderingAddrSpace());
1726  }
1727 
1728  if (MOI.getOrdering() == AtomicOrdering::Release ||
1729  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1730  Changed |= CC->insertRelease(MI, MOI.getScope(),
1731  MOI.getOrderingAddrSpace(),
1732  MOI.getIsCrossAddressSpaceOrdering(),
1733  Position::BEFORE);
1734 
1735  return Changed;
1736  }
1737 
1738  // Atomic instructions already bypass caches to the scope specified by the
1739  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1740  // need additional treatment.
1741  Changed |= CC->enableVolatileAndOrNonTemporal(
1742  MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
1743  MOI.isNonTemporal());
1744  return Changed;
1745 }
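
// Illustrative sketch only: on gfx10, a global "store atomic ...
// syncscope("agent") release" becomes roughly:
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)   ; release: drain pending loads / LDS
//   s_waitcnt_vscnt null, 0x0       ; release: drain pending stores
//   global_store_dword v[0:1], v2, off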
1746 
 1747 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
 1748  MachineBasicBlock::iterator &MI) {
 1749  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
1750 
1751  AtomicPseudoMIs.push_back(MI);
1752  bool Changed = false;
1753 
1754  if (MOI.isAtomic()) {
1755  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1756  MOI.getOrdering() == AtomicOrdering::Release ||
1757  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1758  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1759  /// TODO: This relies on a barrier always generating a waitcnt
1760  /// for LDS to ensure it is not reordered with the completion of
 1761  /// the preceding LDS operations. If the barrier had a memory
 1762  /// ordering and memory scope, then the library would not need to
 1763  /// generate a fence. Support could be added in this file for the
 1764  /// barrier, and SIInsertWaitcnt.cpp could then stop unconditionally
 1765  /// adding an S_WAITCNT before an S_BARRIER.
1766  Changed |= CC->insertRelease(MI, MOI.getScope(),
1767  MOI.getOrderingAddrSpace(),
1768  MOI.getIsCrossAddressSpaceOrdering(),
1769  Position::BEFORE);
1770 
1771  // TODO: If both release and invalidate are happening they could be combined
1772  // to use the single "BUFFER_WBINV*" instruction. This could be done by
1773  // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
1774  // track cache invalidate and write back instructions.
1775 
1776  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1777  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1778  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1779  Changed |= CC->insertAcquire(MI, MOI.getScope(),
1780  MOI.getOrderingAddrSpace(),
1781  Position::BEFORE);
1782 
1783  return Changed;
1784  }
1785 
1786  return Changed;
1787 }
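
// Illustrative sketch only: "fence syncscope("agent") acq_rel" on gfx10 is
// expanded in place of the ATOMIC_FENCE pseudo into roughly:
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)   ; release half: wait for prior accesses
//   s_waitcnt_vscnt null, 0x0
//   buffer_gl0_inv                  ; acquire half: invalidate stale lines
//   buffer_gl1_inv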
1788 
 1789 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
 1790  MachineBasicBlock::iterator &MI) {
 1791  assert(MI->mayLoad() && MI->mayStore());
1792 
1793  bool Changed = false;
1794 
1795  if (MOI.isAtomic()) {
1796  if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1797  MOI.getOrdering() == AtomicOrdering::Acquire ||
1798  MOI.getOrdering() == AtomicOrdering::Release ||
1799  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1800  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1801  Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
1802  MOI.getInstrAddrSpace());
1803  }
1804 
1805  if (MOI.getOrdering() == AtomicOrdering::Release ||
1806  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1807  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1808  MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
1809  Changed |= CC->insertRelease(MI, MOI.getScope(),
1810  MOI.getOrderingAddrSpace(),
1811  MOI.getIsCrossAddressSpaceOrdering(),
1812  Position::BEFORE);
1813 
1814  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1815  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1816  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1817  MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
1818  MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
1819  Changed |= CC->insertWait(MI, MOI.getScope(),
1820  MOI.getInstrAddrSpace(),
 1821  isAtomicRet(*MI) ? SIMemOp::LOAD :
 1822  SIMemOp::STORE,
 1823  MOI.getIsCrossAddressSpaceOrdering(),
1824  Position::AFTER);
1825  Changed |= CC->insertAcquire(MI, MOI.getScope(),
1826  MOI.getOrderingAddrSpace(),
1827  Position::AFTER);
1828  }
1829 
1830  return Changed;
1831  }
1832 
1833  return Changed;
1834 }
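
// Illustrative sketch only (instruction syntax is approximate): an
// "atomicrmw add ... syncscope("agent") seq_cst" that returns its result is
// expanded on gfx10 into roughly:
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)               ; release: drain prior accesses
//   s_waitcnt_vscnt null, 0x0                   ; release: drain prior stores
//   global_atomic_add v1, v[0:1], v2, off glc   ; returning atomic
//   s_waitcnt vmcnt(0)                          ; acquire: wait for the result
//   buffer_gl0_inv
//   buffer_gl1_inv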
1835 
1836 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
1837  bool Changed = false;
1838 
1839  SIMemOpAccess MOA(MF);
1840  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
1841 
1842  for (auto &MBB : MF) {
1843  for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
1844 
1845  // Unbundle instructions after the post-RA scheduler.
1846  if (MI->isBundle() && MI->mayLoadOrStore()) {
 1847  MachineBasicBlock::instr_iterator II(MI->getIterator());
 1848  for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
 1849  I != E && I->isBundledWithPred(); ++I) {
1850  I->unbundleFromPred();
1851  for (MachineOperand &MO : I->operands())
1852  if (MO.isReg())
1853  MO.setIsInternalRead(false);
1854  }
1855 
1856  MI->eraseFromParent();
1857  MI = II->getIterator();
1858  }
1859 
1860  if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
1861  continue;
1862 
1863  if (const auto &MOI = MOA.getLoadInfo(MI))
1864  Changed |= expandLoad(MOI.getValue(), MI);
1865  else if (const auto &MOI = MOA.getStoreInfo(MI))
1866  Changed |= expandStore(MOI.getValue(), MI);
1867  else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
1868  Changed |= expandAtomicFence(MOI.getValue(), MI);
1869  else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
1870  Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
1871  }
1872  }
1873 
1874  Changed |= removeAtomicPseudoMIs();
1875  return Changed;
1876 }
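
// A minimal way to exercise this pass (assuming an LLVM build with the AMDGPU
// target) is to run llc on IR containing an atomic memory operation, e.g.:
//
//   ; test.ll
//   define i32 @acquire_load(i32 addrspace(1)* %p) {
//     %v = load atomic i32, i32 addrspace(1)* %p syncscope("agent") acquire, align 4
//     ret i32 %v
//   }
//
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 test.ll -o -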
1877 
1878 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
1879 
1880 char SIMemoryLegalizer::ID = 0;
1881 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
1882 
 1883 FunctionPass *llvm::createSIMemoryLegalizerPass() {
 1884  return new SIMemoryLegalizer();
1885 }