//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <list>

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

namespace {

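// Allow the bitwise |, &, ^ and ~ operators to be used on the bitmask enums
// declared below (SIMemOp and SIAtomicAddrSpace).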
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

/// Sets named bit \p BitName to "true" if present in instruction \p MI.
/// \returns True if \p MI is modified, false otherwise.
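/// For example, enableNamedBit<AMDGPU::OpName::glc>(MI) turns on MI's glc
/// modifier when the instruction has one (see enableGLCBit below).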
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                AtomicOrdering::SequentiallyConsistent,
              bool IsNonTemporal = false)
    : Ordering(Ordering), FailureOrdering(FailureOrdering),
      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
      InstrAddrSpace(InstrAddrSpace),
      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
      IsNonTemporal(IsNonTemporal) {
    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
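    // (isPowerOf2_32 on the bitmask holds exactly when a single address
    // space bit is set.)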
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      IsCrossAddressSpaceOrdering = false;
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is non-temporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction to indicate it is
  /// nontemporal. Return true iff the instruction was modified.
  virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
    const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure any caches associated with
  /// address spaces \p AddrSpace for memory scopes up to memory scope
  /// \p Scope are invalidated. Returns true iff any instructions were
  /// inserted.
  virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions of kind \p Op
  /// associated with address spaces \p AddrSpace have completed as
  /// observed by other memory instructions executing in memory scope
  /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
  /// ordering is between address spaces. Returns true iff any
  /// instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;

};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrScope) const {
  /// TODO: For now assume OpenCL memory model which treats each
  /// address space as having a separate happens-before relation, and
  /// so an instruction only has ordering with respect to the address
  /// space it accesses, and if it accesses multiple address spaces it
  /// does not require ordering of operations in different address
  /// spaces.
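  // The bool in each returned tuple is the cross-address-space-ordering
  // flag; under the OpenCL model assumed above it is always false.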
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  /// TODO: To support the HSA memory model, add additional memory
  /// scopes that do require cross address space ordering.
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
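  // Merge over all MMOs: keep the strongest ordering and the widest
  // synchronization scope seen, and treat the access as nontemporal only if
  // every MMO is nontemporal.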
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
        MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
        isStrongerThan(Ordering, OpOrdering) ?
          Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
        isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
          FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
    ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
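  // gfx6 (SI) gets the base cache control, whose invalidate emits
  // BUFFER_WBINVL1; gfx7 (CI) and later override it to emit the volatile
  // variant BUFFER_WBINVL1_VOL (see SIGfx7CacheControl).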
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return make_unique<SIGfx6CacheControl>(ST);
  return make_unique<SIGfx7CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L1 cache.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableNonTemporal(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->mayLoad() ^ MI->mayStore());
  bool Changed = false;

  /// TODO: Do not enableGLCBit if rmw atomic.
  Changed |= enableGLCBit(MI);
  Changed |= enableSLCBit(MI);

  return Changed;
}

bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;
  bool EXPCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an LDS waitcnt is not
      // needed as LDS operations for all waves are executed in a
      // total global ordering as observed by all waves. Required if
      // also synchronizing with global/GDS memory as LDS operations
      // could be reordered with respect to later global/GDS memory
      // operations of the same wave.
      LGKMCnt = IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS waitcnt is not
      // needed as GDS operations for all waves are executed in a
      // total global ordering as observed by all waves. Required if
      // also synchronizing with global/LDS memory as GDS operations
      // could be reordered with respect to later global/LDS memory
      // operations of the same wave.
      EXPCnt = IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

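  // Encode the combined wait: counters being waited on get a count of 0,
  // while the others keep their maximum (no-wait) value, so the S_WAITCNT
  // only blocks on the selected counters.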
  if (VMCnt || LGKMCnt || EXPCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            EXPCnt ? 0 : getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1_VOL));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

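    // For an acquire, the load itself must complete before later accesses
    // can proceed, and stale cache lines must be invalidated after it.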
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
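    // For a release, all earlier memory operations must complete before the
    // store itself is performed.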
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding waitcnt before a S_BARRIER.
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
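      // For the trailing wait, a returning atomic is treated as a load (its
      // result must arrive) and a non-returning one as a store.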
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

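  // Visit every instruction that may be atomic and classify it as exactly
  // one of load, store, fence, or cmpxchg/rmw before expanding it.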
  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}