LLVM 23.0.0git
AMDGPUBarrierLatency.cpp
Go to the documentation of this file.
1//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This file contains a DAG scheduling mutation to add latency to:
10/// 1. Barrier edges between ATOMIC_FENCE instructions and preceding
11/// memory accesses potentially affected by the fence.
12/// This encourages the scheduling of more instructions before
13/// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may
14/// introduce wait counting or indicate an impending S_BARRIER
15/// wait. Having more instructions in-flight across these
16/// constructs improves latency hiding.
17/// 2. Barrier edges from S_BARRIER_SIGNAL to S_BARRIER_WAIT.
18/// This encourages independent work to be scheduled between
19/// signal and wait, hiding barrier synchronization latency.
20//
21//===----------------------------------------------------------------------===//
22
24#include "GCNSubtarget.h"
26#include "SIInstrInfo.h"
29
30using namespace llvm;
31
33 "amdgpu-barrier-signal-wait-latency",
34 cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT "
35 "to encourage scheduling independent work between them"),
36 cl::init(16), cl::Hidden);
37
38namespace {
39
40class BarrierLatency : public ScheduleDAGMutation {
41private:
42 SmallSet<SyncScope::ID, 4> IgnoredScopes;
43
44public:
45 BarrierLatency(MachineFunction *MF) {
46 LLVMContext &Context = MF->getFunction().getContext();
47 IgnoredScopes.insert(SyncScope::SingleThread);
48 IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront"));
49 IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as"));
50 IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as"));
51
52 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
53 if (!ST.requiresWaitOnWorkgroupReleaseFence()) {
54 // Prior to GFX10 workgroup scope does not normally require waitcnts
55 IgnoredScopes.insert(Context.getOrInsertSyncScopeID("workgroup"));
56 }
57 }
58 void apply(ScheduleDAGInstrs *DAG) override;
59};
60
61void addLatencyToEdge(SDep &PredDep, SUnit &SU, unsigned Latency) {
62 SUnit *PredSU = PredDep.getSUnit();
63 SDep ForwardD = PredDep;
64 ForwardD.setSUnit(&SU);
65 for (SDep &SuccDep : PredSU->Succs) {
66 if (SuccDep == ForwardD) {
67 SuccDep.setLatency(SuccDep.getLatency() + Latency);
68 break;
69 }
70 }
71 PredDep.setLatency(PredDep.getLatency() + Latency);
72 PredSU->setDepthDirty();
73 SU.setDepthDirty();
74}
75
76void setLatencyForEdge(SDep &PredDep, SUnit &SU, unsigned Latency) {
77 SUnit *PredSU = PredDep.getSUnit();
78 SDep ForwardD = PredDep;
79 ForwardD.setSUnit(&SU);
80 for (SDep &SuccDep : PredSU->Succs) {
81 if (SuccDep == ForwardD) {
82 SuccDep.setLatency(Latency);
83 break;
84 }
85 }
86 PredDep.setLatency(Latency);
87 PredSU->setDepthDirty();
88 SU.setDepthDirty();
89}
90
91void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
92 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII);
93 constexpr unsigned FenceLatency = 2000;
94 const unsigned BarrierSignalWaitLatency = BarrierSignalWaitLatencyOpt;
96 SmallVector<SUnit *, 8> RegionAsync;
97
98 for (SUnit &SU : DAG->SUnits) {
99 const MachineInstr *MI = SU.getInstr();
100 unsigned Op = MI->getOpcode();
101
102 if (Op == AMDGPU::ATOMIC_FENCE) {
103 // Update latency on barrier edges of ATOMIC_FENCE.
104 // Ignore scopes not expected to have any latency.
105 SyncScope::ID SSID =
106 static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
107 if (IgnoredScopes.contains(SSID))
108 continue;
109
110 for (SDep &PredDep : SU.Preds) {
111 if (!PredDep.isBarrier())
112 continue;
113 SUnit *PredSU = PredDep.getSUnit();
114 MachineInstr *MI = PredSU->getInstr();
115 // Only consider memory loads
116 if (!MI->mayLoad() || MI->mayStore())
117 continue;
118 addLatencyToEdge(PredDep, SU, FenceLatency);
119 }
120 } else if (Op == AMDGPU::S_BARRIER_WAIT) {
121 for (SDep &PredDep : SU.Preds) {
122 SUnit *PredSU = PredDep.getSUnit();
123 const MachineInstr *PredMI = PredSU->getInstr();
124 if (TII->isBarrierStart(PredMI->getOpcode())) {
125 addLatencyToEdge(PredDep, SU, BarrierSignalWaitLatency);
126 }
127 }
128 } else if (TII->isLDSDMA(*MI)) {
129 if (MI->getDesc().TSFlags & SIInstrFlags::TENSOR_CNT)
130 RegionTDM.push_back(&SU);
131 else if (MI->getDesc().TSFlags & SIInstrFlags::ASYNC_CNT)
132 RegionAsync.push_back(&SU);
133 } else if (Op == AMDGPU::S_WAIT_TENSORCNT ||
134 Op == AMDGPU::S_WAIT_ASYNCCNT) {
135 auto needWaitFor = [&](SmallVectorImpl<SUnit *> &RegionLDSDMA, SUnit *SU,
136 int64_t Count) {
137 if (RegionLDSDMA.size() <= static_cast<uint64_t>(Count)) {
138 return false;
139 }
140
141 int64_t Counter = 0;
142 auto I = RegionLDSDMA.rbegin(), E = RegionLDSDMA.rend();
143 for (; I != E; I++) {
144 if (Counter >= Count)
145 return true;
146
147 if (SU->NodeNum == (*I)->NodeNum)
148 return false;
149
150 ++Counter;
151 }
152 llvm_unreachable("Malformed RegionLDSDMA");
153 };
154
155 int64_t WaitVal = MI->getOperand(0).getImm();
156 for (SDep &PredDep : SU.Preds) {
157 if (PredDep.getKind() != SDep::Kind::Data)
158 continue;
159
160 Register DepReg = PredDep.getReg();
161 Register LDSDMACnt = AMDGPU::TENSORcnt;
162 uint64_t LDSDMAFlags = SIInstrFlags::TENSOR_CNT;
163 if (Op == AMDGPU::S_WAIT_ASYNCCNT) {
164 LDSDMACnt = AMDGPU::ASYNCcnt;
165 LDSDMAFlags = SIInstrFlags::ASYNC_CNT;
166 }
167
168 if (DepReg != LDSDMACnt)
169 continue;
170
171 SUnit *PredSU = PredDep.getSUnit();
172
173 // The data dep can be carried by a non-LDSDMA SU
174 // (e.g. an intervening COPY or pseudo). Such predecessors are not
175 // tracked, so needWaitFor cannot reason about them.
176 if (!(PredSU->getInstr()->getDesc().TSFlags & LDSDMAFlags))
177 continue;
178
179 if (!needWaitFor(Op == AMDGPU::S_WAIT_ASYNCCNT ? RegionAsync
180 : RegionTDM,
181 PredSU, WaitVal)) {
182 setLatencyForEdge(PredDep, SU, 1);
183 }
184 }
185 }
186 }
187}
188
189} // end namespace
190
191std::unique_ptr<ScheduleDAGMutation>
193 return std::make_unique<BarrierLatency>(MF);
194}
static cl::opt< unsigned > BarrierSignalWaitLatencyOpt("amdgpu-barrier-signal-wait-latency", cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT " "to encourage scheduling independent work between them"), cl::init(16), cl::Hidden)
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:57
Promote Memory to Register
Definition Mem2Reg.cpp:110
Interface definition for SIInstrInfo.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:354
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Scheduling dependency.
Definition ScheduleDAG.h:51
SUnit * getSUnit() const
Kind getKind() const
Returns an enum value representing the kind of the dependence.
void setLatency(unsigned Lat)
Sets the latency for this edge.
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
void setSUnit(SUnit *SU)
Register getReg() const
Returns the register associated with this edge.
bool isBarrier() const
Tests if this is an Order dependence that is marked as a barrier.
Scheduling unit. This is a node in the scheduling DAG.
SmallVector< SDep, 4 > Succs
All sunit successors.
LLVM_ABI void setDepthDirty()
Sets a flag in this node to indicate that its stored Depth value will require recomputation the next ...
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
A ScheduleDAG for scheduling lists of MachineInstr.
Mutate the DAG as a postpass after normal DAG building.
const TargetInstrInfo * TII
Target instruction information.
std::vector< SUnit > SUnits
The scheduling units.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:229
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
void push_back(const T &Elt)
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
void apply(Opt *O, const Mod &M, const Mods &... Ms)
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
std::unique_ptr< ScheduleDAGMutation > createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF)
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
DWARFExpression::Operation Op