33 "amdgpu-barrier-signal-wait-latency",
34 cl::desc(
"Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT "
35 "to encourage scheduling independent work between them"),
48 IgnoredScopes.
insert(Context.getOrInsertSyncScopeID(
"wavefront"));
49 IgnoredScopes.
insert(Context.getOrInsertSyncScopeID(
"wavefront-one-as"));
50 IgnoredScopes.
insert(Context.getOrInsertSyncScopeID(
"singlethread-one-as"));
53 if (!ST.requiresWaitOnWorkgroupReleaseFence()) {
55 IgnoredScopes.
insert(Context.getOrInsertSyncScopeID(
"workgroup"));
58 void apply(ScheduleDAGInstrs *DAG)
override;
63 SDep ForwardD = PredDep;
66 if (SuccDep == ForwardD) {
78 SDep ForwardD = PredDep;
81 if (SuccDep == ForwardD) {
92 const SIInstrInfo *
TII =
static_cast<const SIInstrInfo *
>(DAG->
TII);
93 constexpr unsigned FenceLatency = 2000;
98 for (SUnit &SU : DAG->
SUnits) {
100 unsigned Op =
MI->getOpcode();
102 if (
Op == AMDGPU::ATOMIC_FENCE) {
110 for (SDep &PredDep : SU.
Preds) {
116 if (!
MI->mayLoad() ||
MI->mayStore())
118 addLatencyToEdge(PredDep, SU, FenceLatency);
120 }
else if (
Op == AMDGPU::S_BARRIER_WAIT) {
121 for (SDep &PredDep : SU.
Preds) {
123 const MachineInstr *PredMI = PredSU->
getInstr();
125 addLatencyToEdge(PredDep, SU, BarrierSignalWaitLatency);
128 }
else if (
TII->isLDSDMA(*
MI)) {
133 }
else if (
Op == AMDGPU::S_WAIT_TENSORCNT ||
134 Op == AMDGPU::S_WAIT_ASYNCCNT) {
135 auto needWaitFor = [&](SmallVectorImpl<SUnit *> &RegionLDSDMA, SUnit *SU,
137 if (RegionLDSDMA.
size() <=
static_cast<uint64_t
>(
Count)) {
142 auto I = RegionLDSDMA.
rbegin(),
E = RegionLDSDMA.
rend();
143 for (;
I !=
E;
I++) {
144 if (Counter >=
Count)
147 if (SU->NodeNum == (*I)->NodeNum)
155 int64_t WaitVal =
MI->getOperand(0).getImm();
156 for (SDep &PredDep : SU.Preds) {
157 if (PredDep.
getKind() != SDep::Kind::Data)
161 Register LDSDMACnt = AMDGPU::TENSORcnt;
163 if (
Op == AMDGPU::S_WAIT_ASYNCCNT) {
164 LDSDMACnt = AMDGPU::ASYNCcnt;
168 if (DepReg != LDSDMACnt)
179 if (!needWaitFor(
Op == AMDGPU::S_WAIT_ASYNCCNT ? RegionAsync
182 setLatencyForEdge(PredDep, SU, 1);
191std::unique_ptr<ScheduleDAGMutation>
193 return std::make_unique<BarrierLatency>(MF);
static cl::opt< unsigned > BarrierSignalWaitLatencyOpt("amdgpu-barrier-signal-wait-latency", cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT " "to encourage scheduling independent work between them"), cl::init(16), cl::Hidden)
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
Promote Memory to Register
Interface definition for SIInstrInfo.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
This is an important class for using LLVM in a threaded context.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Kind getKind() const
Returns an enum value representing the kind of the dependence.
void setLatency(unsigned Lat)
Sets the latency for this edge.
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must elapse between the predecessor and the successor, given that they have this edge between them.
Register getReg() const
Returns the register associated with this edge.
bool isBarrier() const
Tests if this is an Order dependence that is marked as a barrier.
Scheduling unit. This is a node in the scheduling DAG.
SmallVector< SDep, 4 > Succs
All sunit successors.
LLVM_ABI void setDepthDirty()
Sets a flag in this node to indicate that its stored Depth value will require recomputation the next time getDepth() is called.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
A ScheduleDAG for scheduling lists of MachineInstr.
Mutate the DAG as a postpass after normal DAG building.
const TargetInstrInfo * TII
Target instruction information.
std::vector< SUnit > SUnits
The scheduling units.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
void push_back(const T &Elt)
reverse_iterator rbegin()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
void apply(Opt *O, const Mod &M, const Mods &... Ms)
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
std::unique_ptr< ScheduleDAGMutation > createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF)
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
FunctionAddr VTableAddr Count
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
DWARFExpression::Operation Op