docs/doxygen/AMDGPUBarrierLatency_8cpp_source.html

//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

/// \file This file contains a DAG scheduling mutation to add latency to:

///       1. Barrier edges between ATOMIC_FENCE instructions and preceding

///          memory accesses potentially affected by the fence.

///          This encourages the scheduling of more instructions before

///          ATOMIC_FENCE instructions.  ATOMIC_FENCE instructions may

///          introduce wait counting or indicate an impending S_BARRIER

///          wait.  Having more instructions in-flight across these

///          constructs improves latency hiding.

///       2. Barrier edges from S_BARRIER_SIGNAL to S_BARRIER_WAIT.

///          This encourages independent work to be scheduled between

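///          signal and wait, hiding barrier synchronization latency.
///       3. Data edges on TENSORcnt/ASYNCcnt from LDSDMA instructions to
///          S_WAIT_TENSORCNT/S_WAIT_ASYNCCNT. When the wait's immediate
///          already allows the producing instruction to remain outstanding,
///          the edge latency is set to 1.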

//

//===----------------------------------------------------------------------===//


#include "AMDGPUBarrierLatency.h"

#include "GCNSubtarget.h"

#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "SIInstrInfo.h"

#include "llvm/CodeGen/ScheduleDAGInstrs.h"

#include "llvm/Support/CommandLine.h"


using namespace llvm;


/// Hidden command-line option (default 16) holding the latency that
/// BarrierLatency::apply adds to dependence edges running from a
/// barrier-start instruction to an S_BARRIER_WAIT.
static cl::opt<unsigned> BarrierSignalWaitLatencyOpt(

    "amdgpu-barrier-signal-wait-latency",

    cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT "

             "to encourage scheduling independent work between them"),

    cl::init(16), cl::Hidden);


namespace {


/// Scheduling DAG mutation that adjusts the latency of dependence edges
/// around ATOMIC_FENCE, S_BARRIER_WAIT and S_WAIT_TENSORCNT/S_WAIT_ASYNCCNT
/// instructions (see apply()).
class BarrierLatency : public ScheduleDAGMutation {
private:
  /// Synchronization scopes whose ATOMIC_FENCE instructions are skipped by
  /// apply(); populated once in the constructor.
  SmallSet<SyncScope::ID, 4> IgnoredScopes;

public:
  /// Collects the sync scopes to ignore for the function \p MF.
  ///
  /// The single-thread and wavefront scopes (and their "-one-as" variants)
  /// are always ignored. The "workgroup" scope is additionally ignored when
  /// the subtarget reports that a workgroup release fence does not require a
  /// wait.
  ///
  /// The constructor is explicit so that a MachineFunction pointer is never
  /// implicitly converted into a mutation object.
  explicit BarrierLatency(MachineFunction *MF) {
    LLVMContext &Context = MF->getFunction().getContext();
    IgnoredScopes.insert(SyncScope::SingleThread);
    IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront"));
    IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as"));
    IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as"));

    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const bool TgSplit =
        ST.hasTgSplitSupport() && AMDGPU::isTgSplitEnabled(MF->getFunction());
    if (!ST.requiresWaitOnWorkgroupReleaseFence(TgSplit)) {
      // Prior to GFX10 workgroup scope does not normally require waitcnts
      IgnoredScopes.insert(Context.getOrInsertSyncScopeID("workgroup"));
    }
  }

  /// Walks every scheduling unit of \p DAG and updates edge latencies.
  void apply(ScheduleDAGInstrs *DAG) override;
};


/// Raises the latency of the dependence \p PredDep by \p Latency.
///
/// \p PredDep is expected to be an entry of \p SU's predecessor list. The
/// matching entry in the predecessor's successor list (if one is found) is
/// raised by the same amount so both directions of the edge stay in sync.
void addLatencyToEdge(SDep &PredDep, SUnit &SU, unsigned Latency) {
  SUnit *const Producer = PredDep.getSUnit();

  // Build the successor-side image of this edge *before* touching PredDep,
  // so the comparison below still sees the unmodified edge.
  SDep Mirror = PredDep;
  Mirror.setSUnit(&SU);

  // Only the first matching successor entry is updated.
  for (SDep &Candidate : Producer->Succs) {
    if (!(Candidate == Mirror))
      continue;
    const unsigned Raised = Candidate.getLatency() + Latency;
    Candidate.setLatency(Raised);
    break;
  }

  const unsigned Raised = PredDep.getLatency() + Latency;
  PredDep.setLatency(Raised);

  // NOTE(review): both endpoints are marked depth-dirty; confirm whether the
  // predecessor should be marked height-dirty instead.
  Producer->setDepthDirty();
  SU.setDepthDirty();
}


/// Overwrites the latency of the dependence \p PredDep with \p Latency.
///
/// \p PredDep is expected to be an entry of \p SU's predecessor list. The
/// matching entry in the predecessor's successor list (if one is found)
/// receives the same latency so both directions of the edge stay in sync.
void setLatencyForEdge(SDep &PredDep, SUnit &SU, unsigned Latency) {
  SUnit *const Producer = PredDep.getSUnit();

  // Build the successor-side image of this edge *before* touching PredDep,
  // so the comparison below still sees the unmodified edge.
  SDep Mirror = PredDep;
  Mirror.setSUnit(&SU);

  // Only the first matching successor entry is updated.
  for (SDep &Candidate : Producer->Succs) {
    if (!(Candidate == Mirror))
      continue;
    Candidate.setLatency(Latency);
    break;
  }

  PredDep.setLatency(Latency);

  // NOTE(review): both endpoints are marked depth-dirty; confirm whether the
  // predecessor should be marked height-dirty instead.
  Producer->setDepthDirty();
  SU.setDepthDirty();
}


/// Adjusts dependence-edge latencies in \p DAG.
///
/// Scheduling units are visited in the order of DAG->SUnits and handled by
/// opcode:
///  - ATOMIC_FENCE (unless its sync scope is in IgnoredScopes): every barrier
///    edge from a predecessor that may load but not store gets extra latency.
///  - S_BARRIER_WAIT: every edge from a barrier-start predecessor gets
///    BarrierSignalWaitLatencyOpt extra latency.
///  - LDSDMA instructions: recorded per counter (TENSOR_CNT / ASYNC_CNT) so
///    later waits can reason about how many are outstanding.
///  - S_WAIT_TENSORCNT / S_WAIT_ASYNCCNT: a data edge on the counter register
///    from a recorded-kind producer has its latency set to 1 when the wait's
///    immediate does not require waiting for that producer.
void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII);
  // NOTE(review): FenceLatency is only used when getSchedModel() returns
  // null; otherwise the load's computed instruction latency is added.
  // Confirm that the fallback is actually reachable.
  constexpr unsigned FenceLatency = 2000;
  const unsigned BarrierSignalWaitLatency = BarrierSignalWaitLatencyOpt;
  const TargetSchedModel *SchedModel = DAG->getSchedModel();

  // LDSDMA units seen so far, in visitation order, split by the counter
  // flag they carry.
  SmallVector<SUnit *, 8> RegionTDM;
  SmallVector<SUnit *, 8> RegionAsync;

  // Returns true when a wait with immediate Count has to wait for Target.
  // Returns false when at most Count units have been recorded, or when
  // Target is among the Count most recently recorded units. A negative
  // Count converts to a huge unsigned value and therefore yields false.
  auto NeedWaitFor = [](const SmallVectorImpl<SUnit *> &Recorded,
                        const SUnit *Target, int64_t Count) {
    if (Recorded.size() <= static_cast<uint64_t>(Count))
      return false;

    int64_t Newer = 0;
    for (auto I = Recorded.rbegin(), E = Recorded.rend(); I != E; ++I) {
      if (Newer >= Count)
        return true;
      if (Target->NodeNum == (*I)->NodeNum)
        return false;
      ++Newer;
    }
    // Recorded holds more than Count entries, so the loop always returns.
    llvm_unreachable("Malformed RegionLDSDMA");
  };

  for (SUnit &SU : DAG->SUnits) {
    const MachineInstr *MI = SU.getInstr();
    const unsigned Op = MI->getOpcode();

    if (Op == AMDGPU::ATOMIC_FENCE) {
      // Update latency on barrier edges of ATOMIC_FENCE.
      // Ignore scopes not expected to have any latency.
      const SyncScope::ID SSID =
          static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
      if (IgnoredScopes.contains(SSID))
        continue;

      for (SDep &PredDep : SU.Preds) {
        if (!PredDep.isBarrier())
          continue;
        const MachineInstr *PredMI = PredDep.getSUnit()->getInstr();
        // Only consider memory loads
        if (!PredMI->mayLoad() || PredMI->mayStore())
          continue;

        addLatencyToEdge(PredDep, SU,
                         SchedModel
                             ? SchedModel->computeInstrLatency(PredMI, false)
                             : FenceLatency);
      }
      continue;
    }

    if (Op == AMDGPU::S_BARRIER_WAIT) {
      for (SDep &PredDep : SU.Preds) {
        const MachineInstr *PredMI = PredDep.getSUnit()->getInstr();
        if (TII->isBarrierStart(PredMI->getOpcode()))
          addLatencyToEdge(PredDep, SU, BarrierSignalWaitLatency);
      }
      continue;
    }

    if (TII->isLDSDMA(*MI)) {
      // TENSOR_CNT takes priority when both flags are present.
      const uint64_t Flags = MI->getDesc().TSFlags;
      if (Flags & SIInstrFlags::TENSOR_CNT)
        RegionTDM.push_back(&SU);
      else if (Flags & SIInstrFlags::ASYNC_CNT)
        RegionAsync.push_back(&SU);
      continue;
    }

    if (Op != AMDGPU::S_WAIT_TENSORCNT && Op != AMDGPU::S_WAIT_ASYNCCNT)
      continue;

    // Everything that depends only on which counter this wait targets is
    // resolved once per wait rather than once per predecessor edge.
    const bool IsAsync = Op == AMDGPU::S_WAIT_ASYNCCNT;
    Register CounterReg = AMDGPU::TENSORcnt;
    uint64_t CounterFlag = SIInstrFlags::TENSOR_CNT;
    if (IsAsync) {
      CounterReg = AMDGPU::ASYNCcnt;
      CounterFlag = SIInstrFlags::ASYNC_CNT;
    }
    const SmallVectorImpl<SUnit *> &Recorded =
        IsAsync ? RegionAsync : RegionTDM;
    const int64_t WaitVal = MI->getOperand(0).getImm();

    for (SDep &PredDep : SU.Preds) {
      // getReg() is only queried after the edge is known to be a data edge.
      if (PredDep.getKind() != SDep::Kind::Data)
        continue;
      if (PredDep.getReg() != CounterReg)
        continue;

      SUnit *PredSU = PredDep.getSUnit();

      // The data dep can be carried by a non-LDSDMA SU
      // (e.g. an intervening COPY or pseudo). Such predecessors are not
      // tracked, so NeedWaitFor cannot reason about them.
      if (!(PredSU->getInstr()->getDesc().TSFlags & CounterFlag))
        continue;

      if (!NeedWaitFor(Recorded, PredSU, WaitVal))
        setLatencyForEdge(PredDep, SU, 1);
    }
  }
}


} // end namespace


/// Factory for the barrier-latency scheduling DAG mutation.
///
/// \param MF machine function forwarded to the BarrierLatency constructor.
/// \returns an owning pointer to a newly created BarrierLatency mutation.
std::unique_ptr<ScheduleDAGMutation>


llvm::createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF) {

  return std::make_unique<BarrierLatency>(MF);

}


BarrierSignalWaitLatencyOpt
static cl::opt< unsigned > BarrierSignalWaitLatencyOpt("amdgpu-barrier-signal-wait-latency", cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT " "to encourage scheduling independent work between them"), cl::init(16), cl::Hidden)

AMDGPUBarrierLatency.h

AMDGPUMCTargetDesc.h
Provides AMDGPU specific target descriptions.

E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

CommandLine.h

GCNSubtarget.h
AMD GCN specific subclass of TargetSubtarget.

TII
const HexagonInstrInfo * TII
Definition HexagonCopyToCombine.cpp:118

MI
IRTranslator LLVM IR MI
Definition IRTranslator.cpp:110

I
#define I(x, y, z)
Definition MD5.cpp:57

Register
Promote Memory to Register
Definition Mem2Reg.cpp:110

SIInstrInfo.h
Interface definition for SIInstrInfo.

ScheduleDAGInstrs.h

llvm::Function::getContext
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:353

llvm::GCNSubtarget
Definition GCNSubtarget.h:45

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68

llvm::MCInstrDesc::TSFlags
uint64_t TSFlags
Definition MCInstrDesc.h:216

llvm::MachineFunction
Definition MachineFunction.h:295

llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition MachineFunction.h:789

llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition MachineFunction.h:750

llvm::MachineInstr::getOpcode
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition MachineInstr.h:602

llvm::MachineInstr::getDesc
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition MachineInstr.h:599

llvm::SDep
Scheduling dependency.
Definition ScheduleDAG.h:52

llvm::SDep::getSUnit
SUnit * getSUnit() const
Definition ScheduleDAG.h:510

llvm::SDep::getKind
Kind getKind() const
Returns an enum value representing the kind of the dependence.
Definition ScheduleDAG.h:516

llvm::SDep::setLatency
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition ScheduleDAG.h:148

llvm::SDep::getLatency
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition ScheduleDAG.h:143

llvm::SDep::setSUnit
void setSUnit(SUnit *SU)
Definition ScheduleDAG.h:513

llvm::SDep::getReg
Register getReg() const
Returns the register associated with this edge.
Definition ScheduleDAG.h:217

llvm::SDep::isBarrier
bool isBarrier() const
Tests if this is an Order dependence that is marked as a barrier.
Definition ScheduleDAG.h:175

llvm::SUnit
Scheduling unit. This is a node in the scheduling DAG.
Definition ScheduleDAG.h:250

llvm::SUnit::Succs
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition ScheduleDAG.h:271

llvm::SUnit::setDepthDirty
LLVM_ABI void setDepthDirty()
Sets a flag in this node to indicate that its stored Depth value will require recomputation the next ...
Definition ScheduleDAG.cpp:217

llvm::SUnit::Preds
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition ScheduleDAG.h:270

llvm::SUnit::getInstr
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition ScheduleDAG.h:400

llvm::ScheduleDAGInstrs
A ScheduleDAG for scheduling lists of MachineInstr.
Definition ScheduleDAGInstrs.h:118

llvm::ScheduleDAGInstrs::getSchedModel
const TargetSchedModel * getSchedModel() const
Gets the machine model for instruction scheduling.
Definition ScheduleDAGInstrs.h:274

llvm::ScheduleDAGMutation
Mutate the DAG as a postpass after normal DAG building.
Definition ScheduleDAGMutation.h:24

llvm::ScheduleDAG::TII
const TargetInstrInfo * TII
Target instruction information.
Definition ScheduleDAG.h:587

llvm::ScheduleDAG::SUnits
std::vector< SUnit > SUnits
The scheduling units.
Definition ScheduleDAG.h:591

llvm::SmallSet
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134

llvm::SmallSet::contains
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:229

llvm::SmallSet::insert
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:423

llvm::SmallVectorTemplateCommon::size
size_t size() const
Definition SmallVector.h:83

llvm::SmallVectorTemplateCommon::rbegin
reverse_iterator rbegin()
Definition SmallVector.h:282

llvm::SmallVectorTemplateCommon::rend
reverse_iterator rend()
Definition SmallVector.h:284

llvm::cl::opt
Definition CommandLine.h:1472

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition ErrorHandling.h:164

llvm::AMDGPU::isTgSplitEnabled
bool isTgSplitEnabled(const Function &F)
Definition AMDGPUBaseInfo.h:1038

llvm::SIInstrFlags::TENSOR_CNT
@ TENSOR_CNT
Definition SIDefines.h:117

llvm::SIInstrFlags::ASYNC_CNT
@ ASYNC_CNT
Definition SIDefines.h:122

llvm::SyncScope::SingleThread
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55

llvm::SyncScope::ID
uint8_t ID
Definition LLVMContext.h:47

llvm::cl::Hidden
@ Hidden
Definition CommandLine.h:139

llvm::cl::apply
void apply(Opt *O, const Mod &M, const Mods &... Ms)
Definition CommandLine.h:1358

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition CommandLine.h:440

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition FunctionInfo.h:25

llvm::createAMDGPUBarrierLatencyDAGMutation
std::unique_ptr< ScheduleDAGMutation > createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF)
Definition AMDGPUBarrierLatency.cpp:198

llvm::Latency
@ Latency
Definition SIMachineScheduler.h:34

llvm::getImm
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
Definition SPIRVUtils.cpp:1158

llvm::SmallVector
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
Definition SmallVector.h:1162

llvm::Count
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Count
Definition InstrProf.h:145

llvm::Op
DWARFExpression::Operation Op
Definition DWARFExpressionPrinter.cpp:25

llvm::cl::desc
Definition CommandLine.h:406