24#define DEBUG_TYPE "amdgpu-wait-sgpr-hazards"
28 cl::desc(
"Enable required s_wait_alu on SGPR hazards"));
32 cl::desc(
"Cull hazards on function boundaries"));
37 cl::desc(
"Cull hazards on memory waits"));
41 cl::desc(
"Number of tracked SGPRs before initiating hazard cull on memory "
46class AMDGPUWaitSGPRHazards {
54 bool EnableSGPRHazardWaits;
55 bool CullSGPRHazardsOnFunctionBoundary;
56 bool CullSGPRHazardsAtMemWait;
57 unsigned CullSGPRHazardsMemWaitThreshold;
59 AMDGPUWaitSGPRHazards() =
default;
62 static std::optional<unsigned> sgprNumber(
Register Reg,
69 case AMDGPU::SGPR_NULL:
70 case AMDGPU::SGPR_NULL64:
75 unsigned RegN = TRI.getHWRegIndex(
Reg);
82 return Reg == AMDGPU::VCC ||
Reg == AMDGPU::VCC_LO ||
Reg == AMDGPU::VCC_HI;
93 while (
I->isBundledWithPred())
99 if (
I->getOpcode() != AMDGPU::S_GETPC_B64)
103 const unsigned NewBytes = 4;
105 "Unexpected instruction insertion in bundle");
108 while (NextMI != End && NextMI->isBundledWithPred()) {
109 for (
auto &Operand : NextMI->operands()) {
110 if (Operand.isGlobal())
111 Operand.setOffset(Operand.getOffset() + NewBytes);
118 static constexpr unsigned None = 0;
119 static constexpr unsigned SALU = (1 << 0);
120 static constexpr unsigned VALU = (1 << 1);
122 std::bitset<64> Tracked;
123 std::bitset<128> SALUHazards;
124 std::bitset<128> VALUHazards;
125 unsigned VCCHazard = None;
126 bool ActiveFlat =
false;
128 bool merge(
const HazardState &
RHS) {
129 HazardState Orig(*
this);
131 return (*
this != Orig);
135 return Tracked ==
RHS.Tracked && SALUHazards ==
RHS.SALUHazards &&
136 VALUHazards ==
RHS.VALUHazards && VCCHazard ==
RHS.VCCHazard &&
137 ActiveFlat ==
RHS.ActiveFlat;
143 Tracked |=
RHS.Tracked;
144 SALUHazards |=
RHS.SALUHazards;
145 VALUHazards |=
RHS.VALUHazards;
146 VCCHazard |=
RHS.VCCHazard;
147 ActiveFlat |=
RHS.ActiveFlat;
151 struct BlockHazardState {
156 DenseMap<const MachineBasicBlock *, BlockHazardState> BlockState;
158 static constexpr unsigned WAVE32_NOPS = 4;
159 static constexpr unsigned WAVE64_NOPS = 8;
161 void insertHazardCull(MachineBasicBlock &
MBB,
164 unsigned Count = DsNopCount;
169 unsigned mergeMasks(
unsigned Mask1,
unsigned Mask2) {
200 auto MBB =
MI->getParent();
205 if (It->getOpcode() != AMDGPU::S_WAITCNT_DEPCTR)
208 It->getOperand(0).setImm(mergeMasks(Mask, It->getOperand(0).getImm()));
212 bool runOnMachineBasicBlock(MachineBasicBlock &
MBB,
bool Emit) {
213 enum { WA_VALU = 0x1, WA_SALU = 0x2, WA_VCC = 0x4 };
215 HazardState State = BlockState[&
MBB].In;
216 SmallSet<Register, 8> SeenRegs;
223 if (
MI->isMetaInstruction())
227 if (
MI->getOpcode() == AMDGPU::DS_NOP) {
228 if (++DsNops >= DsNopCount)
229 State.Tracked.reset();
237 State.ActiveFlat =
true;
243 State.VCCHazard = HazardState::None;
244 State.SALUHazards.reset();
245 State.VALUHazards.reset();
250 if (
MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
251 unsigned int Mask =
MI->getOperand(0).getImm();
253 State.VCCHazard &= ~HazardState::VALU;
255 State.SALUHazards.reset();
256 State.VCCHazard &= ~HazardState::SALU;
259 State.VALUHazards.reset();
264 if (CullSGPRHazardsAtMemWait &&
265 (
MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT ||
266 MI->getOpcode() == AMDGPU::S_WAIT_SAMPLECNT ||
267 MI->getOpcode() == AMDGPU::S_WAIT_BVHCNT) &&
268 (
MI->getOperand(0).isImm() &&
MI->getOperand(0).getImm() == 0) &&
269 (State.Tracked.count() >= CullSGPRHazardsMemWaitThreshold)) {
270 if (
MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT && State.ActiveFlat) {
271 State.ActiveFlat =
false;
273 State.Tracked.reset();
275 insertHazardCull(
MBB,
MI);
283 if (!IsVALU && !IsSALU)
288 auto processOperand = [&](
const MachineOperand &
Op,
bool IsUse) {
293 if (!TRI->isSGPRReg(*MRI,
Reg))
300 auto RegNumber = sgprNumber(
Reg, *TRI);
306 unsigned RegN = *RegNumber;
307 unsigned PairN = (RegN >> 1) & 0x3f;
311 if (!State.Tracked[PairN]) {
313 State.Tracked.set(PairN);
324 if (State.VCCHazard & HazardState::VALU)
325 State.VCCHazard = HazardState::None;
327 State.VALUHazards.reset();
331 for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
332 Wait |= State.SALUHazards[RegN + RegIdx] ? WA_SALU : 0;
333 Wait |= IsVALU && State.VALUHazards[RegN + RegIdx] ? WA_VALU : 0;
335 if (isVCC(
Reg) && State.VCCHazard) {
338 if (State.VCCHazard & HazardState::SALU)
340 if (State.VCCHazard & HazardState::VALU)
346 State.VCCHazard = IsSALU ? HazardState::SALU : HazardState::VALU;
348 for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
350 State.SALUHazards.set(RegN + RegIdx);
352 State.VALUHazards.set(RegN + RegIdx);
359 (
MI->isCall() ||
MI->isReturn() ||
MI->isIndirectBranch()) &&
360 MI->getOpcode() != AMDGPU::S_ENDPGM &&
361 MI->getOpcode() != AMDGPU::S_ENDPGM_SAVED;
364 const bool HasImplicitVCC =
371 if (State.VCCHazard & HazardState::VALU)
373 if (State.SALUHazards.any() || (State.VCCHazard & HazardState::SALU))
375 if (State.VALUHazards.any())
377 if (CullSGPRHazardsOnFunctionBoundary && State.Tracked.any()) {
378 State.Tracked.reset();
380 insertHazardCull(
MBB,
MI);
385 for (
const MachineOperand &
Op :
MI->all_uses()) {
386 if (
Op.isImplicit() &&
387 (!HasImplicitVCC || !
Op.isReg() || !isVCC(
Op.getReg())))
389 processOperand(
Op,
true);
397 State.VCCHazard &= ~HazardState::VALU;
400 if (
Wait & WA_SALU) {
401 State.SALUHazards.reset();
402 State.VCCHazard &= ~HazardState::SALU;
405 if (
Wait & WA_VALU) {
406 State.VALUHazards.reset();
410 if (!mergeConsecutiveWaitAlus(
MI, Mask)) {
412 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
421 if (
MI->isCall() && !CullSGPRHazardsOnFunctionBoundary)
426 for (
const MachineOperand &
Op :
MI->all_defs()) {
427 if (
Op.isImplicit() &&
428 (!HasImplicitVCC || !
Op.isReg() || !isVCC(
Op.getReg())))
430 processOperand(
Op,
false);
434 BlockHazardState &BS = BlockState[&
MBB];
435 bool Changed = State != BS.Out;
437 assert(!
Changed &&
"Hazard state should not change on emit pass");
445 bool run(MachineFunction &MF) {
447 if (!ST->hasVALUReadSGPRHazard())
458 "amdgpu-sgpr-hazard-wait", EnableSGPRHazardWaits);
460 CullSGPRHazardsOnFunctionBoundary =
463 CullSGPRHazardsAtMemWait =
466 CullSGPRHazardsMemWaitThreshold =
468 "amdgpu-sgpr-hazard-mem-wait-cull-threshold",
469 CullSGPRHazardsMemWaitThreshold);
472 if (!EnableSGPRHazardWaits)
475 TII = ST->getInstrInfo();
476 TRI = ST->getRegisterInfo();
478 DsNopCount = ST->isWave64() ? WAVE64_NOPS : WAVE32_NOPS;
482 !CullSGPRHazardsOnFunctionBoundary) {
485 MachineBasicBlock &EntryBlock = MF.
front();
486 BlockState[&EntryBlock].In.Tracked.set();
497 SetVector<MachineBasicBlock *> Worklist;
500 while (!Worklist.
empty()) {
502 bool Changed = runOnMachineBasicBlock(
MBB,
false);
505 HazardState NewState = BlockState[&
MBB].Out;
509 auto &SuccState = BlockState[Succ];
510 if (Succ->getSinglePredecessor() && !Succ->isEntryBlock()) {
511 if (SuccState.In != NewState) {
512 SuccState.In = NewState;
515 }
else if (SuccState.In.merge(NewState)) {
538 AMDGPUWaitSGPRHazardsLegacy() : MachineFunctionPass(ID) {}
540 bool runOnMachineFunction(MachineFunction &MF)
override {
541 return AMDGPUWaitSGPRHazards().run(MF);
544 void getAnalysisUsage(AnalysisUsage &AU)
const override {
552char AMDGPUWaitSGPRHazardsLegacy::ID = 0;
557 "AMDGPU Insert waits for SGPR read hazards",
false,
false)
562 if (AMDGPUWaitSGPRHazards().run(MF))
unsigned const MachineRegisterInfo * MRI
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
Provides AMDGPU specific target descriptions.
static cl::opt< bool > GlobalCullSGPRHazardsAtMemWait("amdgpu-sgpr-hazard-mem-wait-cull", cl::init(false), cl::Hidden, cl::desc("Cull hazards on memory waits"))
static cl::opt< unsigned > GlobalCullSGPRHazardsMemWaitThreshold("amdgpu-sgpr-hazard-mem-wait-cull-threshold", cl::init(8), cl::Hidden, cl::desc("Number of tracked SGPRs before initiating hazard cull on memory " "wait"))
static cl::opt< bool > GlobalCullSGPRHazardsOnFunctionBoundary("amdgpu-sgpr-hazard-boundary-cull", cl::init(false), cl::Hidden, cl::desc("Cull hazards on function boundaries"))
static cl::opt< bool > GlobalEnableSGPRHazardWaits("amdgpu-sgpr-hazard-wait", cl::init(true), cl::Hidden, cl::desc("Enable required s_wait_alu on SGPR hazards"))
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static void updateGetPCBundle(MachineInstr *NewMI)
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Interface definition for SIInstrInfo.
This file implements a set that has insertion order iteration characteristics.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass iff it does not:
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
instr_iterator instr_begin()
Instructions::iterator instr_iterator
instr_iterator instr_end()
iterator_range< succ_iterator > successors()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of passes that operate on the MachineFunction representation.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool isBundled() const
Return true if this instruction part of a bundle.
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Wrapper class representing virtual and physical registers.
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
value_type pop_back_val()
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
int getNumOccurrences() const
self_iterator getIterator()
unsigned decodeFieldVaVcc(unsigned Encoded)
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc)
unsigned decodeFieldHoldCnt(unsigned Encoded, const IsaVersion &Version)
unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt, const IsaVersion &Version)
unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc)
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaSsrc(unsigned Encoded)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
@ Emitted
Assigned address, still materializing.
This is an optimization pass for GlobalISel generic memory operations.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool operator!=(uint64_t V1, const APInt &V2)
char & AMDGPUWaitSGPRHazardsLegacyID
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
FunctionAddr VTableAddr uintptr_t uintptr_t Version
auto reverse(ContainerTy &&C)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
FunctionAddr VTableAddr Count
DWARFExpression::Operation Op
bool operator|=(SparseBitVector< ElementSize > &LHS, const SparseBitVector< ElementSize > *RHS)
IterT prev_nodbg(IterT It, IterT Begin, bool SkipPseudoOp=true)
Decrement It, then continue decrementing it while it points to a debug instruction.