//===- AMDGPUWaitSGPRHazards.cpp - Insert waits for SGPR read hazards ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUWaitSGPRHazards.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-wait-sgpr-hazards"
static cl::opt<bool> GlobalEnableSGPRHazardWaits(
    "amdgpu-sgpr-hazard-wait", cl::init(true), cl::Hidden,
    cl::desc("Enable required s_wait_alu on SGPR hazards"));

static cl::opt<bool> GlobalCullSGPRHazardsOnFunctionBoundary(
    "amdgpu-sgpr-hazard-boundary-cull", cl::init(false), cl::Hidden,
    cl::desc("Cull hazards on function boundaries"));

static cl::opt<bool>
    GlobalCullSGPRHazardsAtMemWait("amdgpu-sgpr-hazard-mem-wait-cull",
                                   cl::init(false), cl::Hidden,
                                   cl::desc("Cull hazards on memory waits"));

static cl::opt<unsigned> GlobalCullSGPRHazardsMemWaitThreshold(
    "amdgpu-sgpr-hazard-mem-wait-cull-threshold", cl::init(8), cl::Hidden,
    cl::desc("Number of tracked SGPRs before initiating hazard cull on memory "
             "wait"));
namespace {

class AMDGPUWaitSGPRHazards {
public:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const MachineRegisterInfo *MRI;
  unsigned DsNopCount;

  bool EnableSGPRHazardWaits;
  bool CullSGPRHazardsOnFunctionBoundary;
  bool CullSGPRHazardsAtMemWait;
  unsigned CullSGPRHazardsMemWaitThreshold;

  AMDGPUWaitSGPRHazards() {}
  // Return the numeric ID 0-127 for a given SGPR, or nothing for registers
  // that do not participate in the hazard.
  static std::optional<unsigned> sgprNumber(Register Reg,
                                            const SIRegisterInfo &TRI) {
    switch (Reg) {
    case AMDGPU::M0:
    case AMDGPU::EXEC:
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
      return {};
    default:
      break;
    }
    unsigned RegN = TRI.getHWRegIndex(Reg);
    if (RegN > 127)
      return {};
    return RegN;
  }
  static bool isVCC(Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO ||
           Reg == AMDGPU::VCC_HI;
  }
  // Shift PC-relative global offsets in an S_GETPC_B64 bundle after a new
  // instruction has been inserted into it.
  static void updateGetPCBundle(MachineInstr *NewMI) {
    if (!NewMI->isBundled())
      return;

    // Walk back to the start of the bundle.
    auto I = NewMI->getIterator();
    while (I->isBundledWithPred())
      I--;
    if (I->isBundle())
      I++;

    // Bail if this is not an S_GETPC bundle.
    if (I->getOpcode() != AMDGPU::S_GETPC_B64)
      return;

    // All trailing offsets move by the size of the inserted wait.
    const unsigned NewBytes = 4;
    assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
           "Unexpected instruction insertion in bundle");
    auto NextMI = std::next(NewMI->getIterator());
    auto End = NewMI->getParent()->end();
    while (NextMI != End && NextMI->isBundledWithPred()) {
      for (auto &Operand : NextMI->operands()) {
        if (Operand.isGlobal())
          Operand.setOffset(Operand.getOffset() + NewBytes);
      }
      ++NextMI;
    }
  }
  struct HazardState {
    static constexpr unsigned None = 0;
    static constexpr unsigned SALU = (1 << 0);
    static constexpr unsigned VALU = (1 << 1);

    std::bitset<64> Tracked;      // SGPR banks ever read by VALU
    std::bitset<128> SALUHazards; // SGPRs with uncommitted values from SALU
    std::bitset<128> VALUHazards; // SGPRs with uncommitted values from VALU
    unsigned VCCHazard = None;    // Source of current VCC writes
    bool ActiveFlat = false;      // Has unwaited flat instructions

    bool merge(const HazardState &RHS) {
      HazardState Orig(*this);
      *this |= RHS;
      return (*this != Orig);
    }

    bool operator==(const HazardState &RHS) const {
      return Tracked == RHS.Tracked && SALUHazards == RHS.SALUHazards &&
             VALUHazards == RHS.VALUHazards && VCCHazard == RHS.VCCHazard &&
             ActiveFlat == RHS.ActiveFlat;
    }

    bool operator!=(const HazardState &RHS) const { return !(*this == RHS); }

    void operator|=(const HazardState &RHS) {
      Tracked |= RHS.Tracked;
      SALUHazards |= RHS.SALUHazards;
      VALUHazards |= RHS.VALUHazards;
      VCCHazard |= RHS.VCCHazard;
      ActiveFlat |= RHS.ActiveFlat;
    }
  };
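// Note: HazardState forms a small join-semilattice. operator|= is the join
// and merge() reports whether the join grew the state; since every field only
// ever accumulates (bitset union / flag OR), the per-block dataflow in run()
// below is monotone and its worklist iteration must reach a fixed point.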
  struct BlockHazardState {
    HazardState In;
    HazardState Out;
  };

  DenseMap<const MachineBasicBlock *, BlockHazardState> BlockState;

  static constexpr unsigned WAVE32_NOPS = 4;
  static constexpr unsigned WAVE64_NOPS = 8;
  // Cull tracked hazards by emitting a run of DS_NOP instructions.
  void insertHazardCull(MachineBasicBlock &MBB,
                        MachineBasicBlock::instr_iterator MI) {
    assert(!MI->isBundled());
    unsigned Count = DsNopCount;
    while (Count--)
      BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::DS_NOP));
  }
  // Merge two DEPCTR wait masks field-by-field, keeping the stricter
  // (smaller) count for each field.
  unsigned mergeMasks(unsigned Mask1, unsigned Mask2) {
    unsigned Mask = 0xffff;
    Mask = AMDGPU::DepCtr::encodeFieldSaSdst(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1),
                       AMDGPU::DepCtr::decodeFieldSaSdst(Mask2)));
    // ... the VaSdst, VaVcc, VaSsrc, HoldCnt, VmVsrc and VaVdst fields are
    // merged in the same way.
    return Mask;
  }
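// A worked example of the merge semantics (values illustrative): each DEPCTR
// field counts how many outstanding operations may remain, so 0 is the
// strictest wait. Merging sa_sdst(0) with sa_sdst(1) via std::min yields
// sa_sdst(0), i.e. the combined mask waits at least as long as either input.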
  // If the previous instruction is already an S_WAITCNT_DEPCTR, fold the new
  // mask into it instead of emitting a second wait.
  bool mergeConsecutiveWaitAlus(MachineBasicBlock::instr_iterator MI,
                                unsigned Mask) {
    auto MBB = MI->getParent();
    auto It = MI->getIterator();
    if (It == MBB->instr_begin())
      return false;
    It = prev_nodbg(It, MBB->instr_begin());
    if (It->getOpcode() != AMDGPU::S_WAITCNT_DEPCTR)
      return false;
    It->getOperand(0).setImm(mergeMasks(Mask, It->getOperand(0).getImm()));
    return true;
  }
  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
    enum { WA_VALU = 0x1, WA_SALU = 0x2, WA_VCC = 0x4 };

    HazardState State = BlockState[&MBB].In;
    SmallSet<Register, 8> SeenRegs;
    bool Emitted = false;
    unsigned DsNops = 0;

    for (MachineBasicBlock::instr_iterator MI = MBB.instr_begin(),
                                           E = MBB.instr_end();
         MI != E; ++MI) {
      // Meta instructions carry no hazard information.
      if (MI->isMetaInstruction())
        continue;

      // Clear tracked SGPRs if a sufficient run of DS_NOPs occurs.
      if (MI->getOpcode() == AMDGPU::DS_NOP) {
        if (++DsNops >= DsNopCount)
          State.Tracked.reset();
        continue;
      }
      DsNops = 0;
      // Snoop FLAT instructions to avoid adding culls before scratch/LDS
      // loads; culls could be disproportionate in cost to load time.
      if (SIInstrInfo::isFLAT(*MI) && !SIInstrInfo::isFLATGlobal(*MI))
        State.ActiveFlat = true;

      // SMEM or VMEM clears the hazard state.
      if ((SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI)) ||
          SIInstrInfo::isSMRD(*MI)) {
        State.VCCHazard = HazardState::None;
        State.SALUHazards.reset();
        State.VALUHazards.reset();
        continue;
      }
      // An existing S_WAITCNT_DEPCTR clears the hazards its fields cover.
      if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
        unsigned int Mask = MI->getOperand(0).getImm();
        if (AMDGPU::DepCtr::decodeFieldVaVcc(Mask) == 0)
          State.VCCHazard &= ~HazardState::VALU;
        if (AMDGPU::DepCtr::decodeFieldSaSdst(Mask) == 0) {
          State.SALUHazards.reset();
          State.VCCHazard &= ~HazardState::SALU;
        }
        if (AMDGPU::DepCtr::decodeFieldVaSdst(Mask) == 0)
          State.VALUHazards.reset();
        continue;
      }
      // Snoop zero-count memory waits to insert culls: once the wait has
      // retired, enough tracked SGPRs have settled that culling pays off.
      if (CullSGPRHazardsAtMemWait &&
          (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT ||
           MI->getOpcode() == AMDGPU::S_WAIT_SAMPLECNT ||
           MI->getOpcode() == AMDGPU::S_WAIT_BVHCNT) &&
          (MI->getOperand(0).isImm() && MI->getOperand(0).getImm() == 0) &&
          (State.Tracked.count() >= CullSGPRHazardsMemWaitThreshold)) {
        if (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT && State.ActiveFlat) {
          // With flat instructions outstanding, only retire the flat marker.
          State.ActiveFlat = false;
        } else {
          State.Tracked.reset();
          if (Emit)
            insertHazardCull(MBB, MI);
          continue;
        }
      }
      // Only VALU and SALU instructions can create or trip these hazards.
      bool IsVALU = SIInstrInfo::isVALU(*MI);
      bool IsSALU = SIInstrInfo::isSALU(*MI);
      if (!IsVALU && !IsSALU)
        continue;

      unsigned Wait = 0;
      auto processOperand = [&](const MachineOperand &Op, bool IsUse) {
        if (!Op.isReg())
          return;
        Register Reg = Op.getReg();
        if (!TRI->isSGPRReg(*MRI, Reg))
          return;

        // Only visit each register once per instruction.
        if (!SeenRegs.insert(Reg).second)
          return;

        auto RegNumber = sgprNumber(Reg, *TRI);
        if (!RegNumber)
          return;

        // SGPRs are tracked in aligned pairs.
        unsigned RegN = *RegNumber;
        unsigned PairN = (RegN >> 1) & 0x3f;

        // A pair never read by a VALU has no hazard; a VALU read starts
        // tracking it.
        if (!State.Tracked[PairN]) {
          if (IsVALU && IsUse)
            State.Tracked.set(PairN);
          return;
        }
        unsigned SGPRCount =
            AMDGPU::getRegBitWidth(*TRI->getRegClassForReg(*MRI, Reg)) / 32;

        if (IsUse) {
          // An SALU read clears pending VALU-write hazards for the read
          // register.
          if (IsSALU) {
            if (isVCC(Reg)) {
              if (State.VCCHazard & HazardState::VALU)
                State.VCCHazard = HazardState::None;
            } else {
              State.VALUHazards.reset();
            }
          }
          // Compute the waits required before this read.
          for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
            Wait |= State.SALUHazards[RegN + RegIdx] ? WA_SALU : 0;
            Wait |= IsVALU && State.VALUHazards[RegN + RegIdx] ? WA_VALU : 0;
          }
          if (isVCC(Reg) && State.VCCHazard) {
            if (State.VCCHazard & HazardState::SALU)
              Wait |= WA_SALU;
            if (State.VCCHazard & HazardState::VALU)
              Wait |= WA_VCC;
          }
        } else {
          // Record a new uncommitted write.
          if (isVCC(Reg)) {
            State.VCCHazard = IsSALU ? HazardState::SALU : HazardState::VALU;
          } else {
            for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
              if (IsSALU)
                State.SALUHazards.set(RegN + RegIdx);
              else
                State.VALUHazards.set(RegN + RegIdx);
            }
          }
        }
      };
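      // Illustrative walk-through of the scheme above: a VALU read of s5 sets
      // Tracked[2] (the s[4:5] pair, assuming pair-granular hardware
      // tracking); a later SALU write of s5 sets SALUHazards[5]; any
      // subsequent read of s5 then adds WA_SALU to Wait, which is lowered to
      // an s_wait_alu mask below.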
      const bool IsSetPC =
          (MI->isCall() || MI->isReturn() || MI->isIndirectBranch()) &&
          MI->getOpcode() != AMDGPU::S_ENDPGM &&
          MI->getOpcode() != AMDGPU::S_ENDPGM_SAVED;

      // Only consider implicit VCC operands declared by the instruction
      // descriptor.
      const bool HasImplicitVCC =
          any_of(MI->getDesc().implicit_uses(),
                 [](MCPhysReg Reg) { return isVCC(Reg); }) ||
          any_of(MI->getDesc().implicit_defs(),
                 [](MCPhysReg Reg) { return isVCC(Reg); });
      if (IsSetPC) {
        // Flush all hazards before a call, return, or indirect branch, since
        // the code at the new PC cannot see this function's hazard state.
        if (State.VCCHazard & HazardState::VALU)
          Wait |= WA_VCC;
        if (State.SALUHazards.any() || (State.VCCHazard & HazardState::SALU))
          Wait |= WA_SALU;
        if (State.VALUHazards.any())
          Wait |= WA_VALU;
        if (CullSGPRHazardsOnFunctionBoundary && State.Tracked.any()) {
          State.Tracked.reset();
          if (Emit)
            insertHazardCull(MBB, MI);
        }
      } else {
        // Process uses to determine the waits required before this
        // instruction.
        SeenRegs.clear();
        for (const MachineOperand &Op : MI->all_uses()) {
          if (Op.isImplicit() &&
              (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
            continue;
          processOperand(Op, true);
        }
      }
      // Apply any required wait, clearing the hazards it resolves.
      if (Wait) {
        unsigned Mask = 0xffff;
        if (Wait & WA_VCC) {
          State.VCCHazard &= ~HazardState::VALU;
          Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0);
        }
        if (Wait & WA_SALU) {
          State.SALUHazards.reset();
          State.VCCHazard &= ~HazardState::SALU;
          Mask = AMDGPU::DepCtr::encodeFieldSaSdst(Mask, 0);
        }
        if (Wait & WA_VALU) {
          State.VALUHazards.reset();
          Mask = AMDGPU::DepCtr::encodeFieldVaSdst(Mask, 0);
        }
        if (Emit) {
          if (!mergeConsecutiveWaitAlus(MI, Mask)) {
            auto NewMI = BuildMI(MBB, MI, MI->getDebugLoc(),
                                 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
                             .addImm(Mask);
            updateGetPCBundle(NewMI);
          }
          Emitted = true;
        }
      }
      // On return from a call the SGPR state is unknown: if hazards are not
      // culled at function boundaries, every SGPR must be treated as tracked.
      if (MI->isCall() && !CullSGPRHazardsOnFunctionBoundary)
        State.Tracked.set();
      // Process defs to update the hazard state.
      SeenRegs.clear();
      for (const MachineOperand &Op : MI->all_defs()) {
        if (Op.isImplicit() &&
            (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
          continue;
        processOperand(Op, false);
      }
    }
    BlockHazardState &BS = BlockState[&MBB];
    bool Changed = State != BS.Out;
    if (Emit) {
      assert(!Changed && "Hazard state should not change on emit pass");
      return Emitted;
    }
    if (Changed)
      BS.Out = State;
    return Changed;
  }
  bool run(MachineFunction &MF) {
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasVALUReadSGPRHazard())
      return false;

    // Parse settings, allowing function attributes to override the
    // command-line defaults.
    EnableSGPRHazardWaits = GlobalEnableSGPRHazardWaits;
    CullSGPRHazardsOnFunctionBoundary = GlobalCullSGPRHazardsOnFunctionBoundary;
    CullSGPRHazardsAtMemWait = GlobalCullSGPRHazardsAtMemWait;
    CullSGPRHazardsMemWaitThreshold = GlobalCullSGPRHazardsMemWaitThreshold;

    const Function &F = MF.getFunction();
    if (!GlobalEnableSGPRHazardWaits.getNumOccurrences())
      EnableSGPRHazardWaits = F.getFnAttributeAsParsedInteger(
          "amdgpu-sgpr-hazard-wait", EnableSGPRHazardWaits);
    if (!GlobalCullSGPRHazardsOnFunctionBoundary.getNumOccurrences())
      CullSGPRHazardsOnFunctionBoundary =
          F.hasFnAttribute("amdgpu-sgpr-hazard-boundary-cull");
    if (!GlobalCullSGPRHazardsAtMemWait.getNumOccurrences())
      CullSGPRHazardsAtMemWait =
          F.hasFnAttribute("amdgpu-sgpr-hazard-mem-wait-cull");
    if (!GlobalCullSGPRHazardsMemWaitThreshold.getNumOccurrences())
      CullSGPRHazardsMemWaitThreshold = F.getFnAttributeAsParsedInteger(
          "amdgpu-sgpr-hazard-mem-wait-cull-threshold",
          CullSGPRHazardsMemWaitThreshold);

    if (!EnableSGPRHazardWaits)
      return false;

    TII = ST.getInstrInfo();
    TRI = ST.getRegisterInfo();
    MRI = &MF.getRegInfo();
    DsNopCount = ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS;

    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()) &&
        !CullSGPRHazardsOnFunctionBoundary) {
      // A called function cannot see its caller's hazard state, so it must
      // assume every SGPR pair is already tracked.
      MachineBasicBlock &EntryBlock = MF.front();
      BlockState[&EntryBlock].In.Tracked.set();
    }
    // Iterate the hazard dataflow to a fixed point; termination is guaranteed
    // because merge() only ever grows the hazard state.
    SetVector<MachineBasicBlock *> Worklist;
    for (auto &MBB : reverse(MF))
      Worklist.insert(&MBB);
    while (!Worklist.empty()) {
      auto &MBB = *Worklist.pop_back_val();
      bool Changed = runOnMachineBasicBlock(MBB, false);
      if (Changed) {
        // Take a copy of the state here in case BlockState is reallocated.
        HazardState NewState = BlockState[&MBB].Out;
        // Propagate to all successor blocks.
        for (auto Succ : MBB.successors()) {
          // Full merges are only needed at CFG merge points.
          auto &SuccState = BlockState[Succ];
          if (Succ->getSinglePredecessor() && !Succ->isEntryBlock()) {
            if (SuccState.In != NewState) {
              SuccState.In = NewState;
              Worklist.insert(Succ);
            }
          } else if (SuccState.In.merge(NewState)) {
            Worklist.insert(Succ);
          }
        }
      }
    }

    // Second pass: replay the analysis and emit the waits.
    bool Changed = false;
    for (auto &MBB : MF)
      Changed |= runOnMachineBasicBlock(MBB, true);

    BlockState.clear();
    return Changed;
  }
};
class AMDGPUWaitSGPRHazardsLegacy : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUWaitSGPRHazardsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    return AMDGPUWaitSGPRHazards().run(MF);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // namespace
char AMDGPUWaitSGPRHazardsLegacy::ID = 0;

char &llvm::AMDGPUWaitSGPRHazardsLegacyID = AMDGPUWaitSGPRHazardsLegacy::ID;

INITIALIZE_PASS(AMDGPUWaitSGPRHazardsLegacy, DEBUG_TYPE,
                "AMDGPU Insert waits for SGPR read hazards", false, false)
PreservedAnalyses
AMDGPUWaitSGPRHazardsPass::run(MachineFunction &MF,
                               MachineFunctionAnalysisManager &MFAM) {
  if (AMDGPUWaitSGPRHazards().run(MF))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}