22#define DEBUG_TYPE "amdgpu-insert-delay-alu"
26class AMDGPUInsertDelayAlu {
40 if (
MI.getDesc().TSFlags & VA_VDST_0)
42 if (
MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
43 MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
45 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
51 static bool instructionWaitsForSGPRWrites(
const MachineInstr &
MI) {
58 for (
auto &
Op :
MI.operands()) {
67 enum DelayType { VALU, TRANS, SALU, OTHER };
92 static constexpr unsigned VALU_MAX = 5;
96 static constexpr unsigned TRANS_MAX = 4;
100 static constexpr unsigned SALU_CYCLES_MAX = 4;
117 uint8_t TRANSNumVALU = VALU_MAX;
123 DelayInfo() =
default;
125 DelayInfo(DelayType
Type,
unsigned Cycles) {
134 TRANSCycles = Cycles;
141 SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
147 return VALUCycles ==
RHS.VALUCycles && VALUNum ==
RHS.VALUNum &&
148 TRANSCycles ==
RHS.TRANSCycles && TRANSNum ==
RHS.TRANSNum &&
149 TRANSNumVALU ==
RHS.TRANSNumVALU && SALUCycles ==
RHS.SALUCycles;
157 VALUCycles = std::max(VALUCycles,
RHS.VALUCycles);
158 VALUNum = std::min(VALUNum,
RHS.VALUNum);
159 TRANSCycles = std::max(TRANSCycles,
RHS.TRANSCycles);
160 TRANSNum = std::min(TRANSNum,
RHS.TRANSNum);
161 TRANSNumVALU = std::min(TRANSNumVALU,
RHS.TRANSNumVALU);
162 SALUCycles = std::max(SALUCycles,
RHS.SALUCycles);
168 bool advance(DelayType
Type,
unsigned Cycles) {
171 VALUNum += (
Type == VALU);
172 if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
178 VALUCycles -= Cycles;
182 TRANSNum += (
Type == TRANS);
183 TRANSNumVALU += (
Type == VALU);
184 if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
187 TRANSNum = TRANS_MAX;
188 TRANSNumVALU = VALU_MAX;
191 TRANSCycles -= Cycles;
195 if (SALUCycles <= Cycles) {
200 SALUCycles -= Cycles;
207#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
210 dbgs() <<
" VALUCycles=" << (int)VALUCycles;
211 if (VALUNum < VALU_MAX)
212 dbgs() <<
" VALUNum=" << (int)VALUNum;
214 dbgs() <<
" TRANSCycles=" << (int)TRANSCycles;
215 if (TRANSNum < TRANS_MAX)
216 dbgs() <<
" TRANSNum=" << (int)TRANSNum;
217 if (TRANSNumVALU < VALU_MAX)
218 dbgs() <<
" TRANSNumVALU=" << (int)TRANSNumVALU;
220 dbgs() <<
" SALUCycles=" << (int)SALUCycles;
226 struct DelayState :
DenseMap<MCRegUnit, DelayInfo> {
230 for (
const auto &KV :
RHS) {
233 std::tie(It, Inserted) = insert(KV);
235 It->second.merge(KV.second);
241 void advance(DelayType
Type,
unsigned Cycles) {
243 for (
auto I = begin(),
E = end();
I !=
E;
I =
Next) {
245 if (
I->second.advance(
Type, Cycles))
250 void advanceByVALUNum(
unsigned VALUNum) {
252 for (
auto I = begin(),
E = end();
I !=
E;
I =
Next) {
254 if (
I->second.VALUNum >= VALUNum &&
I->second.VALUCycles > 0) {
260#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
263 dbgs() <<
" empty\n";
273 return A->first <
B->first;
293 if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
294 Imm |= 4 + Delay.TRANSNum;
298 if (Delay.VALUNum < DelayInfo::VALU_MAX &&
299 Delay.VALUNum <= Delay.TRANSNumVALU) {
301 Imm |= Delay.VALUNum << 7;
303 Imm |= Delay.VALUNum;
307 if (Delay.SALUCycles) {
308 assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
312 }
else if (Imm & 0xf) {
313 Imm |= (Delay.SALUCycles + 8) << 7;
315 Imm |= Delay.SALUCycles + 8;
325 if (!(Imm & 0x780) && LastDelayAlu) {
330 if (
I->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
337 if (!
I->isBundle() && !
I->isMetaInstruction())
342 unsigned LastImm =
Op.getImm();
343 assert((LastImm & ~0xf) == 0 &&
344 "Remembered an s_delay_alu with no room for another delay!");
345 LastImm |= Imm << 7 | Skip << 4;
351 auto &
MBB = *
MI.getParent();
356 return (Imm & 0x780) ? nullptr : DelayAlu;
361 for (
auto *Pred :
MBB.predecessors())
362 State.merge(BlockState[Pred]);
372 MCRegUnit LastSGPRFromVALU =
static_cast<MCRegUnit
>(0);
375 for (
auto &
MI :
MBB.instrs()) {
376 if (
MI.isBundle() ||
MI.isMetaInstruction())
380 switch (
MI.getOpcode()) {
381 case AMDGPU::SI_RETURN_TO_EPILOG:
385 DelayType
Type = getDelayType(
MI);
387 if (instructionWaitsForSGPRWrites(
MI)) {
388 auto It = State.find(LastSGPRFromVALU);
389 if (It != State.end()) {
390 DelayInfo Info = It->getSecond();
391 State.advanceByVALUNum(Info.VALUNum);
393 LastSGPRFromVALU =
static_cast<MCRegUnit
>(0);
397 if (instructionWaitsForVALU(
MI)) {
400 State = DelayState();
401 }
else if (
Type != OTHER) {
404 for (
const auto &
Op :
MI.explicit_uses()) {
409 if (
MI.getOpcode() == AMDGPU::V_WRITELANE_B32 &&
Op.isTied())
411 for (MCRegUnit Unit :
TRI->regunits(
Op.getReg())) {
412 auto It = State.find(Unit);
413 if (It != State.end()) {
414 Delay.merge(It->second);
422 for (
const auto &
Op :
MI.defs()) {
425 LastSGPRFromVALU = *
TRI->regunits(
Reg).begin();
431 if (Emit && !
MI.isBundledWithPred()) {
434 LastDelayAlu = emitDelayAlu(
MI, Delay, LastDelayAlu);
440 for (
const auto &
Op :
MI.defs()) {
442 &
MI,
Op.getOperandNo(),
nullptr, 0);
443 for (MCRegUnit Unit :
TRI->regunits(
Op.getReg()))
455 State.advance(
Type, Cycles);
462 "Basic block state should not have changed on final pass!");
463 }
else if (DelayState &BS = BlockState[&
MBB]; State != BS) {
464 BS = std::move(State);
475 if (!ST->hasDelayAlu())
478 SII = ST->getInstrInfo();
479 TRI = ST->getRegisterInfo();
487 while (!WorkList.
empty()) {
489 bool Changed = runOnMachineBasicBlock(
MBB,
false);
519 AMDGPUInsertDelayAlu Impl;
528 if (!AMDGPUInsertDelayAlu().
run(MF))
535char AMDGPUInsertDelayAluLegacy::ID = 0;
540 "AMDGPU Insert Delay ALU",
false,
false)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Register const TargetRegisterInfo * TRI
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Interface definition for SIInstrInfo.
This file implements a set that has insertion order iteration characteristics.
Represent the analysis usage information of a pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Represents analyses that only rely on functions' control flow.
Instructions::iterator instr_iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Wrapper class representing virtual and physical registers.
bool isXDLWMMA(const MachineInstr &MI) const
static bool isSALU(const MachineInstr &MI)
const TargetSchedModel & getSchedModel() const
static bool isTRANS(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isVALU(const MachineInstr &MI)
A vector that has set insertion semantics.
void insert_range(Range &&R)
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
value_type pop_back_val()
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const
Compute operand latency based on the available machine model.
The instances of the Type class are immutable: once they are created, they are never changed.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned decodeFieldVaVdst(unsigned Encoded)
bool isSGPR(MCRegister Reg, const MCRegisterInfo *TRI)
Is Reg - scalar register.
bool isDPMACCInstruction(unsigned Opc)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool operator!=(uint64_t V1, const APInt &V2)
LLVM_ABI Printable printRegUnit(MCRegUnit Unit, const TargetRegisterInfo *TRI)
Create Printable object to print register units on a raw_ostream.
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
auto reverse(ContainerTy &&C)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
char & AMDGPUInsertDelayAluID
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &MFAM)