23#define DEBUG_TYPE "amdgpu-insert-delay-alu"
27class AMDGPUInsertDelayAlu {
41 if (
MI.getDesc().TSFlags & VA_VDST_0)
43 if (
MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
44 MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
46 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
52 static bool instructionWaitsForSGPRWrites(
const MachineInstr &
MI) {
59 for (
auto &
Op :
MI.operands()) {
68 enum DelayType { VALU, TRANS, SALU, OTHER };
93 static constexpr unsigned VALU_MAX = 5;
97 static constexpr unsigned TRANS_MAX = 4;
101 static constexpr unsigned SALU_CYCLES_MAX = 4;
118 uint8_t TRANSNumVALU = VALU_MAX;
124 DelayInfo() =
default;
126 DelayInfo(DelayType
Type,
unsigned Cycles) {
135 TRANSCycles = Cycles;
142 SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
148 return VALUCycles ==
RHS.VALUCycles && VALUNum ==
RHS.VALUNum &&
149 TRANSCycles ==
RHS.TRANSCycles && TRANSNum ==
RHS.TRANSNum &&
150 TRANSNumVALU ==
RHS.TRANSNumVALU && SALUCycles ==
RHS.SALUCycles;
158 VALUCycles = std::max(VALUCycles,
RHS.VALUCycles);
159 VALUNum = std::min(VALUNum,
RHS.VALUNum);
160 TRANSCycles = std::max(TRANSCycles,
RHS.TRANSCycles);
161 TRANSNum = std::min(TRANSNum,
RHS.TRANSNum);
162 TRANSNumVALU = std::min(TRANSNumVALU,
RHS.TRANSNumVALU);
163 SALUCycles = std::max(SALUCycles,
RHS.SALUCycles);
169 bool advance(DelayType
Type,
unsigned Cycles) {
172 VALUNum += (
Type == VALU);
173 if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
179 VALUCycles -= Cycles;
183 TRANSNum += (
Type == TRANS);
184 TRANSNumVALU += (
Type == VALU);
185 if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
188 TRANSNum = TRANS_MAX;
189 TRANSNumVALU = VALU_MAX;
192 TRANSCycles -= Cycles;
196 if (SALUCycles <= Cycles) {
201 SALUCycles -= Cycles;
208#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
211 dbgs() <<
" VALUCycles=" << (int)VALUCycles;
212 if (VALUNum < VALU_MAX)
213 dbgs() <<
" VALUNum=" << (int)VALUNum;
215 dbgs() <<
" TRANSCycles=" << (int)TRANSCycles;
216 if (TRANSNum < TRANS_MAX)
217 dbgs() <<
" TRANSNum=" << (int)TRANSNum;
218 if (TRANSNumVALU < VALU_MAX)
219 dbgs() <<
" TRANSNumVALU=" << (int)TRANSNumVALU;
221 dbgs() <<
" SALUCycles=" << (int)SALUCycles;
227 struct DelayState :
DenseMap<MCRegUnit, DelayInfo> {
231 for (
const auto &KV :
RHS) {
234 std::tie(It, Inserted) = insert(KV);
236 It->second.merge(KV.second);
242 void advance(DelayType
Type,
unsigned Cycles) {
246 void advanceByVALUNum(
unsigned VALUNum) {
248 return P.second.VALUNum >= VALUNum &&
P.second.VALUCycles > 0;
252#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
255 dbgs() <<
" empty\n";
265 return A->first <
B->first;
285 if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
286 Imm |= 4 + Delay.TRANSNum;
290 if (Delay.VALUNum < DelayInfo::VALU_MAX &&
291 Delay.VALUNum <= Delay.TRANSNumVALU) {
293 Imm |= Delay.VALUNum << 7;
295 Imm |= Delay.VALUNum;
299 if (Delay.SALUCycles) {
300 assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
304 }
else if (Imm & 0xf) {
305 Imm |= (Delay.SALUCycles + 8) << 7;
307 Imm |= Delay.SALUCycles + 8;
317 if (!(Imm & 0x780) && LastDelayAlu) {
322 if (
I->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
329 if (!
I->isBundle() && !
I->isMetaInstruction())
334 unsigned LastImm =
Op.getImm();
335 assert((LastImm & ~0xf) == 0 &&
336 "Remembered an s_delay_alu with no room for another delay!");
337 LastImm |= Imm << 7 | Skip << 4;
343 auto &
MBB = *
MI.getParent();
348 return (Imm & 0x780) ? nullptr : DelayAlu;
353 for (
auto *Pred :
MBB.predecessors())
354 State.merge(BlockState[Pred]);
364 MCRegUnit LastSGPRFromVALU =
static_cast<MCRegUnit
>(0);
367 for (
auto &
MI :
MBB.instrs()) {
368 if (
MI.isBundle() ||
MI.isMetaInstruction())
372 switch (
MI.getOpcode()) {
373 case AMDGPU::SI_RETURN_TO_EPILOG:
377 DelayType
Type = getDelayType(
MI);
379 if (instructionWaitsForSGPRWrites(
MI)) {
380 auto It = State.find(LastSGPRFromVALU);
381 if (It != State.end()) {
382 DelayInfo Info = It->getSecond();
383 State.advanceByVALUNum(Info.VALUNum);
385 LastSGPRFromVALU =
static_cast<MCRegUnit
>(0);
389 if (instructionWaitsForVALU(
MI)) {
392 State = DelayState();
393 }
else if (
Type != OTHER) {
396 for (
const auto &
Op :
MI.explicit_uses()) {
401 if (
MI.getOpcode() == AMDGPU::V_WRITELANE_B32 &&
Op.isTied())
403 for (MCRegUnit Unit :
TRI->regunits(
Op.getReg())) {
404 auto It = State.find(Unit);
405 if (It != State.end()) {
406 Delay.merge(It->second);
414 for (
const auto &
Op :
MI.defs()) {
417 LastSGPRFromVALU = *
TRI->regunits(
Reg).begin();
423 if (Emit && !
MI.isBundledWithPred()) {
426 LastDelayAlu = emitDelayAlu(
MI, Delay, LastDelayAlu);
432 for (
const auto &
Op :
MI.defs()) {
434 &
MI,
Op.getOperandNo(),
nullptr, 0);
435 for (MCRegUnit Unit :
TRI->regunits(
Op.getReg()))
447 State.advance(
Type, Cycles);
454 "Basic block state should not have changed on final pass!");
455 }
else if (DelayState &BS = BlockState[&
MBB]; State != BS) {
456 BS = std::move(State);
467 if (!ST->hasDelayAlu())
475 SII = ST->getInstrInfo();
476 TRI = ST->getRegisterInfo();
484 while (!WorkList.
empty()) {
486 bool Changed = runOnMachineBasicBlock(
MBB,
false);
516 AMDGPUInsertDelayAlu Impl;
525 if (!AMDGPUInsertDelayAlu().
run(MF))
532char AMDGPUInsertDelayAluLegacy::ID = 0;
537 "AMDGPU Insert Delay ALU",
false,
false)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Register const TargetRegisterInfo * TRI
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Interface definition for SIInstrInfo.
This file implements a set that has insertion order iteration characteristics.
Represent the analysis usage information of a pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Represents analyses that only rely on functions' control flow.
Instructions::iterator instr_iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Wrapper class representing virtual and physical registers.
bool isXDLWMMA(const MachineInstr &MI) const
static bool isSALU(const MachineInstr &MI)
const TargetSchedModel & getSchedModel() const
static bool isTRANS(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isVALU(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
unsigned getMaxWavesPerEU() const
A vector that has set insertion semantics.
void insert_range(Range &&R)
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
value_type pop_back_val()
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const
Compute operand latency based on the available machine model.
The instances of the Type class are immutable: once they are created, they are never changed.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned decodeFieldVaVdst(unsigned Encoded)
bool isSGPR(MCRegister Reg, const MCRegisterInfo *TRI)
Is Reg - scalar register.
bool isDPMACCInstruction(unsigned Opc)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool operator!=(uint64_t V1, const APInt &V2)
LLVM_ABI Printable printRegUnit(MCRegUnit Unit, const TargetRegisterInfo *TRI)
Create Printable object to print register units on a raw_ostream.
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
auto reverse(ContainerTy &&C)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
char & AMDGPUInsertDelayAluID
auto remove_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly.
DWARFExpression::Operation Op
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &MFAM)