22#define DEBUG_TYPE "amdgpu-insert-delay-alu"
48 if (
MI.getDesc().TSFlags & VA_VDST_0)
50 if (
MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
51 MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
53 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
60 enum DelayType { VALU, TRANS, SALU, OTHER };
63 static DelayType getDelayType(
uint64_t TSFlags) {
80 static constexpr unsigned VALU_MAX = 5;
84 static constexpr unsigned TRANS_MAX = 4;
88 static constexpr unsigned SALU_CYCLES_MAX = 4;
93 uint8_t VALUCycles = 0;
94 uint8_t VALUNum = VALU_MAX;
99 uint8_t TRANSCycles = 0;
100 uint8_t TRANSNum = TRANS_MAX;
105 uint8_t TRANSNumVALU = VALU_MAX;
109 uint8_t SALUCycles = 0;
111 DelayInfo() =
default;
113 DelayInfo(DelayType
Type,
unsigned Cycles) {
122 TRANSCycles = Cycles;
129 SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
135 return VALUCycles ==
RHS.VALUCycles && VALUNum ==
RHS.VALUNum &&
136 TRANSCycles ==
RHS.TRANSCycles && TRANSNum ==
RHS.TRANSNum &&
137 TRANSNumVALU ==
RHS.TRANSNumVALU && SALUCycles ==
RHS.SALUCycles;
145 VALUCycles = std::max(VALUCycles,
RHS.VALUCycles);
146 VALUNum = std::min(VALUNum,
RHS.VALUNum);
147 TRANSCycles = std::max(TRANSCycles,
RHS.TRANSCycles);
148 TRANSNum = std::min(TRANSNum,
RHS.TRANSNum);
149 TRANSNumVALU = std::min(TRANSNumVALU,
RHS.TRANSNumVALU);
150 SALUCycles = std::max(SALUCycles,
RHS.SALUCycles);
157 bool advance(DelayType
Type,
unsigned Cycles) {
160 VALUNum += (
Type == VALU);
161 if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
167 VALUCycles -= Cycles;
171 TRANSNum += (
Type == TRANS);
172 TRANSNumVALU += (
Type == VALU);
173 if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
176 TRANSNum = TRANS_MAX;
177 TRANSNumVALU = VALU_MAX;
180 TRANSCycles -= Cycles;
184 if (SALUCycles <= Cycles) {
189 SALUCycles -= Cycles;
196#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
199 dbgs() <<
" VALUCycles=" << (int)VALUCycles;
200 if (VALUNum < VALU_MAX)
201 dbgs() <<
" VALUNum=" << (int)VALUNum;
203 dbgs() <<
" TRANSCycles=" << (int)TRANSCycles;
204 if (TRANSNum < TRANS_MAX)
205 dbgs() <<
" TRANSNum=" << (int)TRANSNum;
206 if (TRANSNumVALU < VALU_MAX)
207 dbgs() <<
" TRANSNumVALU=" << (int)TRANSNumVALU;
209 dbgs() <<
" SALUCycles=" << (int)SALUCycles;
215 struct DelayState :
DenseMap<unsigned, DelayInfo> {
219 for (
const auto &KV :
RHS) {
222 std::tie(It, Inserted) = insert(KV);
224 It->second.merge(KV.second);
230 void advance(DelayType
Type,
unsigned Cycles) {
232 for (
auto I = begin(), E = end();
I != E;
I = Next) {
234 if (
I->second.advance(
Type, Cycles))
239#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
242 dbgs() <<
" empty\n";
252 return A->first <
B->first;
272 if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
273 Imm |= 4 + Delay.TRANSNum;
277 if (Delay.VALUNum < DelayInfo::VALU_MAX &&
278 Delay.VALUNum <= Delay.TRANSNumVALU) {
280 Imm |= Delay.VALUNum << 7;
282 Imm |= Delay.VALUNum;
286 if (Delay.SALUCycles) {
287 assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
291 }
else if (Imm & 0xf) {
292 Imm |= (Delay.SALUCycles + 8) << 7;
294 Imm |= Delay.SALUCycles + 8;
304 if (!(Imm & 0x780) && LastDelayAlu) {
309 if (!
I->isBundle() && !
I->isMetaInstruction())
314 unsigned LastImm =
Op.getImm();
315 assert((LastImm & ~0xf) == 0 &&
316 "Remembered an s_delay_alu with no room for another delay!");
317 LastImm |= Imm << 7 | Skip << 4;
323 auto &
MBB = *
MI.getParent();
328 return (Imm & 0x780) ? nullptr : DelayAlu;
334 State.merge(BlockState[Pred]);
340 bool Changed =
false;
346 if (
MI.isBundle() ||
MI.isMetaInstruction())
350 switch (
MI.getOpcode()) {
351 case AMDGPU::SI_RETURN_TO_EPILOG:
355 DelayType
Type = getDelayType(
MI.getDesc().TSFlags);
357 if (instructionWaitsForVALU(
MI)) {
360 State = DelayState();
361 }
else if (
Type != OTHER) {
364 for (
const auto &
Op :
MI.explicit_uses()) {
369 if (
MI.getOpcode() == AMDGPU::V_WRITELANE_B32 &&
Op.isTied())
372 auto It = State.find(Unit);
373 if (It != State.end()) {
374 Delay.merge(It->second);
380 if (Emit && !
MI.isBundledWithPred()) {
383 LastDelayAlu = emitDelayAlu(
MI, Delay, LastDelayAlu);
389 for (
const auto &
Op :
MI.defs()) {
391 &
MI,
Op.getOperandNo(),
nullptr, 0);
404 State.advance(
Type, Cycles);
411 "Basic block state should not have changed on final pass!");
412 }
else if (State != BlockState[&
MBB]) {
413 BlockState[&
MBB] = std::move(State);
427 if (!ST.hasDelayAlu())
430 SII = ST.getInstrInfo();
431 TRI = ST.getRegisterInfo();
433 SchedModel.
init(&ST);
440 while (!WorkList.
empty()) {
442 bool Changed = runOnMachineBasicBlock(
MBB,
false);
451 bool Changed =
false;
453 Changed |= runOnMachineBasicBlock(
MBB,
true);
460char AMDGPUInsertDelayAlu::ID = 0;
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
AMD GCN specific subclass of TargetSubtarget.
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
unsigned const TargetRegisterInfo * TRI
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file implements a set that has insertion order iteration characteristics.
Represent the analysis usage information of a pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
This class represents an Operation in the Expression.
bool skipFunction(const Function &F) const
Optional passes call this function to check whether the pass should be skipped.
succ_iterator succ_begin()
Instructions::iterator instr_iterator
iterator_range< pred_iterator > predecessors()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
A vector that has set insertion semantics.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
value_type pop_back_val()
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const
Compute operand latency based on the available machine model.
The instances of the Type class are immutable: once they are created, they are never changed.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool operator!=(uint64_t V1, const APInt &V2)
Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI)
Create Printable object to print register units on a raw_ostream.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
auto reverse(ContainerTy &&C)
void sort(IteratorTy Start, IteratorTy End)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
char & AMDGPUInsertDelayAluID
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.