#define DEBUG_TYPE "amdgpu-insert-delay-alu"
// Return true if MI itself waits for all outstanding VALU results before it
// issues, so no explicit s_delay_alu is needed in front of it.
static bool instructionWaitsForVALU(const MachineInstr &MI) {
  if (MI.getDesc().TSFlags & VA_VDST_0)
    return true;
  if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
      MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
    return true;
  if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
      (MI.getOperand(0).getImm() & 0xf000) == 0)
    return true;
  return false;
}
// The kinds of delay an instruction can create for its consumers.
enum DelayType { VALU, TRANS, SALU, OTHER };
// One more than the maximum number of VALU and TRANS instructions an
// s_delay_alu instruction can encode a dependency on.
static const unsigned VALU_MAX = 5;
static const unsigned TRANS_MAX = 4;

// Outstanding cycles and instruction distance for a (non-TRANS) VALU write.
uint8_t VALUCycles = 0;
uint8_t VALUNum = VALU_MAX;

// Outstanding cycles, TRANS distance and VALU distance for a TRANS write.
uint8_t TRANSCycles = 0;
uint8_t TRANSNum = TRANS_MAX;
uint8_t TRANSNumVALU = VALU_MAX;

// Outstanding cycles for an SALU write.
uint8_t SALUCycles = 0;
DelayInfo() = default;

// Construct the delay info for a value just written by an instruction of the
// given type, with the given latency in cycles. Only the TRANS case of the
// constructor body appears in this excerpt.
DelayInfo(DelayType Type, unsigned Cycles) {
  // ...
  TRANSCycles = Cycles;
  // ...
}
bool operator==(const DelayInfo &RHS) const {
  return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
         TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
         TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
}
// Merge in another DelayInfo, keeping the worst case of each field: the
// maximum outstanding cycle counts and the minimum instruction distances.
void merge(const DelayInfo &RHS) {
  // ...
  TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
  // ...
  TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
  // ...
}
// Update this info after issuing one instruction of the given type, which
// takes Cycles to issue. Return true once nothing useful remains.
bool advance(DelayType Type, unsigned Cycles) {
  bool Erase = true;
  VALUNum += (Type == VALU);
  if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
    VALUNum = VALU_MAX;
    VALUCycles = 0;
  } else {
    VALUCycles -= Cycles;
    Erase = false;
  }
  TRANSNum += (Type == TRANS);
  TRANSNumVALU += (Type == VALU);
  if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
    TRANSNum = TRANS_MAX;
    TRANSNumVALU = VALU_MAX;
    TRANSCycles = 0;
  } else {
    TRANSCycles -= Cycles;
    Erase = false;
  }
  if (SALUCycles <= Cycles) {
    SALUCycles = 0;
  } else {
    SALUCycles -= Cycles;
    Erase = false;
  }
  return Erase;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump() const {
    if (VALUCycles)
      dbgs() << " VALUCycles=" << (int)VALUCycles;
    if (VALUNum < VALU_MAX)
      dbgs() << " VALUNum=" << (int)VALUNum;
    if (TRANSCycles)
      dbgs() << " TRANSCycles=" << (int)TRANSCycles;
    if (TRANSNum < TRANS_MAX)
      dbgs() << " TRANSNum=" << (int)TRANSNum;
    if (TRANSNumVALU < VALU_MAX)
      dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
    if (SALUCycles)
      dbgs() << " SALUCycles=" << (int)SALUCycles;
  }
#endif
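As a standalone illustration of the bookkeeping above (not part of the pass), the sketch below tracks only the VALU fields of DelayInfo and shows when a dependency stops being worth encoding: either the producer's latency has elapsed or more than VALU_MAX - 1 newer VALU instructions have issued. The 6-cycle latency and the one-cycle issue rate are illustrative assumptions, not values taken from the pass.

// Toy model of DelayInfo's VALU fields; numbers are assumed for illustration.
#include <cstdio>

int main() {
  const unsigned VALU_MAX = 5; // same sentinel value as in the pass
  unsigned VALUNum = 0;        // the value was just written by a VALU
  unsigned VALUCycles = 6;     // assumed latency of the producer

  for (int Issued = 1; Issued <= 6; ++Issued) {
    ++VALUNum; // another non-TRANS VALU issues, assumed to take one cycle
    if (VALUNum >= VALU_MAX || VALUCycles <= 1) {
      std::printf("after %d instructions: dependency dropped\n", Issued);
      break;
    }
    VALUCycles -= 1;
    std::printf("after %d instructions: VALUNum=%u VALUCycles=%u\n", Issued,
                VALUNum, VALUCycles);
  }
  return 0;
}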
// Maps each register unit to the delay info for the value last written to it.
struct DelayState : DenseMap<unsigned, DelayInfo> {
  // Merge in the state reaching us along another path through the CFG.
  void merge(const DelayState &RHS) {
    for (const auto &KV : RHS) {
      iterator It;
      bool Inserted;
      std::tie(It, Inserted) = insert(KV);
      if (!Inserted)
        It->second.merge(KV.second);
    }
  }
  // Advance the tracked delay for every register unit, dropping entries that
  // no longer carry any useful information.
  void advance(DelayType Type, unsigned Cycles) {
    for (auto I = begin(), E = end(); I != E;) {
      auto Next = std::next(I);
      if (I->second.advance(Type, Cycles))
        erase(I);
      I = Next;
    }
  }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump(const TargetRegisterInfo *TRI) const {
    if (empty()) {
      dbgs() << " empty\n";
      return;
    }
    // Print the delay info for each register unit in numerical order.
    SmallVector<const_iterator, 8> Order;
    Order.reserve(size());
    for (const_iterator I = begin(), E = end(); I != E; ++I)
      Order.push_back(I);
    llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
      return A->first < B->first;
    });
    for (const_iterator I : Order) {
      dbgs() << " " << printRegUnit(I->first, TRI);
      I->second.dump();
      dbgs() << "\n";
    }
  }
#endif
MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
                           MachineInstr *LastDelayAlu) {
  unsigned Imm = 0;
  // Wait for a TRANS instruction.
  if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
    Imm |= 4 + Delay.TRANSNum;
  // Wait for a VALU instruction, if it is more recent than any TRANS we wait for.
  if (Delay.VALUNum < DelayInfo::VALU_MAX &&
      Delay.VALUNum <= Delay.TRANSNumVALU) {
    if (Imm & 0xf)
      Imm |= Delay.VALUNum << 7;
    else
      Imm |= Delay.VALUNum;
  }
  // Wait for an SALU instruction, unless both delay slots are already taken.
  if (Delay.SALUCycles) {
    if (Imm & 0x780) {
      // No room left in the encoding; drop the SALU delay.
    } else if (Imm & 0xf) {
      Imm |= (Delay.SALUCycles + 8) << 7;
    } else {
      Imm |= Delay.SALUCycles + 8;
    }
  }
  // If we only need to encode one delay, try folding it into the free second
  // slot of the last s_delay_alu we emitted, using the skip field to reach MI.
  if (!(Imm & 0x780) && LastDelayAlu) {
    // Count the real (non-bundle, non-meta) instructions since LastDelayAlu.
    unsigned Skip = 0;
    for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
              E = MachineBasicBlock::instr_iterator(MI);
         ++I != E;) {
      if (!I->isBundle() && !I->isMetaInstruction())
        ++Skip;
    }
    // ... if the skip distance still fits in the encoding ...
    MachineOperand &Op = LastDelayAlu->getOperand(0);
    unsigned LastImm = Op.getImm();
    assert((LastImm & ~0xf) == 0 &&
           "Remembered an s_delay_alu with no room for another delay!");
    LastImm |= Imm << 7 | Skip << 4;
    Op.setImm(LastImm);
    return nullptr;
  }

  auto &MBB = *MI.getParent();
  MachineInstr *DelayAlu =
      BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
  // Remember this s_delay_alu only if its second delay slot is still free.
  return (Imm & 0x780) ? nullptr : DelayAlu;
}
bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
  // Start from the merged delay state of all predecessors.
  DelayState State;
  for (auto *Pred : MBB.predecessors())
    State.merge(BlockState[Pred]);

  bool Changed = false;
  MachineInstr *LastDelayAlu = nullptr;
  for (auto &MI : MBB.instrs()) {
    if (MI.isBundle() || MI.isMetaInstruction())
      continue;
    // Skip pseudo-instructions that do not expand to any real code.
    switch (MI.getOpcode()) {
    case AMDGPU::SI_RETURN_TO_EPILOG:
      continue;
    }
    DelayType Type = getDelayType(MI.getDesc().TSFlags);

    if (instructionWaitsForVALU(MI)) {
      // This instruction already waits for all outstanding VALU results, so
      // every tracked delay is satisfied once it has issued.
      State = DelayState();
    } else if (Type != OTHER) {
      DelayInfo Delay;
      for (const auto &Op : MI.explicit_uses()) {
        if (Op.isReg()) {
          // A writelane's tied source is also its vdst, so it never needs an
          // extra delay.
          if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
            continue;
          for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
            auto It = State.find(*UI);
            if (It != State.end())
              Delay.merge(It->second);
          }
        }
      }
      if (Emit && !MI.isBundledWithPred()) {
        // Never insert an s_delay_alu into the middle of a bundle.
        LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
      }
    }
    if (Type != OTHER) {
      // Record a fresh delay for every register unit this instruction defines,
      // using the scheduling model's latency for the defining operand.
      for (const auto &Op : MI.defs()) {
        unsigned Latency = SchedModel.computeOperandLatency(
            &MI, MI.getOperandNo(&Op), nullptr, 0);
        for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
          State[*UI] = DelayInfo(Type, Latency);
      }
    }
    // Advance by however many cycles this instruction takes to issue.
    unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
    State.advance(Type, Cycles);
  }
  if (Emit) {
    assert(State == BlockState[&MBB] &&
           "Basic block state should not have changed on final pass!");
  } else if (State != BlockState[&MBB]) {
    BlockState[&MBB] = std::move(State);
    Changed = true;
  }
  return Changed;
}
bool runOnMachineFunction(MachineFunction &MF) override {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.hasDelayAlu())
    return false;

  SII = ST.getInstrInfo();
  TRI = ST.getRegisterInfo();
  SchedModel.init(&ST);

  // Iterate the per-block delay state to a fixed point before emitting
  // anything.
  SetVector<MachineBasicBlock *> WorkList;
  for (auto &MBB : reverse(MF))
    WorkList.insert(&MBB);
  while (!WorkList.empty()) {
    auto &MBB = *WorkList.pop_back_val();
    bool Changed = runOnMachineBasicBlock(MBB, false);
    if (Changed)
      WorkList.insert(MBB.succ_begin(), MBB.succ_end());
  }
  // With the block states at a fixed point, make one final pass that actually
  // inserts the s_delay_alu instructions.
  bool Changed = false;
  for (auto &MBB : MF)
    Changed |= runOnMachineBasicBlock(MBB, true);
  return Changed;
}
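The function above uses a two-phase scheme: first iterate block-level dataflow to a fixed point, re-queuing successors whenever a block's out-state changes, then make one final pass that emits instructions. The standalone sketch below models only that control structure; the block ids, the saturating-counter "state", and the transfer function are placeholders, and a plain vector stands in for the pass's SetVector worklist.

// Standalone sketch of worklist-driven fixed-point iteration over a tiny CFG.
#include <algorithm>
#include <cstdio>
#include <map>
#include <vector>

int main() {
  // Tiny CFG: 0 -> 1, 1 -> 1 (self loop), 1 -> 2.
  std::map<int, std::vector<int>> Preds = {{0, {}}, {1, {0, 1}}, {2, {1}}};
  std::map<int, std::vector<int>> Succs = {{0, {1}}, {1, {1, 2}}, {2, {}}};

  // Out-state per block; a saturating counter guarantees termination.
  std::map<int, int> Out;

  std::vector<int> WorkList = {2, 1, 0}; // seeded so block 0 is popped first
  while (!WorkList.empty()) {
    int B = WorkList.back();
    WorkList.pop_back();
    int In = 0;
    for (int P : Preds[B])
      In = std::max(In, Out[P]);      // merge predecessor states
    int NewOut = std::min(In + 1, 3); // placeholder transfer function
    if (NewOut != Out[B]) {
      Out[B] = NewOut;
      for (int S : Succs[B])
        WorkList.push_back(S);        // revisit successors whose input changed
    }
  }

  // Final pass: with the fixed point known, each block would now be visited
  // once more to actually emit instructions.
  for (auto &KV : Out)
    std::printf("block %d: out-state %d\n", KV.first, KV.second);
  return 0;
}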