LLVM 20.0.0git
AMDGPUInsertDelayAlu.cpp
Go to the documentation of this file.
1//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "GCNSubtarget.h"
17#include "SIInstrInfo.h"
18#include "llvm/ADT/SetVector.h"
19
20using namespace llvm;
21
22#define DEBUG_TYPE "amdgpu-insert-delay-alu"
23
24namespace {
25
26class AMDGPUInsertDelayAlu : public MachineFunctionPass {
27public:
28 static char ID;
29
30 const SIInstrInfo *SII;
32
33 TargetSchedModel SchedModel;
34
35 AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
36
37 void getAnalysisUsage(AnalysisUsage &AU) const override {
38 AU.setPreservesCFG();
40 }
41
42 // Return true if MI waits for all outstanding VALU instructions to complete.
43 static bool instructionWaitsForVALU(const MachineInstr &MI) {
44 // These instruction types wait for VA_VDST==0 before issuing.
45 const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
48 if (MI.getDesc().TSFlags & VA_VDST_0)
49 return true;
50 if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
51 MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
52 return true;
53 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
54 AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0)
55 return true;
56 return false;
57 }
58
59 // Types of delay that can be encoded in an s_delay_alu instruction.
60 enum DelayType { VALU, TRANS, SALU, OTHER };
61
62 // Get the delay type for an instruction with the specified TSFlags.
63 static DelayType getDelayType(uint64_t TSFlags) {
64 if (TSFlags & SIInstrFlags::TRANS)
65 return TRANS;
66 if (TSFlags & SIInstrFlags::VALU)
67 return VALU;
68 if (TSFlags & SIInstrFlags::SALU)
69 return SALU;
70 return OTHER;
71 }
72
73 // Information about the last instruction(s) that wrote to a particular
74 // regunit. In straight-line code there will only be one such instruction, but
75 // when control flow converges we merge the delay information from each path
76 // to represent the union of the worst-case delays of each type.
77 struct DelayInfo {
78 // One larger than the maximum number of (non-TRANS) VALU instructions we
79 // can encode in an s_delay_alu instruction.
80 static constexpr unsigned VALU_MAX = 5;
81
82 // One larger than the maximum number of TRANS instructions we can encode in
83 // an s_delay_alu instruction.
84 static constexpr unsigned TRANS_MAX = 4;
85
86 // One larger than the maximum number of SALU cycles we can encode in an
87 // s_delay_alu instruction.
88 static constexpr unsigned SALU_CYCLES_MAX = 4;
89
90 // If it was written by a (non-TRANS) VALU, remember how many clock cycles
91 // are left until it completes, and how many other (non-TRANS) VALU we have
92 // seen since it was issued.
93 uint8_t VALUCycles = 0;
94 uint8_t VALUNum = VALU_MAX;
95
96 // If it was written by a TRANS, remember how many clock cycles are left
97 // until it completes, and how many other TRANS we have seen since it was
98 // issued.
99 uint8_t TRANSCycles = 0;
100 uint8_t TRANSNum = TRANS_MAX;
101 // Also remember how many other (non-TRANS) VALU we have seen since it was
102 // issued. When an instruction depends on both a prior TRANS and a prior
103 // non-TRANS VALU, this is used to decide whether to encode a wait for just
104 // one or both of them.
105 uint8_t TRANSNumVALU = VALU_MAX;
106
107 // If it was written by an SALU, remember how many clock cycles are left
108 // until it completes.
109 uint8_t SALUCycles = 0;
110
111 DelayInfo() = default;
112
113 DelayInfo(DelayType Type, unsigned Cycles) {
114 switch (Type) {
115 default:
116 llvm_unreachable("unexpected type");
117 case VALU:
118 VALUCycles = Cycles;
119 VALUNum = 0;
120 break;
121 case TRANS:
122 TRANSCycles = Cycles;
123 TRANSNum = 0;
124 TRANSNumVALU = 0;
125 break;
126 case SALU:
127 // Guard against pseudo-instructions like SI_CALL which are marked as
128 // SALU but with a very high latency.
129 SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
130 break;
131 }
132 }
133
134 bool operator==(const DelayInfo &RHS) const {
135 return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
136 TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
137 TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
138 }
139
140 bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
141
142 // Merge another DelayInfo into this one, to represent the union of the
143 // worst-case delays of each type.
144 void merge(const DelayInfo &RHS) {
145 VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
146 VALUNum = std::min(VALUNum, RHS.VALUNum);
147 TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
148 TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
149 TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
150 SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
151 }
152
153 // Update this DelayInfo after issuing an instruction. IsVALU should be 1
154 // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
155 // a TRANS, else 0. Cycles is the number of cycles it takes to issue the
156 // instruction. Return true if there is no longer any useful delay info.
157 bool advance(DelayType Type, unsigned Cycles) {
158 bool Erase = true;
159
160 VALUNum += (Type == VALU);
161 if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
162 // Forget about the VALU instruction. It was too far back or has
163 // definitely completed by now.
164 VALUNum = VALU_MAX;
165 VALUCycles = 0;
166 } else {
167 VALUCycles -= Cycles;
168 Erase = false;
169 }
170
171 TRANSNum += (Type == TRANS);
172 TRANSNumVALU += (Type == VALU);
173 if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
174 // Forget about any TRANS instruction. It was too far back or has
175 // definitely completed by now.
176 TRANSNum = TRANS_MAX;
177 TRANSNumVALU = VALU_MAX;
178 TRANSCycles = 0;
179 } else {
180 TRANSCycles -= Cycles;
181 Erase = false;
182 }
183
184 if (SALUCycles <= Cycles) {
185 // Forget about any SALU instruction. It has definitely completed by
186 // now.
187 SALUCycles = 0;
188 } else {
189 SALUCycles -= Cycles;
190 Erase = false;
191 }
192
193 return Erase;
194 }
195
196#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
197 void dump() const {
198 if (VALUCycles)
199 dbgs() << " VALUCycles=" << (int)VALUCycles;
200 if (VALUNum < VALU_MAX)
201 dbgs() << " VALUNum=" << (int)VALUNum;
202 if (TRANSCycles)
203 dbgs() << " TRANSCycles=" << (int)TRANSCycles;
204 if (TRANSNum < TRANS_MAX)
205 dbgs() << " TRANSNum=" << (int)TRANSNum;
206 if (TRANSNumVALU < VALU_MAX)
207 dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
208 if (SALUCycles)
209 dbgs() << " SALUCycles=" << (int)SALUCycles;
210 }
211#endif
212 };
213
214 // A map from regunits to the delay info for that regunit.
215 struct DelayState : DenseMap<unsigned, DelayInfo> {
216 // Merge another DelayState into this one by merging the delay info for each
217 // regunit.
218 void merge(const DelayState &RHS) {
219 for (const auto &KV : RHS) {
220 iterator It;
221 bool Inserted;
222 std::tie(It, Inserted) = insert(KV);
223 if (!Inserted)
224 It->second.merge(KV.second);
225 }
226 }
227
228 // Advance the delay info for each regunit, erasing any that are no longer
229 // useful.
230 void advance(DelayType Type, unsigned Cycles) {
231 iterator Next;
232 for (auto I = begin(), E = end(); I != E; I = Next) {
233 Next = std::next(I);
234 if (I->second.advance(Type, Cycles))
235 erase(I);
236 }
237 }
238
239#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
240 void dump(const TargetRegisterInfo *TRI) const {
241 if (empty()) {
242 dbgs() << " empty\n";
243 return;
244 }
245
246 // Dump DelayInfo for each RegUnit in numerical order.
248 Order.reserve(size());
249 for (const_iterator I = begin(), E = end(); I != E; ++I)
250 Order.push_back(I);
251 llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
252 return A->first < B->first;
253 });
254 for (const_iterator I : Order) {
255 dbgs() << " " << printRegUnit(I->first, TRI);
256 I->second.dump();
257 dbgs() << "\n";
258 }
259 }
260#endif
261 };
262
263 // The saved delay state at the end of each basic block.
265
266 // Emit an s_delay_alu instruction if necessary before MI.
267 MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
268 MachineInstr *LastDelayAlu) {
269 unsigned Imm = 0;
270
271 // Wait for a TRANS instruction.
272 if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
273 Imm |= 4 + Delay.TRANSNum;
274
275 // Wait for a VALU instruction (if it's more recent than any TRANS
276 // instruction that we're also waiting for).
277 if (Delay.VALUNum < DelayInfo::VALU_MAX &&
278 Delay.VALUNum <= Delay.TRANSNumVALU) {
279 if (Imm & 0xf)
280 Imm |= Delay.VALUNum << 7;
281 else
282 Imm |= Delay.VALUNum;
283 }
284
285 // Wait for an SALU instruction.
286 if (Delay.SALUCycles) {
287 assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
288 if (Imm & 0x780) {
289 // We have already encoded a VALU and a TRANS delay. There's no room in
290 // the encoding for an SALU delay as well, so just drop it.
291 } else if (Imm & 0xf) {
292 Imm |= (Delay.SALUCycles + 8) << 7;
293 } else {
294 Imm |= Delay.SALUCycles + 8;
295 }
296 }
297
298 // Don't emit the s_delay_alu instruction if there's nothing to wait for.
299 if (!Imm)
300 return LastDelayAlu;
301
302 // If we only need to wait for one instruction, try encoding it in the last
303 // s_delay_alu that we emitted.
304 if (!(Imm & 0x780) && LastDelayAlu) {
305 unsigned Skip = 0;
306 for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
308 ++I != E;) {
309 if (!I->isBundle() && !I->isMetaInstruction())
310 ++Skip;
311 }
312 if (Skip < 6) {
313 MachineOperand &Op = LastDelayAlu->getOperand(0);
314 unsigned LastImm = Op.getImm();
315 assert((LastImm & ~0xf) == 0 &&
316 "Remembered an s_delay_alu with no room for another delay!");
317 LastImm |= Imm << 7 | Skip << 4;
318 Op.setImm(LastImm);
319 return nullptr;
320 }
321 }
322
323 auto &MBB = *MI.getParent();
324 MachineInstr *DelayAlu =
325 BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
326 // Remember the s_delay_alu for next time if there is still room in it to
327 // encode another delay.
328 return (Imm & 0x780) ? nullptr : DelayAlu;
329 }
330
331 bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
332 DelayState State;
333 for (auto *Pred : MBB.predecessors())
334 State.merge(BlockState[Pred]);
335
336 LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
337 << "\n";
338 State.dump(TRI););
339
340 bool Changed = false;
341 MachineInstr *LastDelayAlu = nullptr;
342
343 // Iterate over the contents of bundles, but don't emit any instructions
344 // inside a bundle.
345 for (auto &MI : MBB.instrs()) {
346 if (MI.isBundle() || MI.isMetaInstruction())
347 continue;
348
349 // Ignore some more instructions that do not generate any code.
350 switch (MI.getOpcode()) {
351 case AMDGPU::SI_RETURN_TO_EPILOG:
352 continue;
353 }
354
355 DelayType Type = getDelayType(MI.getDesc().TSFlags);
356
357 if (instructionWaitsForVALU(MI)) {
358 // Forget about all outstanding VALU delays.
359 // TODO: This is overkill since it also forgets about SALU delays.
360 State = DelayState();
361 } else if (Type != OTHER) {
362 DelayInfo Delay;
363 // TODO: Scan implicit uses too?
364 for (const auto &Op : MI.explicit_uses()) {
365 if (Op.isReg()) {
366 // One of the operands of the writelane is also the output operand.
367 // This creates the insertion of redundant delays. Hence, we have to
368 // ignore this operand.
369 if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
370 continue;
371 for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
372 auto It = State.find(Unit);
373 if (It != State.end()) {
374 Delay.merge(It->second);
375 State.erase(Unit);
376 }
377 }
378 }
379 }
380 if (Emit && !MI.isBundledWithPred()) {
381 // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
382 // just ignore them?
383 LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
384 }
385 }
386
387 if (Type != OTHER) {
388 // TODO: Scan implicit defs too?
389 for (const auto &Op : MI.defs()) {
390 unsigned Latency = SchedModel.computeOperandLatency(
391 &MI, Op.getOperandNo(), nullptr, 0);
392 for (MCRegUnit Unit : TRI->regunits(Op.getReg()))
393 State[Unit] = DelayInfo(Type, Latency);
394 }
395 }
396
397 // Advance by the number of cycles it takes to issue this instruction.
398 // TODO: Use a more advanced model that accounts for instructions that
399 // take multiple cycles to issue on a particular pipeline.
400 unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
401 // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
402 // instructions on the assumption that they will usually have to be issued
403 // twice?
404 State.advance(Type, Cycles);
405
406 LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
407 }
408
409 if (Emit) {
410 assert(State == BlockState[&MBB] &&
411 "Basic block state should not have changed on final pass!");
412 } else if (State != BlockState[&MBB]) {
413 BlockState[&MBB] = std::move(State);
414 Changed = true;
415 }
416 return Changed;
417 }
418
419 bool runOnMachineFunction(MachineFunction &MF) override {
420 if (skipFunction(MF.getFunction()))
421 return false;
422
423 LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
424 << "\n");
425
426 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
427 if (!ST.hasDelayAlu())
428 return false;
429
430 SII = ST.getInstrInfo();
431 TRI = ST.getRegisterInfo();
432
433 SchedModel.init(&ST);
434
435 // Calculate the delay state for each basic block, iterating until we reach
436 // a fixed point.
438 for (auto &MBB : reverse(MF))
439 WorkList.insert(&MBB);
440 while (!WorkList.empty()) {
441 auto &MBB = *WorkList.pop_back_val();
442 bool Changed = runOnMachineBasicBlock(MBB, false);
443 if (Changed)
444 WorkList.insert(MBB.succ_begin(), MBB.succ_end());
445 }
446
447 LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
448
449 // Make one last pass over all basic blocks to emit s_delay_alu
450 // instructions.
451 bool Changed = false;
452 for (auto &MBB : MF)
453 Changed |= runOnMachineBasicBlock(MBB, true);
454 return Changed;
455 }
456};
457
458} // namespace
459
460char AMDGPUInsertDelayAlu::ID = 0;
461
462char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
463
464INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
465 false, false)
#define DEBUG_TYPE
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
#define LLVM_DEBUG(X)
Definition: Debug.h:101
AMD GCN specific subclass of TargetSubtarget.
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:38
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file implements a set that has insertion order iteration characteristics.
Value * RHS
Represent the analysis usage information of a pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:256
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
bool skipFunction(const Function &F) const
Optional passes call this function to check whether the pass should be skipped.
Definition: Pass.cpp:178
Instructions::iterator instr_iterator
iterator_range< pred_iterator > predecessors()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
MachineOperand class - Representation of each machine instruction operand.
void dump() const
Definition: Pass.cpp:136
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
void reserve(size_type N)
Definition: SmallVector.h:676
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const
Compute operand latency based on the available machine model.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool operator!=(uint64_t V1, const APInt &V2)
Definition: APInt.h:2060
Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI)
Create Printable object to print register units on a raw_ostream.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2090
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
char & AMDGPUInsertDelayAluID
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.