LLVM 23.0.0git
GCNHazardRecognizer.cpp
Go to the documentation of this file.
1//===-- GCNHazardRecognizer.cpp - GCN Hazard Recognizer Impls -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "AMDGPUWaitcntUtils.h"
15#include "GCNSubtarget.h"
18#include "llvm/ADT/Statistic.h"
23#include "llvm/Support/Debug.h"
25
26using namespace llvm;
27
28#define DEBUG_TYPE "gcn-hazard-recognizer"
29
30STATISTIC(NumWMMANopsHoisted,
31 "Number of WMMA hazard V_NOPs hoisted from loops");
32STATISTIC(NumWMMAHoistingBailed,
33 "Number of WMMA hazards where V_NOP hoisting was not possible");
34
35namespace {
36
37struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
38 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
39
40 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
41 if (Arg.getAsInteger(0, Value))
42 return O.error("'" + Arg + "' value invalid for uint argument!");
43
44 if (Value > 100)
45 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
46
47 return false;
48 }
49};
50
51} // end anonymous namespace
52
54 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
55 cl::desc("Fill a percentage of the latency between "
56 "neighboring MFMA with s_nops."));
57
58// This is intended for debugging purposes only.
60 NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
61 cl::desc("Insert a s_nop x before every instruction"));
62
64 "amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden,
65 cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
66
67//===----------------------------------------------------------------------===//
68// Hazard Recognizer Implementation
69//===----------------------------------------------------------------------===//
70
72 const GCNSubtarget &ST);
73
75 MachineLoopInfo *MLI)
76 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
77 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
78 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
79 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
80 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
81 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
82}
83
85 EmittedInstrs.clear();
86}
87
91
93 CurrCycleInstr = MI;
94}
95
96static bool isDivFMas(unsigned Opcode) {
97 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
98}
99
100static bool isSGetReg(unsigned Opcode) {
101 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
102}
103
104static bool isSSetReg(unsigned Opcode) {
105 switch (Opcode) {
106 case AMDGPU::S_SETREG_B32:
107 case AMDGPU::S_SETREG_B32_mode:
108 case AMDGPU::S_SETREG_IMM32_B32:
109 case AMDGPU::S_SETREG_IMM32_B32_mode:
110 return true;
111 }
112 return false;
113}
114
115static bool isRWLane(unsigned Opcode) {
116 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
117}
118
119static bool isRFE(unsigned Opcode) {
120 return Opcode == AMDGPU::S_RFE_B64;
121}
122
123static bool isSMovRel(unsigned Opcode) {
124 switch (Opcode) {
125 case AMDGPU::S_MOVRELS_B32:
126 case AMDGPU::S_MOVRELS_B64:
127 case AMDGPU::S_MOVRELD_B32:
128 case AMDGPU::S_MOVRELD_B64:
129 return true;
130 default:
131 return false;
132 }
133}
134
136 const MachineInstr &MI) {
137 if (TII.isAlwaysGDS(MI.getOpcode()))
138 return true;
139
140 switch (MI.getOpcode()) {
141 case AMDGPU::S_SENDMSG:
142 case AMDGPU::S_SENDMSGHALT:
143 case AMDGPU::S_TTRACEDATA:
144 return true;
145 // These DS opcodes don't support GDS.
146 case AMDGPU::DS_NOP:
147 case AMDGPU::DS_PERMUTE_B32:
148 case AMDGPU::DS_BPERMUTE_B32:
149 return false;
150 default:
151 if (TII.isDS(MI.getOpcode())) {
152 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
153 AMDGPU::OpName::gds);
154 if (MI.getOperand(GDS).getImm())
155 return true;
156 }
157 return false;
158 }
159}
160
161static bool isPermlane(const MachineInstr &MI) {
162 unsigned Opcode = MI.getOpcode();
163 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
164 Opcode == AMDGPU::V_PERMLANE64_B32 ||
165 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
167 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
169 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
170 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
171 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
173 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
175 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
176 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
177}
178
179static bool isLdsDma(const MachineInstr &MI) {
180 return SIInstrInfo::isVALU(MI) &&
182}
183
184static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
185 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
186 AMDGPU::OpName::simm16);
187 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
188}
189
192 MachineInstr *MI = SU->getInstr();
193 // If we are not in "HazardRecognizerMode" and therefore not being run from
194 // the scheduler, track possible stalls from hazards but don't insert noops.
195 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
196
197 if (MI->isBundle())
198 return NoHazard;
199
200 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
201 return HazardType;
202
203 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
204 return HazardType;
205
206 if (checkFPAtomicToDenormModeHazard(MI) > 0)
207 return HazardType;
208
209 // Hazards which cannot be mitigated with S_NOPs.
210 if (!IsHazardRecognizerMode) {
211 if (checkWMMACoexecutionHazards(MI) > 0)
212 return Hazard;
213 }
214
215 if (ST.hasNoDataDepHazard())
216 return NoHazard;
217
218 if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
219 return HazardType;
220
221 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
222 return HazardType;
223
224 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
225 return HazardType;
226
227 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
228 return HazardType;
229
230 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
231 return HazardType;
232
235 checkMAIVALUHazards(MI) > 0)
236 return HazardType;
237
238 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
239 return HazardType;
240
241 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
242 return HazardType;
243
244 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
245 return HazardType;
246
247 if (((ST.hasReadM0MovRelInterpHazard() &&
248 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
249 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
250 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
251 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
252 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
253 (ST.hasReadM0LdsDirectHazard() &&
254 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
255 checkReadM0Hazards(MI) > 0)
256 return HazardType;
257
258 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
259 return HazardType;
260
262 checkMAILdStHazards(MI) > 0)
263 return HazardType;
264
265 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
266 return HazardType;
267
268 return NoHazard;
269}
270
272 unsigned Quantity) {
273 while (Quantity > 0) {
274 unsigned Arg = std::min(Quantity, 8u);
275 Quantity -= Arg;
276 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
277 .addImm(Arg - 1);
278 }
279}
280
281unsigned
282GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
283 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
284 assert(TSchedModel.getWriteProcResBegin(SC) !=
285 TSchedModel.getWriteProcResEnd(SC));
286 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
287}
288
289void GCNHazardRecognizer::processBundle() {
290 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
291 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
292 // Check bundled MachineInstr's for hazards.
293 for (; MI != E && MI->isInsideBundle(); ++MI) {
294 CurrCycleInstr = &*MI;
295 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
296
297 if (IsHazardRecognizerMode) {
298 fixHazards(CurrCycleInstr);
299
300 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
301 }
302
303 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
304 // include the bundled MI directly after, only add a maximum of
305 // (MaxLookAhead - 1) noops to EmittedInstrs.
306 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
307 EmittedInstrs.push_front(nullptr);
308
309 EmittedInstrs.push_front(CurrCycleInstr);
310 EmittedInstrs.resize(MaxLookAhead);
311 }
312 CurrCycleInstr = nullptr;
313}
314
315void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
316 assert(IsHazardRecognizerMode);
317
318 unsigned NumPreNoops = PreEmitNoops(MI);
319 EmitNoops(NumPreNoops);
320 if (MI->isInsideBundle())
321 insertNoopsInBundle(MI, TII, NumPreNoops);
322 else
323 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
324 NumPreNoops);
326 AdvanceCycle();
327}
328
330 IsHazardRecognizerMode = true;
331 CurrCycleInstr = MI;
332 unsigned W = PreEmitNoopsCommon(MI);
333 fixHazards(MI);
334 CurrCycleInstr = nullptr;
335 return std::max(W, NopPadding.getValue());
336}
337
341
343 if (MI->isBundle())
344 return 0;
345
346 int WaitStates = 0;
347
349 return std::max(WaitStates, checkSMRDHazards(MI));
350
351 if (ST.hasNSAtoVMEMBug())
352 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
353
354 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
355
356 if (ST.hasNoDataDepHazard())
357 return WaitStates;
358
360 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
361
363 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
364
366 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
367
368 if (isDivFMas(MI->getOpcode()))
369 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
370
371 if (isRWLane(MI->getOpcode()))
372 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
373
376 checkMAIVALUHazards(MI) > 0)
377 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
378
379 if (MI->isInlineAsm())
380 return std::max(WaitStates, checkInlineAsmHazards(MI));
381
382 if (isSGetReg(MI->getOpcode()))
383 return std::max(WaitStates, checkGetRegHazards(MI));
384
385 if (isSSetReg(MI->getOpcode()))
386 return std::max(WaitStates, checkSetRegHazards(MI));
387
388 if (isRFE(MI->getOpcode()))
389 return std::max(WaitStates, checkRFEHazards(MI));
390
391 if ((ST.hasReadM0MovRelInterpHazard() &&
392 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
393 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
394 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
395 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
396 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
397 (ST.hasReadM0LdsDirectHazard() &&
398 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
399 return std::max(WaitStates, checkReadM0Hazards(MI));
400
402 return std::max(WaitStates, checkMAIHazards(MI));
403
405 return std::max(WaitStates, checkMAILdStHazards(MI));
406
407 if (ST.hasGFX950Insts() && isPermlane(*MI))
408 return std::max(WaitStates, checkPermlaneHazards(MI));
409
410 return WaitStates;
411}
412
414 EmittedInstrs.push_front(nullptr);
415}
416
418 // When the scheduler detects a stall, it will call AdvanceCycle() without
419 // emitting any instructions.
420 if (!CurrCycleInstr) {
421 EmittedInstrs.push_front(nullptr);
422 return;
423 }
424
425 if (CurrCycleInstr->isBundle()) {
426 processBundle();
427 return;
428 }
429
430 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
431 if (!NumWaitStates) {
432 CurrCycleInstr = nullptr;
433 return;
434 }
435
436 // Keep track of emitted instructions
437 EmittedInstrs.push_front(CurrCycleInstr);
438
439 // Add a nullptr for each additional wait state after the first. Make sure
440 // not to add more than getMaxLookAhead() items to the list, since we
441 // truncate the list to that size right after this loop.
442 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
443 i < e; ++i) {
444 EmittedInstrs.push_front(nullptr);
445 }
446
447 // getMaxLookAhead() is the largest number of wait states we will ever need
448 // to insert, so there is no point in keeping track of more than that many
449 // wait states.
450 EmittedInstrs.resize(getMaxLookAhead());
451
452 CurrCycleInstr = nullptr;
453}
454
456 assert(!IsHazardRecognizerMode &&
457 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
458}
459
460//===----------------------------------------------------------------------===//
461// Helper Functions
462//===----------------------------------------------------------------------===//
463
465
466// Search for a hazard in a block and its predecessors.
467template <typename StateT>
468static bool
469hasHazard(StateT InitialState,
470 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
471 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
472 const MachineBasicBlock *InitialMBB,
474 struct StateMapKey {
476 unsigned Idx;
477 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
478 return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
479 }
480 };
481 struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
482 static unsigned getHashValue(const StateMapKey &Key) {
483 return StateT::getHashValue((*Key.States)[Key.Idx]);
484 }
485 static unsigned getHashValue(const StateT &State) {
486 return StateT::getHashValue(State);
487 }
488 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
489 return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
490 }
491 static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
492 return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
493 }
494 };
495
498
500 const MachineBasicBlock *MBB = InitialMBB;
501 StateT State = InitialState;
502
504 unsigned WorkIdx = 0;
505 for (;;) {
506 bool Expired = false;
507 for (auto E = MBB->instr_rend(); I != E; ++I) {
508 // No need to look at parent BUNDLE instructions.
509 if (I->isBundle())
510 continue;
511
512 auto Result = IsHazard(State, *I);
513 if (Result == HazardFound)
514 return true;
515 if (Result == HazardExpired) {
516 Expired = true;
517 break;
518 }
519
520 if (I->isInlineAsm() || I->isMetaInstruction())
521 continue;
522
523 UpdateState(State, *I);
524 }
525
526 if (!Expired) {
527 unsigned StateIdx = States.size();
528 StateMapKey Key = {&States, StateIdx};
529 auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
530 if (Insertion.second) {
531 States.emplace_back(State);
532 } else {
533 StateIdx = Insertion.first->second;
534 }
535 for (MachineBasicBlock *Pred : MBB->predecessors())
536 Worklist.insert(std::pair(Pred, StateIdx));
537 }
538
539 if (WorkIdx == Worklist.size())
540 break;
541
542 unsigned StateIdx;
543 std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
544 State = States[StateIdx];
545 I = MBB->instr_rbegin();
546 }
547
548 return false;
549}
550
551// Returns the minimum number of wait states since \p I, walking all
552// predecessors. Scanning of a path stops once \p IsExpired returns true.
553// Can only be run in hazard recognizer mode.
554static int
556 const MachineBasicBlock *MBB,
558 int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
562 for (auto E = MBB->instr_rend(); I != E; ++I) {
563 // Don't add WaitStates for parent BUNDLE instructions.
564 if (I->isBundle())
565 continue;
566
567 if (IsHazard(*I))
568 return WaitStates;
569
570 if (I->isInlineAsm())
571 continue;
572
573 WaitStates += GetNumWaitStates(*I);
574
575 if (IsExpired(*I, WaitStates))
576 return std::numeric_limits<int>::max();
577 }
578
579 int MinWaitStates = std::numeric_limits<int>::max();
580 for (MachineBasicBlock *Pred : MBB->predecessors()) {
581 if (!Visited.insert(Pred).second)
582 continue;
583
584 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
585 IsExpired, Visited, GetNumWaitStates);
586
587 MinWaitStates = std::min(MinWaitStates, W);
588 }
589
590 return MinWaitStates;
591}
592
593static int
595 const MachineInstr *MI,
600 return getWaitStatesSince(IsHazard, MI->getParent(),
601 std::next(MI->getReverseIterator()), 0, IsExpired,
602 Visited, GetNumWaitStates);
603}
604
605int GCNHazardRecognizer::getWaitStatesSince(
606 IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) const {
607 if (IsHazardRecognizerMode) {
608 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
609 return WaitStates >= Limit;
610 };
611 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
612 GetNumWaitStates);
613 }
614
615 int WaitStates = 0;
616 for (MachineInstr *MI : EmittedInstrs) {
617 if (MI) {
618 if (IsHazard(*MI))
619 return WaitStates;
620
621 if (MI->isInlineAsm())
622 continue;
623 }
624 WaitStates += MI ? GetNumWaitStates(*MI) : 1;
625
626 if (WaitStates >= Limit)
627 break;
628 }
629 return std::numeric_limits<int>::max();
630}
631
632int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
633 int Limit) const {
634 return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates);
635}
636
637int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
638 IsHazardFn IsHazardDef,
639 int Limit) const {
640 const SIRegisterInfo *TRI = ST.getRegisterInfo();
641
642 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
643 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
644 };
645
646 return getWaitStatesSince(IsHazardFn, Limit);
647}
648
649int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
650 int Limit) const {
651 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
652 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
653 };
654
655 return getWaitStatesSince(IsHazardFn, Limit);
656}
657
658//===----------------------------------------------------------------------===//
659// No-op Hazard Detection
660//===----------------------------------------------------------------------===//
661
662static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
663 MCRegister Reg) {
664 for (MCRegUnit Unit : TRI.regunits(Reg))
665 BV.set(static_cast<unsigned>(Unit));
666}
667
668static void addRegsToSet(const SIRegisterInfo &TRI,
670 BitVector &DefSet, BitVector &UseSet) {
671 for (const MachineOperand &Op : Ops) {
672 if (Op.isReg())
673 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
674 }
675}
676
677void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) const {
678 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
679}
680
682 return !SIInstrInfo::isSMRD(*MI);
683}
684
686 return !SIInstrInfo::isVMEM(*MI);
687}
688
689int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) const {
690 // SMEM soft clause are only present on VI+, and only matter if xnack is
691 // enabled.
692 if (!ST.isXNACKEnabled())
693 return 0;
694
695 bool IsSMRD = TII.isSMRD(*MEM);
696
697 resetClause();
698
699 // A soft-clause is any group of consecutive SMEM instructions. The
700 // instructions in this group may return out of order and/or may be
701 // replayed (i.e. the same instruction issued more than once).
702 //
703 // In order to handle these situations correctly we need to make sure that
704 // when a clause has more than one instruction, no instruction in the clause
705 // writes to a register that is read by another instruction in the clause
706 // (including itself). If we encounter this situation, we need to break the
707 // clause by inserting a non SMEM instruction.
708
709 for (MachineInstr *MI : EmittedInstrs) {
710 // When we hit a non-SMEM instruction then we have passed the start of the
711 // clause and we can stop.
712 if (!MI)
713 break;
714
716 break;
717
718 addClauseInst(*MI);
719 }
720
721 if (ClauseDefs.none())
722 return 0;
723
724 // We need to make sure not to put loads and stores in the same clause if they
725 // use the same address. For now, just start a new clause whenever we see a
726 // store.
727 if (MEM->mayStore())
728 return 1;
729
730 addClauseInst(*MEM);
731
732 // If the set of defs and uses intersect then we cannot add this instruction
733 // to the clause, so we have a hazard.
734 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
735}
736
737int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) const {
738 int WaitStatesNeeded = 0;
739
740 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
741
742 // This SMRD hazard only affects SI.
743 if (!ST.hasSMRDReadVALUDefHazard())
744 return WaitStatesNeeded;
745
746 // A read of an SGPR by SMRD instruction requires 4 wait states when the
747 // SGPR was written by a VALU instruction.
748 int SmrdSgprWaitStates = 4;
749 auto IsHazardDefFn = [this](const MachineInstr &MI) {
750 return TII.isVALU(MI);
751 };
752 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
753 return TII.isSALU(MI);
754 };
755
756 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
757
758 for (const MachineOperand &Use : SMRD->uses()) {
759 if (!Use.isReg())
760 continue;
761 int WaitStatesNeededForUse =
762 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
763 SmrdSgprWaitStates);
764 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
765
766 // This fixes what appears to be undocumented hardware behavior in SI where
767 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
768 // needs some number of nops in between. We don't know how many we need, but
769 // let's use 4. This wasn't discovered before probably because the only
770 // case when this happens is when we expand a 64-bit pointer into a full
771 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
772 // probably never encountered in the closed-source land.
773 if (IsBufferSMRD) {
774 int WaitStatesNeededForUse =
775 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
776 IsBufferHazardDefFn,
777 SmrdSgprWaitStates);
778 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
779 }
780 }
781
782 return WaitStatesNeeded;
783}
784
785int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) const {
786 if (!ST.hasVMEMReadSGPRVALUDefHazard())
787 return 0;
788
789 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
790
791 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
792 // SGPR was written by a VALU Instruction.
793 const int VmemSgprWaitStates = 5;
794 auto IsHazardDefFn = [this](const MachineInstr &MI) {
795 return TII.isVALU(MI);
796 };
797 for (const MachineOperand &Use : VMEM->uses()) {
798 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
799 continue;
800
801 int WaitStatesNeededForUse =
802 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
803 VmemSgprWaitStates);
804 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
805 }
806 return WaitStatesNeeded;
807}
808
809int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) const {
810 const SIRegisterInfo *TRI = ST.getRegisterInfo();
811 const SIInstrInfo *TII = ST.getInstrInfo();
812
813 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
814 int DppVgprWaitStates = 2;
815 int DppExecWaitStates = 5;
816 int WaitStatesNeeded = 0;
817 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
818 return TII->isVALU(MI);
819 };
820
821 for (const MachineOperand &Use : DPP->uses()) {
822 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
823 continue;
824 int WaitStatesNeededForUse =
825 DppVgprWaitStates - getWaitStatesSinceDef(
826 Use.getReg(),
827 [](const MachineInstr &) { return true; },
828 DppVgprWaitStates);
829 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
830 }
831
832 WaitStatesNeeded = std::max(
833 WaitStatesNeeded,
834 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
835 DppExecWaitStates));
836
837 return WaitStatesNeeded;
838}
839
840int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) const {
841 const SIInstrInfo *TII = ST.getInstrInfo();
842
843 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
844 // instruction.
845 const int DivFMasWaitStates = 4;
846 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
847 return TII->isVALU(MI);
848 };
849 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
850 DivFMasWaitStates);
851
852 return DivFMasWaitStates - WaitStatesNeeded;
853}
854
855int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) const {
856 const SIInstrInfo *TII = ST.getInstrInfo();
857 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
858
859 const int GetRegWaitStates = 2;
860 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
861 return GetRegHWReg == getHWReg(TII, MI);
862 };
863 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
864
865 return GetRegWaitStates - WaitStatesNeeded;
866}
867
868int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) const {
869 const SIInstrInfo *TII = ST.getInstrInfo();
870 unsigned HWReg = getHWReg(TII, *SetRegInstr);
871
872 const int SetRegWaitStates = ST.getSetRegWaitStates();
873 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
874 return HWReg == getHWReg(TII, MI);
875 };
876 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
877 return SetRegWaitStates - WaitStatesNeeded;
878}
879
880int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
881 if (!MI.mayStore())
882 return -1;
883
884 const SIInstrInfo *TII = ST.getInstrInfo();
885 unsigned Opcode = MI.getOpcode();
886 const MCInstrDesc &Desc = MI.getDesc();
887
888 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
889 int VDataRCID = -1;
890 if (VDataIdx != -1)
891 VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);
892
893 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
894 // There is no hazard if the instruction does not use vector regs
895 // (like wbinvl1)
896 if (VDataIdx == -1)
897 return -1;
898 if (AMDGPU::getRegBitWidth(VDataRCID) > 64) {
899 // On gfx940-family the BUFFER_STORE source-vgpr WAR hazard exists for
900 // every SOFFSET shape; the wait-state count differs by SOFFSET, and is
901 // computed in checkVALUHazardsHelper. Pre-gfx940 the hazard only exists
902 // if soffset is not an SGPR.
903 if (ST.hasGFX940Insts())
904 return VDataIdx;
905 const MachineOperand *SOffset =
906 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
907 if (!SOffset || !SOffset->isReg())
908 return VDataIdx;
909 }
910 }
911
912 // MIMG instructions create a hazard if they don't use a 256-bit T# and
913 // the store size is greater than 8 bytes and they have more than two bits
914 // of their dmask set.
915 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
916 if (TII->isMIMG(MI)) {
917 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
918 assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
919 Desc.operands()[SRsrcIdx])) == 256);
920 (void)SRsrcIdx;
921 }
922
923 if (TII->isFLAT(MI)) {
924 // There is no hazard if the instruction does not use vector regs
925 if (VDataIdx == -1)
926 return -1;
927
928 if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
929 return VDataIdx;
930 }
931
932 return -1;
933}
934
935int GCNHazardRecognizer::checkVALUHazardsHelper(
936 const MachineOperand &Def, const MachineRegisterInfo &MRI) const {
937 // Helper to check for the hazard where VMEM instructions that store more
938 // than 8 bytes can have their store data overwritten by the next
939 // instruction. On gfx940-family the window depends on the producer's
940 // SOFFSET shape:
941 // - MUBUF/MTBUF wide store with sgpr SOFFSET: 1 wait state.
942 // - MUBUF/MTBUF wide store with literal/absent SOFFSET, and FLAT wide
943 // store: 2 wait states.
944 // Pre-gfx940 keeps a single 1-wait-state window. The 1-cycle sgpr-SOFFSET
945 // window was measured on gfx950 (MI350X); the same gate is applied to the
946 // rest of the gfx940 family to match the existing rule's granularity.
947 const SIRegisterInfo *TRI = ST.getRegisterInfo();
948 const SIInstrInfo *TII = ST.getInstrInfo();
949
950 int WaitStatesNeeded = 0;
951 if (!TRI->isVectorRegister(MRI, Def.getReg()))
952 return WaitStatesNeeded;
953 const Register Reg = Def.getReg();
954
955 const int MaxWaitStates = ST.hasGFX940Insts() ? 2 : 1;
956
957 // Per-producer required wait-state window. On pre-gfx940 every producer
958 // uses 1; on gfx940-family MUBUF/MTBUF stores with an SGPR SOFFSET use 1
959 // and everything else (literal/absent SOFFSET, FLAT) uses 2.
960 auto WindowFor = [this, TII](const MachineInstr &MI) -> int {
961 if (!ST.hasGFX940Insts())
962 return 1;
963 if (TII->isBUF(MI)) {
964 const MachineOperand *SOffset =
965 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
966 if (SOffset && SOffset->isReg())
967 return 1;
968 }
969 return 2;
970 };
971
972 // For each hazard producer reached, accumulate the wait states still
973 // needed using that producer's own window. The predicate always returns
974 // false so the walk runs to MaxWaitStates.
975 int Distance = 0;
976 auto Counter = [&](const MachineInstr &MI) {
977 int DataIdx = createsVALUHazard(MI);
978 if (DataIdx >= 0 &&
979 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg)) {
980 int Need = WindowFor(MI) - Distance;
981 WaitStatesNeeded = std::max(WaitStatesNeeded, Need);
982 }
983 // Mirror getWaitStatesSince's accounting, which does not count inline asm
984 // towards the wait-state distance.
985 if (!MI.isInlineAsm())
987 return false;
988 };
989 getWaitStatesSince(Counter, MaxWaitStates);
990
991 return WaitStatesNeeded;
992}
993
994/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
995/// pack the computed value into correct bit position of the dest register. This
996/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
997/// dst_sel that is not aligned to the register. This function analyzes the \p
998/// MI and \returns an operand with dst forwarding issue, or nullptr if
999/// none exists.
1000static const MachineOperand *
1002 if (!SIInstrInfo::isVALU(MI))
1003 return nullptr;
1004
1005 const SIInstrInfo *TII = ST.getInstrInfo();
1006
1007 unsigned Opcode = MI.getOpcode();
1008
1009 // There are three different types of instructions
1010 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
1011 // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
1012 // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
1013 // op_sel[3:2]
1014 // != 0
1015 if (SIInstrInfo::isSDWA(MI)) {
1016 // Type 1: SDWA with dst_sel != DWORD
1017 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
1018 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
1019 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1020 }
1021
1022 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
1023 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
1024 // Type 2: VOP3 which write the hi bits
1025 if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
1027 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1028
1029 // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
1030 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
1031 (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
1033 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1034 }
1035
1036 // Special case: nop is required for all the opsel values for fp4 sr variant
1037 // cvt scale instructions
1038 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
1039 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1040
1041 return nullptr;
1042}
1043
1044/// Checks whether the provided \p VALU "consumes" the operand with a Dest sel
1045/// forwarding issue \p Dst . We may "consume" the Dst via a standard explicit
1046/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
///
/// \returns true if any register operand of \p VALU (the loop below walks all
/// operands, not only uses) overlaps the register of \p Dst.
// NOTE(review): the first line of the declarator (listing line 1047) is absent
// from this excerpt; the body names the instruction parameter `VALU`.
1048 const MachineOperand *Dst,
1049 const SIRegisterInfo *TRI) {
1050 // We must consider implicit reads of the VALU. SDWA with dst_sel and
1051 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
1052 // and we must account for that hazard.
1053 // We also must account for WAW hazards. In particular, WAW with dest
1054 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
1055 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
1056 // check for ECC. Without accounting for this hazard, the ECC will be
1057 // wrong.
1058 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
1059 // complete zeroesHigh16BitsOfDest)
1060 for (auto &Operand : VALU->operands()) {
1061 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1062 return true;
1063 }
1064 }
1065 return false;
1066}
1067
// Returns the number of wait states required before \p VALU, computed as the
// maximum over several subtarget-gated checks: trans forwarding, dst_sel /
// cvt-scale forwarding, the VDecCoExec SGPR / VCC / lane-access checks, and
// finally the 12-dword store hazard on each def of \p VALU.
1068int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) const {
1069 int WaitStatesNeeded = 0;
1070
1071 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
1072 const int TransDefWaitstates = 1;
1073
// Predicate: does MI define (via vdst) a register that overlaps an explicit
// use of VALU?
1074 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
// NOTE(review): the condition guarding this early return (listing line 1075)
// is absent from this excerpt.
1076 return false;
1077 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1078 const SIInstrInfo *TII = ST.getInstrInfo();
1079 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
1080
1081 for (const MachineOperand &Use : VALU->explicit_uses()) {
1082 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
1083 return true;
1084 }
1085
1086 return false;
1087 };
1088
1089 int WaitStatesNeededForDef =
1090 TransDefWaitstates -
1091 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1092 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1093 }
1094
1095 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1096 const int Shift16DefWaitstates = 1;
1097
// Predicate: ProducerMI has a forwarded destination (or is inline asm, which
// is conservatively treated as such) that VALU consumes.
1098 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1099 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1100 const MachineOperand *ForwardedDst =
1101 getDstSelForwardingOperand(ProducerMI, ST);
1102 if (ForwardedDst) {
1103 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
1104 }
1105
1106 if (ProducerMI.isInlineAsm()) {
1107 // Assume inline asm has dst forwarding hazard
1108 for (auto &Def : ProducerMI.all_defs()) {
1109 if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1110 return true;
1111 }
1112 }
1113
1114 return false;
1115 };
1116
1117 int WaitStatesNeededForDef =
1118 Shift16DefWaitstates -
1119 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1120 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1121 }
1122
1123 if (ST.hasVDecCoExecHazard()) {
1124 const int VALUWriteSGPRVALUReadWaitstates = 2;
1125 const int VALUWriteEXECRWLane = 4;
1126 const int VALUWriteVGPRReadlaneRead = 1;
1127
1128 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1129 const MachineRegisterInfo &MRI = MF.getRegInfo();
// NOTE(review): the declaration of `UseReg` (listing line 1130) is absent from
// this excerpt. The lambda captures it by reference, so each assignment to
// UseReg below retargets the same predicate at a different register.
1131 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1132 if (!SIInstrInfo::isVALU(MI))
1133 return false;
1134 return MI.modifiesRegister(UseReg, TRI);
1135 };
1136
// Check every explicit SGPR use of VALU against a recent VALU write.
1137 for (const MachineOperand &Use : VALU->explicit_uses()) {
1138 if (!Use.isReg())
1139 continue;
1140
1141 UseReg = Use.getReg();
1142 if (TRI->isSGPRReg(MRI, UseReg)) {
1143 int WaitStatesNeededForDef =
1144 VALUWriteSGPRVALUReadWaitstates -
1145 getWaitStatesSince(IsVALUDefSGPRFn,
1146 VALUWriteSGPRVALUReadWaitstates);
1147 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1148 }
1149 }
1150
// VCC is checked separately via readsRegister, independent of the explicit
// use loop above.
1151 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1152 UseReg = AMDGPU::VCC;
1153 int WaitStatesNeededForDef =
1154 VALUWriteSGPRVALUReadWaitstates -
1155 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1156 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1157 }
1158
// Lane-access opcodes: the read variants first check src0, then fall through
// to the EXEC check shared with V_WRITELANE_B32.
1159 switch (VALU->getOpcode()) {
1160 case AMDGPU::V_READLANE_B32:
1161 case AMDGPU::V_READFIRSTLANE_B32: {
1162 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1163 UseReg = Src->getReg();
1164 int WaitStatesNeededForDef =
1165 VALUWriteVGPRReadlaneRead -
1166 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1167 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1168 }
1169 [[fallthrough]];
1170 case AMDGPU::V_WRITELANE_B32: {
1171 UseReg = AMDGPU::EXEC;
1172 int WaitStatesNeededForDef =
1173 VALUWriteEXECRWLane -
1174 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1175 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1176 break;
1177 }
1178 default:
1179 break;
1180 }
1181 }
1182
1183 // This checks for the hazard where VMEM instructions that store more than
1184 // 8 bytes can have their store data overwritten by the next instruction.
1185 if (!ST.has12DWordStoreHazard())
1186 return WaitStatesNeeded;
1187
1188 const MachineRegisterInfo &MRI = MF.getRegInfo();
1189
1190 for (const MachineOperand &Def : VALU->defs()) {
1191 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1192 }
1193
1194 return WaitStatesNeeded;
1195}
1196
// Returns the number of wait states required before the inline asm statement
// \p IA. Returns 0 immediately when the subtarget has none of the 12-dword
// store, dst_sel forwarding, or cvt-scale forwarding hazards.
1197int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) const {
1198 // This checks for hazards associated with inline asm statements.
1199 // Since inline asms can contain just about anything, we use this
1200 // to call/leverage other check*Hazard routines. Note that
1201 // this function doesn't attempt to address all possible inline asm
1202 // hazards (good luck), but is a collection of what has been
1203 // problematic thus far.
1204
1205 // see checkVALUHazards()
1206 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1207 !ST.hasCvtScaleForwardingHazard())
1208 return 0;
1209
1210 const MachineRegisterInfo &MRI = MF.getRegInfo();
1211 int WaitStatesNeeded = 0;
1212
// NOTE(review): the range expression of this loop (listing line 1214) is
// absent from this excerpt. The body only acts on register defs that are
// vector registers.
1213 for (const MachineOperand &Op :
1215 if (Op.isReg() && Op.isDef()) {
1216 if (!TRI.isVectorRegister(MRI, Op.getReg()))
1217 continue;
1218
1219 if (ST.has12DWordStoreHazard()) {
1220 WaitStatesNeeded =
1221 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1222 }
1223 }
1224 }
1225
// NOTE(review): the early-out above also consults
// hasCvtScaleForwardingHazard(), but this check tests only
// hasDstSelForwardingHazard(); confirm whether that difference from
// checkVALUHazards() is intended.
1226 if (ST.hasDstSelForwardingHazard()) {
1227 const int Shift16DefWaitstates = 1;
1228
// Predicate: ProducerMI has a forwarded destination (or is itself inline asm)
// whose register IA reads or modifies.
1229 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1230 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1231 // Assume inline asm reads the dst
1232 if (Dst)
1233 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1234 IA->readsRegister(Dst->getReg(), &TRI);
1235
1236 if (ProducerMI.isInlineAsm()) {
1237 // If MI is inline asm, assume it has dst forwarding hazard
1238 for (auto &Def : ProducerMI.all_defs()) {
1239 if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1240 IA->readsRegister(Def.getReg(), &TRI)) {
1241 return true;
1242 }
1243 }
1244 }
1245
1246 return false;
1247 };
1248
1249 int WaitStatesNeededForDef =
1250 Shift16DefWaitstates -
1251 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1252 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1253 }
1254
1255 return WaitStatesNeeded;
1256}
1257
1258int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) const {
1259 const SIInstrInfo *TII = ST.getInstrInfo();
1260 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1261 const MachineRegisterInfo &MRI = MF.getRegInfo();
1262
1263 const MachineOperand *LaneSelectOp =
1264 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1265
1266 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1267 return 0;
1268
1269 Register LaneSelectReg = LaneSelectOp->getReg();
1270 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1271
1272 const int RWLaneWaitStates = 4;
1273 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1274 RWLaneWaitStates);
1275 return RWLaneWaitStates - WaitStatesSince;
1276}
1277
1278int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) const {
1279 if (!ST.hasRFEHazards())
1280 return 0;
1281
1282 const SIInstrInfo *TII = ST.getInstrInfo();
1283
1284 const int RFEWaitStates = 1;
1285
1286 auto IsHazardFn = [TII](const MachineInstr &MI) {
1287 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1288 };
1289 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1290 return RFEWaitStates - WaitStatesNeeded;
1291}
1292
1293int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) const {
1294 const SIInstrInfo *TII = ST.getInstrInfo();
1295 const int ReadM0WaitStates = 1;
1296 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1297 return ReadM0WaitStates -
1298 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1299}
1300
// Inserts \p WaitStatesNeeded V_NOP_e32 instructions into \p MBB before
// InsertPt. When \p IsHoisting is set the nops get an empty DebugLoc;
// otherwise they reuse the DebugLoc of the instruction at InsertPt.
// NOTE(review): the parameter line declaring `InsertPt` (listing line 1302) is
// absent from this excerpt.
1301void GCNHazardRecognizer::emitVNops(MachineBasicBlock &MBB,
1303 int WaitStatesNeeded, bool IsHoisting) {
1304 const DebugLoc &DL = IsHoisting ? DebugLoc() : InsertPt->getDebugLoc();
1305 for (int I = 0; I < WaitStatesNeeded; ++I)
1306 BuildMI(MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
1307}
1308
// Runs the hazard fix-up routines on \p MI in a fixed sequence. Their boolean
// results are discarded here. The LDS-direct fixes and the last four fixes are
// only attempted when the corresponding subtarget query returns true; the
// others are called unconditionally.
1309void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1310 fixVMEMtoScalarWriteHazards(MI);
1311 fixVcmpxPermlaneHazards(MI);
1312 fixSMEMtoVectorWriteHazards(MI);
1313 fixVcmpxExecWARHazard(MI);
1314 fixLdsBranchVmemWARHazard(MI);
1315 if (ST.hasLdsDirect()) {
1316 fixLdsDirectVALUHazard(MI);
1317 fixLdsDirectVMEMHazard(MI);
1318 }
1319 fixVALUPartialForwardingHazard(MI);
1320 fixVALUTransUseHazard(MI);
1321 fixVALUTransCoexecutionHazards(MI);
1322 fixWMMAHazards(MI); // fall-through if co-execution is enabled.
1323 fixWMMACoexecutionHazards(MI);
1324 fixShift64HighRegBug(MI);
1325 fixVALUMaskWriteHazard(MI);
1326 fixRequiredExportPriority(MI);
1327 if (ST.requiresWaitIdleBeforeGetReg())
1328 fixGetRegWaitIdle(MI);
1329 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1330 fixDsAtomicAsyncBarrierArriveB64(MI);
1331 if (ST.hasScratchBaseForwardingHazard())
1332 fixScratchBaseForwardingHazard(MI);
1333 if (ST.setRegModeNeedsVNOPs())
1334 fixSetRegMode(MI);
1335}
1336
// Returns true when \p MI is a VOPC instruction, or a compare encoded as VOP3
// or SDWA, and it modifies EXEC.
// NOTE(review): the first line of the declarator (listing line 1337) is absent
// from this excerpt; the body uses `TII` and `TRI` as objects, not pointers.
1338 const MachineInstr &MI) {
1339 return (TII.isVOPC(MI) ||
1340 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1341 MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1342}
1343
// For a permlane instruction \p MI on subtargets with the vcmpx/permlane
// hazard: searches backwards for a compare that writes EXEC and, if one is
// found before the hazard expires, inserts a V_MOV_B32_e32 before \p MI.
// \returns true if an instruction was inserted.
1344bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1345 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1346 return false;
1347
1348 const SIInstrInfo *TII = ST.getInstrInfo();
1349 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1350 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1351 return isVCmpXWritesExec(*TII, *TRI, MI);
1352 };
1353
// The hazard expires at any VALU other than the three V_NOP encodings.
1354 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1355 unsigned Opc = MI.getOpcode();
1356 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1357 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1358 };
1359
// An int-max result from the search is treated as "no hazard found".
1360 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1361 std::numeric_limits<int>::max())
1362 return false;
1363
1364 // V_NOP will be discarded by SQ.
1365 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1366 // which is always a VGPR and available.
1367 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1368 Register Reg = Src0->getReg();
1369 bool IsUndef = Src0->isUndef();
// NOTE(review): the operand-building lines of this BuildMI chain (listing
// lines 1372-1373) are absent from this excerpt; `Reg` and `IsUndef` are
// presumably consumed there.
1370 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1371 TII->get(AMDGPU::V_MOV_B32_e32))
1374
1375 return true;
1376}
1377
// On subtargets with the VMEM-to-scalar-write hazard: if an earlier
// instruction matched by IsHazardFn uses a register that \p MI defines, and
// the hazard has not expired, inserts an S_WAITCNT_DEPCTR before \p MI.
// \returns true if an instruction was inserted.
1378bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1379 if (!ST.hasVMEMtoScalarWriteHazard())
1380 return false;
1381 assert(!ST.hasExtendedWaitCounts());
1382
// NOTE(review): the condition guarding this early return (listing line 1383)
// is absent from this excerpt.
1384 return false;
1385
1386 if (MI->getNumDefs() == 0)
1387 return false;
1388
1389 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1390
1391 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
// NOTE(review): the instruction-kind filter guarding this early return
// (listing line 1392) is absent from this excerpt.
1393 return false;
1394
// Hazard if I has a use operand for any register defined by MI.
1395 for (const MachineOperand &Def : MI->defs()) {
1396 const MachineOperand *Op =
1397 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1398 if (!Op)
1399 continue;
1400 return true;
1401 }
1402 return false;
1403 };
1404
// The hazard expires at any VALU, at an S_WAITCNT whose immediate is 0, or at
// an S_WAITCNT_DEPCTR whose decoded vm_vsrc field is 0.
1405 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1406 return SIInstrInfo::isVALU(MI) ||
1407 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1408 !MI.getOperand(0).getImm()) ||
1409 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1410 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1411 };
1412
1413 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1414 std::numeric_limits<int>::max())
1415 return false;
1416
1417 const SIInstrInfo *TII = ST.getInstrInfo();
// NOTE(review): the immediate operand added to this S_WAITCNT_DEPCTR (listing
// line 1420) is absent from this excerpt.
1418 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1419 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1421 return true;
1422}
1423
1424bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1425 if (!ST.hasSMEMtoVectorWriteHazard())
1426 return false;
1427 assert(!ST.hasExtendedWaitCounts());
1428
1429 if (!SIInstrInfo::isVALU(*MI))
1430 return false;
1431
1432 AMDGPU::OpName SDSTName;
1433 switch (MI->getOpcode()) {
1434 case AMDGPU::V_READLANE_B32:
1435 case AMDGPU::V_READFIRSTLANE_B32:
1436 SDSTName = AMDGPU::OpName::vdst;
1437 break;
1438 default:
1439 SDSTName = AMDGPU::OpName::sdst;
1440 break;
1441 }
1442
1443 const SIInstrInfo *TII = ST.getInstrInfo();
1444 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1445 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1446 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1447 if (!SDST) {
1448 for (const auto &MO : MI->implicit_operands()) {
1449 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1450 SDST = &MO;
1451 break;
1452 }
1453 }
1454 }
1455
1456 if (!SDST)
1457 return false;
1458
1459 const Register SDSTReg = SDST->getReg();
1460 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1461 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1462 };
1463
1464 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1465 if (TII->isSALU(MI)) {
1466 switch (MI.getOpcode()) {
1467 case AMDGPU::S_SETVSKIP:
1468 case AMDGPU::S_VERSION:
1469 case AMDGPU::S_WAITCNT_VSCNT:
1470 case AMDGPU::S_WAITCNT_VMCNT:
1471 case AMDGPU::S_WAITCNT_EXPCNT:
1472 // These instructions cannot not mitigate the hazard.
1473 return false;
1474 case AMDGPU::S_WAITCNT_LGKMCNT:
1475 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1476 return (MI.getOperand(1).getImm() == 0) &&
1477 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1478 case AMDGPU::S_WAITCNT: {
1479 const int64_t Imm = MI.getOperand(0).getImm();
1480 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1481 // DsCnt corresponds to LGKMCnt here.
1482 return Decoded.get(AMDGPU::DS_CNT) == 0;
1483 }
1484 default:
1485 assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
1486 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1487 "unexpected wait count instruction");
1488 // SOPP instructions cannot mitigate the hazard.
1489 if (TII->isSOPP(MI))
1490 return false;
1491 // At this point the SALU can be assumed to mitigate the hazard
1492 // because either:
1493 // (a) it is independent of the at risk SMEM (breaking chain),
1494 // or
1495 // (b) it is dependent on the SMEM, in which case an appropriate
1496 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1497 // SMEM instruction.
1498 return true;
1499 }
1500 }
1501 return false;
1502 };
1503
1504 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1505 std::numeric_limits<int>::max())
1506 return false;
1507
1508 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1509 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1510 .addImm(0);
1511 return true;
1512}
1513
// On subtargets with the vcmpx EXEC WAR hazard: if the VALU \p MI modifies
// EXEC and an earlier instruction matched by IsHazardFn reads EXEC without the
// hazard having expired, inserts an S_WAITCNT_DEPCTR before \p MI.
// \returns true if an instruction was inserted.
1514bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1515 if (!ST.hasVcmpxExecWARHazard())
1516 return false;
1517 assert(!ST.hasExtendedWaitCounts());
1518
1519 if (!SIInstrInfo::isVALU(*MI))
1520 return false;
1521
1522 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1523 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1524 return false;
1525
1526 auto IsHazardFn = [TRI](const MachineInstr &I) {
// NOTE(review): the instruction-kind filter guarding this early return
// (listing line 1527) is absent from this excerpt.
1528 return false;
1529 return I.readsRegister(AMDGPU::EXEC, TRI);
1530 };
1531
1532 const SIInstrInfo *TII = ST.getInstrInfo();
// The hazard expires at a VALU that has an sdst operand or an implicit SGPR
// def, or at an S_WAITCNT_DEPCTR whose decoded sa_sdst field is 0.
1533 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1534 if (SIInstrInfo::isVALU(MI)) {
1535 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1536 return true;
1537 for (auto MO : MI.implicit_operands())
1538 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1539 return true;
1540 }
1541 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1542 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1543 return true;
1544 return false;
1545 };
1546
1547 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1548 std::numeric_limits<int>::max())
1549 return false;
1550
// NOTE(review): the immediate operand added to this S_WAITCNT_DEPCTR (listing
// line 1553) is absent from this excerpt.
1551 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1552 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1554 return true;
1555}
1556
// Decides whether the LDS-branch-VMEM WAR fix-up is worth running for a
// function: false when the subtarget lacks the hazard, otherwise true once
// both HasLds and HasVmem have been set while scanning every instruction.
// NOTE(review): the first line of the declarator (listing line 1557) is absent
// from this excerpt; the body iterates a function object named `MF`.
1558 const GCNSubtarget &ST) {
1559 if (!ST.hasLdsBranchVmemWARHazard())
1560 return false;
1561
1562 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1563 // instructions need to appear in the same function.
1564 bool HasLds = false;
1565 bool HasVmem = false;
1566 for (auto &MBB : MF) {
1567 for (auto &MI : MBB) {
// NOTE(review): the statement that updates HasLds (listing line 1568) is
// absent from this excerpt.
1569 HasVmem |= SIInstrInfo::isVMEM(MI);
1570 if (HasLds && HasVmem)
1571 return true;
1572 }
1573 }
1574 return false;
1575}
1576
// Returns true when \p I is an S_WAITCNT_VSCNT whose operand 0 is the null
// SGPR and whose immediate (operand 1) is 0.
// NOTE(review): the declarator (listing line 1577) is absent from this
// excerpt.
1578 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1579 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1580 !I.getOperand(1).getImm();
1581}
1582
// When the fix-up is enabled for this function: classifies \p MI as hazard
// type 1 or 2 and searches backwards for a branch that is itself preceded by
// an instruction of the other type. If found, inserts
// `S_WAITCNT_VSCNT null, 0` before \p MI.
// \returns true if an instruction was inserted.
1583bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1584 if (!RunLdsBranchVmemWARHazardFixup)
1585 return false;
1586
1587 assert(ST.hasLdsBranchVmemWARHazard());
1588 assert(!ST.hasExtendedWaitCounts());
1589
// Classifier returning 1, 2, or 0 (not a hazard instruction).
// NOTE(review): the conditions selecting 1 and 2 (listing lines 1591 and 1593)
// are absent from this excerpt.
1590 auto IsHazardInst = [](const MachineInstr &MI) {
1592 return 1;
1594 return 2;
1595 return 0;
1596 };
1597
1598 auto InstType = IsHazardInst(*MI);
1599 if (!InstType)
1600 return false;
1601
// Outer search stops at any hazard instruction or a zero store-count wait.
1602 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1603 return IsHazardInst(I) || isStoreCountWaitZero(I);
1604 };
1605
// Outer hazard predicate: I is a branch, and a nested backwards search from
// that branch finds a hazard instruction of the opposite type.
1606 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1607 if (!I.isBranch())
1608 return false;
1609
1610 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1611 auto InstType2 = IsHazardInst(I);
1612 return InstType2 && InstType != InstType2;
1613 };
1614
// Nested search stops at an instruction of the same type as MI or a zero
// store-count wait.
1615 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1616 auto InstType2 = IsHazardInst(I);
1617 if (InstType == InstType2)
1618 return true;
1619
1620 return isStoreCountWaitZero(I);
1621 };
1622
1623 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1624 std::numeric_limits<int>::max();
1625 };
1626
1627 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1628 std::numeric_limits<int>::max())
1629 return false;
1630
1631 const SIInstrInfo *TII = ST.getInstrInfo();
1632 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1633 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1634 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1635 .addImm(0);
1636
1637 return true;
1638}
1639
// Sets the `waitvdst` operand of \p MI from the number of VALU instructions
// counted back to the nearest VALU that reads or writes MI's vdst register.
// The value is capped at NoHazardWaitStates and forced to 0 if a TRANS
// instruction was seen during the search.
// \returns true once past the initial guard (the operand is always rewritten).
1640bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
// NOTE(review): the condition guarding this early return (listing line 1641)
// is absent from this excerpt.
1642 return false;
1643
1644 const int NoHazardWaitStates = 15;
1645 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1646 const Register VDSTReg = VDST->getReg();
1647
// VisitedTrans is updated as a side effect of the hazard predicate, for every
// VALU the search visits.
1648 bool VisitedTrans = false;
1649 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1650 if (!SIInstrInfo::isVALU(I))
1651 return false;
1652 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1653 // Cover both WAR and WAW
1654 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1655 };
1656 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1657 if (WaitStates >= NoHazardWaitStates)
1658 return true;
1659 // Instructions which cause va_vdst==0 expire hazard
// NOTE(review): the return expression of this lambda (listing lines 1660-1661)
// is absent from this excerpt.
1662 };
// Only VALU instructions contribute a wait state to the count.
1663 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1664 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1665 };
1666
1667 DenseSet<const MachineBasicBlock *> Visited;
1668 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1669 std::next(MI->getReverseIterator()), 0,
1670 IsExpiredFn, Visited, GetWaitStatesFn);
1671
1672 // Transcendentals can execute in parallel to other VALUs.
1673 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1674 if (VisitedTrans)
1675 Count = 0;
1676
1677 MachineOperand *WaitVdstOp =
1678 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1679 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1680
1681 return true;
1682}
1683
// If an earlier instruction matched by IsHazardFn reads or writes the vdst
// register of \p MI and the hazard has not expired, mitigates it: on
// subtargets where hasLdsWaitVMSRC() is true by setting MI's `waitvsrc`
// operand to 0, otherwise by inserting an S_WAITCNT_DEPCTR before \p MI.
// \returns true if a mitigation was applied.
1684bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
// NOTE(review): the condition guarding this early return (listing line 1685)
// is absent from this excerpt.
1686 return false;
1687
1688 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1689 const Register VDSTReg = VDST->getReg();
1690
1691 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
// NOTE(review): the instruction-kind filter guarding this early return
// (listing line 1692) is absent from this excerpt.
1693 return false;
1694 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1695 };
1696 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1697 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1698 // according to the type of VMEM instruction.
1699 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
// NOTE(review): the start of this return expression (listing line 1700) is
// absent from this excerpt. The visible terms expire the hazard at an
// S_WAITCNT with immediate 0, an S_WAITCNT_DEPCTR with vm_vsrc == 0, or (when
// LdsdirCanWait) an LDSDIR instruction whose waitvsrc is 0.
1701 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1702 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1703 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1704 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1705 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1706 };
1707
1708 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1709 std::numeric_limits<int>::max())
1710 return false;
1711
1712 if (LdsdirCanWait) {
1713 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1714 } else {
// NOTE(review): the immediate operand added to this S_WAITCNT_DEPCTR (listing
// line 1717) is absent from this excerpt.
1715 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1716 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1718 }
1719
1720 return true;
1721}
1722
// On wave64 subtargets with the VALU partial-forwarding hazard: for a VALU
// \p MI with at least two distinct VGPR sources, searches backwards for the
// "VALU def / SALU EXEC write / VALU def" pattern described below and, if
// found, inserts an S_WAITCNT_DEPCTR before \p MI.
// \returns true if an instruction was inserted.
1723bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1724 if (!ST.hasVALUPartialForwardingHazard())
1725 return false;
1726 assert(!ST.hasExtendedWaitCounts());
1727
1728 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1729 return false;
1730
// Collect the distinct VGPRs that MI reads explicitly.
1731 SmallSetVector<Register, 4> SrcVGPRs;
1732
1733 for (const MachineOperand &Use : MI->explicit_uses()) {
1734 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1735 SrcVGPRs.insert(Use.getReg());
1736 }
1737
1738 // Only applies with >= 2 unique VGPR sources
1739 if (SrcVGPRs.size() <= 1)
1740 return false;
1741
1742 // Look for the following pattern:
1743 // Va <- VALU [PreExecPos]
1744 // intv1
1745 // Exec <- SALU [ExecPos]
1746 // intv2
1747 // Vb <- VALU [PostExecPos]
1748 // intv3
1749 // MI Va, Vb (WaitState = 0)
1750 //
1751 // Where:
1752 // intv1 + intv2 <= 2 VALUs
1753 // intv3 <= 4 VALUs
1754 //
1755 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1756
1757 const int Intv1plus2MaxVALUs = 2;
1758 const int Intv3MaxVALUs = 4;
1759 const int IntvMaxVALUs = 6;
1760 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1761
// Search state: DefPos maps each source VGPR to the VALU count at which its
// write was first seen, ExecPos is the VALU count at the EXEC write (int-max
// while unseen), and VALUs is the running VALU count. The static members
// provide hashing and equality over the state.
1762 struct StateType {
1763 SmallDenseMap<Register, int, 4> DefPos;
1764 int ExecPos = std::numeric_limits<int>::max();
1765 int VALUs = 0;
1766
1767 static unsigned getHashValue(const StateType &State) {
1768 return hash_combine(State.ExecPos, State.VALUs,
1769 hash_combine_range(State.DefPos));
1770 }
1771 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1772 return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
1773 LHS.VALUs == RHS.VALUs;
1774 }
1775 };
1776
1777 StateType State;
1778
1779 // This overloads expiry testing with all the hazard detection
1780 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1781 // Too many VALU states have passed
1782 if (State.VALUs > NoHazardVALUWaitStates)
1783 return HazardExpired;
1784
1785 // Instructions which cause va_vdst==0 expire hazard
// NOTE(review): the first part of this condition (listing lines 1786-1787) is
// absent from this excerpt.
1788 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1789 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1790 return HazardExpired;
1791
1792 // Track registers writes
// Only the first write seen for each source VGPR is recorded, and an EXEC
// write by an SALU is recorded only after at least one VGPR write is known.
1793 bool Changed = false;
1794 if (SIInstrInfo::isVALU(I)) {
1795 for (Register Src : SrcVGPRs) {
1796 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1797 State.DefPos[Src] = State.VALUs;
1798 Changed = true;
1799 }
1800 }
1801 } else if (SIInstrInfo::isSALU(I)) {
1802 if (State.ExecPos == std::numeric_limits<int>::max()) {
1803 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1804 State.ExecPos = State.VALUs;
1805 Changed = true;
1806 }
1807 }
1808 }
1809
1810 // Early expiration: too many VALUs in intv3
1811 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1812 return HazardExpired;
1813
1814 // Only evaluate state if something changed
1815 if (!Changed)
1816 return NoHazardFound;
1817
1818 // Determine positions of VALUs pre/post exec change
1819 if (State.ExecPos == std::numeric_limits<int>::max())
1820 return NoHazardFound;
1821
1822 int PreExecPos = std::numeric_limits<int>::max();
1823 int PostExecPos = std::numeric_limits<int>::max();
1824
// A recorded count >= ExecPos is classified as "pre exec", a smaller one as
// "post exec"; the minimum of each class is kept.
1825 for (auto Entry : State.DefPos) {
1826 int DefVALUs = Entry.second;
1827 if (DefVALUs != std::numeric_limits<int>::max()) {
1828 if (DefVALUs >= State.ExecPos)
1829 PreExecPos = std::min(PreExecPos, DefVALUs);
1830 else
1831 PostExecPos = std::min(PostExecPos, DefVALUs);
1832 }
1833 }
1834
1835 // Need a VALUs post exec change
1836 if (PostExecPos == std::numeric_limits<int>::max())
1837 return NoHazardFound;
1838
1839 // Too many VALUs in intv3?
1840 int Intv3VALUs = PostExecPos;
1841 if (Intv3VALUs > Intv3MaxVALUs)
1842 return HazardExpired;
1843
1844 // Too many VALUs in intv2?
1845 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1846 if (Intv2VALUs > Intv1plus2MaxVALUs)
1847 return HazardExpired;
1848
1849 // Need a VALUs pre exec change
1850 if (PreExecPos == std::numeric_limits<int>::max())
1851 return NoHazardFound;
1852
1853 // Too many VALUs in intv1?
1854 int Intv1VALUs = PreExecPos - State.ExecPos;
1855 if (Intv1VALUs > Intv1plus2MaxVALUs)
1856 return HazardExpired;
1857
1858 // Too many VALUs in intv1 + intv2
1859 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1860 return HazardExpired;
1861
1862 return HazardFound;
1863 };
1864 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
// NOTE(review): the condition guarding this increment (listing line 1865) is
// absent from this excerpt.
1866 State.VALUs += 1;
1867 };
1868
// The search starts at the instruction immediately preceding MI.
1869 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1870 std::next(MI->getReverseIterator())))
1871 return false;
1872
// NOTE(review): the immediate operand added to this S_WAITCNT_DEPCTR (listing
// line 1875) is absent from this excerpt.
1873 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1874 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1876
1877 return true;
1878}
1879
// On subtargets with the VALU trans-use hazard: if a TRANS instruction within
// the window described below writes a VGPR that the VALU \p MI reads
// explicitly, inserts an S_WAITCNT_DEPCTR before \p MI.
// \returns true if an instruction was inserted.
1880bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1881 if (!ST.hasVALUTransUseHazard())
1882 return false;
1883 assert(!ST.hasExtendedWaitCounts());
1884
1885 if (!SIInstrInfo::isVALU(*MI))
1886 return false;
1887
// Collect the VGPRs that MI reads explicitly.
1888 SmallSet<Register, 4> SrcVGPRs;
1889
1890 for (const MachineOperand &Use : MI->explicit_uses()) {
1891 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1892 SrcVGPRs.insert(Use.getReg());
1893 }
1894
1895 // Look for the following pattern:
1896 // Va <- TRANS VALU
1897 // intv
1898 // MI Va (WaitState = 0)
1899 //
1900 // Where:
1901 // intv <= 5 VALUs / 1 TRANS
1902 //
1903 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1904
1905 const int IntvMaxVALUs = 5;
1906 const int IntvMaxTRANS = 1;
1907
// Search state: running counts of VALU and TRANS instructions, with hashing
// and equality helpers.
1908 struct StateType {
1909 int VALUs = 0;
1910 int TRANS = 0;
1911
1912 static unsigned getHashValue(const StateType &State) {
1913 return hash_combine(State.VALUs, State.TRANS);
1914 }
1915 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1916 return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
1917 }
1918 };
1919
1920 StateType State;
1921
1922 // This overloads expiry testing with all the hazard detection
1923 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1924 // Too many VALU states have passed
1925 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1926 return HazardExpired;
1927
1928 // Instructions which cause va_vdst==0 expire hazard
// NOTE(review): the first part of this condition (listing lines 1929-1930) is
// absent from this excerpt.
1931 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1932 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1933 return HazardExpired;
1934
1935 // Track registers writes
1936 if (SIInstrInfo::isTRANS(I)) {
1937 for (Register Src : SrcVGPRs) {
1938 if (I.modifiesRegister(Src, &TRI)) {
1939 return HazardFound;
1940 }
1941 }
1942 }
1943
1944 return NoHazardFound;
1945 };
1946 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
// NOTE(review): the conditions guarding these two increments (listing lines
// 1947 and 1949) are absent from this excerpt.
1948 State.VALUs += 1;
1950 State.TRANS += 1;
1951 };
1952
// The search starts at the instruction immediately preceding MI.
1953 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1954 std::next(MI->getReverseIterator())))
1955 return false;
1956
1957 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1958 // avoided.
// NOTE(review): the immediate operand added to this S_WAITCNT_DEPCTR (listing
// line 1961) is absent from this excerpt.
1959 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1960 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1962
1963 return true;
1964}
1965
// Inserts a V_NOP_e32 before \p MI when an earlier TRANS instruction, with no
// VALU in between, either writes a register that \p MI reads explicitly (RAW)
// or explicitly reads the register that \p MI writes through vdst (WAR).
// \returns true if a V_NOP was inserted.
1966bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
// NOTE(review): the rest of this condition (listing line 1968) is absent from
// this excerpt.
1967 if (!ST.hasGFX1250Insts() || // Coexecution disabled.
1969 return false;
1970
1971 const SIInstrInfo *TII = ST.getInstrInfo();
1972 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1973
1974 auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1975 if (!SIInstrInfo::isTRANS(I))
1976 return false;
1977
1978 // RAW: Trans(I) writes, VALU(MI) reads.
1979 Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1980 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
1981 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
1982 return true;
1983 }
1984
// Without a register vdst on MI there is nothing to check for WAR.
1985 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
1986 if (!ValuDst || !ValuDst->isReg())
1987 return false;
1988
1989 // WAR: Trans(I) reads, VALU(MI) writes.
1990 Register ValuDef = ValuDst->getReg();
1991 for (const MachineOperand &TransUse : I.explicit_uses()) {
1992 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
1993 return true;
1994 }
1995
1996 return false;
1997 };
1998
// Any VALU between the TRANS and MI ends the search.
1999 auto IsExpiredFn = [](const MachineInstr &I, int) {
2000 return SIInstrInfo::isVALU(I);
2001 };
2002
// HasVALU is the int-max sentinel compared against the search result; a match
// means no hazard remains.
2003 const int HasVALU = std::numeric_limits<int>::max();
2004 if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
2005 return false;
2006
2007 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2008 return true;
2009}
2010
// Inserts a V_NOP_e32 before \p MI when an earlier instruction matched by
// IsHazardFn, with no VALU in between, wrote a vdst that overlaps src0 or src1
// of \p MI, or (on GFX12+ for SWMMAC) src2 of \p MI.
// \returns true if a V_NOP was inserted.
2011bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
// NOTE(review): the condition guarding this early return (listing line 2012)
// is absent from this excerpt.
2013 return false;
2014
2015 const SIInstrInfo *TII = ST.getInstrInfo();
2016 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2017
2018 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
// NOTE(review): the instruction-kind filter guarding this early return
// (listing line 2019) is absent from this excerpt.
2020 return false;
2021
2022 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
2023 // with the dest(matrix D) of the previous wmma.
2024 const Register CurSrc0Reg =
2025 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
2026 const Register CurSrc1Reg =
2027 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
2028
2029 const Register PrevDstReg =
2030 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2031
2032 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2033 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2034 return true;
2035 }
2036
2037 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
2038 // but Index can't overlap with PrevDstReg.
2039 if (AMDGPU::isGFX12Plus(ST)) {
2040 if (SIInstrInfo::isSWMMAC(*MI)) {
2041 const Register CurIndex =
2042 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2043 if (TRI->regsOverlap(PrevDstReg, CurIndex))
2044 return true;
2045 }
2046 return false;
2047 }
2048
2049 return false;
2050 };
2051
// Any VALU between the producer and MI ends the search.
2052 auto IsExpiredFn = [](const MachineInstr &I, int) {
2053 return SIInstrInfo::isVALU(I);
2054 };
2055
2056 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2057 std::numeric_limits<int>::max())
2058 return false;
2059
2060 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2061
2062 return true;
2063}
2064
2069
2070// Classify XDL WMMA instructions into co-execution hazard categories
2071// (Refer to SPG 4.6.12.1), mainly based on instruction latency.
2072//
2073// Category 0: WMMA with Latency 8
2074// WMMA_*F16, WMMA_*BF16
2075// WMMA_*FP8FP8
2076// WMMA_*FP8BF8
2077// WMMA_*BF8FP8
2078// WMMA_*BF8BF8
2079// WMMA_*F8F6F4 if SRCA & SRCB != F8
2080//
2081// Category 1: WMMA Latency 16
2082// WMMA_IU8
2083// WMMA_*F8F6F4 if SRCA OR SRCB == F8
2084//
2085// Category 2: SWMMAC with Latency 8
2086// SWMMAC_*F16, SWMMAC_*BF16,
2087// SWMMAC_*FP8FP8
2088// SWMMAC_*BF8FP8
2089// SWMMAC_*FP8BF8
2090// SWMMAC_*BF8BF8
2091//
2092// Category 3: SWMMAC with Latency 16
2093// SWMMAC_IU8
2094//
2095// Category 4: 16 Pass GFX1251 WMMA with latency 16
2096// V_WMMA_*_16X16X32_{F16,BF16}
2097// V_WMMA_{F32,F16}_16X16X64_{FP8,BF8}*
2098// V_WMMA_F32_16x16x128_F8F6F4 (F4 only)
2099// V_SWMMAC_*_16X16X64_{F16,BF16}
2100// V_SWMMAC_{F32,F16}_16X16X128_{FP8,BF8}*
2101//
2102// Category 5: 32 Pass GFX1251 WMMA with latency 32
2103// V_WMMA_F32_16x16x128_F8F6F4 (not all F4)
2104// V_WMMA_{F32,F16}_16X16X128_{FP8,BF8}*
2105// V_WMMA_F32_32X16X128_F4
2106// V_WMMA_I32_16X16X64_IU8
2107// V_WMMA_I32_16X16X64_IU8
2109 const SIInstrInfo *TII,
2110 const TargetSchedModel &SchedModel,
2111 const GCNSubtarget &ST) {
// NOTE(review): source line 2108 (return type, function name and the first
// parameter, which the body refers to as MI) is absent from this listing.
//
// Maps an XDL WMMA instruction to a hazard category in [0, 5] using the
// latency reported by SchedModel, whether it is an SWMMAC, and whether the
// subtarget reports hasGFX125xLowestRateWMMA().
2112 assert(TII->isXDLWMMA(MI) && "must be xdl wmma");
2113 bool IsSWMMAC = SIInstrInfo::isSWMMAC(MI);
2114 bool IsLowestRateWMMA = ST.hasGFX125xLowestRateWMMA();
2115 unsigned Category = 0;
2116
2117 unsigned Latency = SchedModel.computeInstrLatency(&MI);
2118 switch (Latency) {
2119 case 8:
// Latency 8: category 2 for SWMMAC, otherwise 0.
2120 Category = IsSWMMAC ? 2 : 0;
2121 break;
2122 case 16:
// Latency 16: category 4 on lowest-rate subtargets regardless of SWMMAC;
// otherwise 3 for SWMMAC and 1 for plain WMMA.
2123 Category = IsLowestRateWMMA ? 4 : (IsSWMMAC ? 3 : 1);
2124 break;
2125 case 32:
// Latency 32 is only accepted on lowest-rate subtargets.
2126 assert(IsLowestRateWMMA && "latency 32 is not expected");
2127 Category = 5;
2128 break;
2129 default:
2130 llvm_unreachable("unexpected xdl wmma latency");
2131 } // end switch.
2132
2133 return Category;
2134}
2135
2136int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
2137 if (!ST.hasGFX1250Insts())
2138 return 0;
2139
2140 const SIInstrInfo *TII = ST.getInstrInfo();
2141 if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
2142 return 0;
2143
2144 // WaitStates here is the number of V_NOPs or unrelated VALU instructions must
2145 // be in between the first WMMA and the second instruction to cover the hazard
2146 // (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
2147 // is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data hazards" for
2148 // numbers, which depends on the category of the first WMMA.
2149 const int WMMAWaitStates[] = {5, 9, 3, 5, 9, 17};
2150 const int VALUWaitStates[] = {4, 8, 2, 4, 8, 16};
2151 unsigned Category = 0;
2152
2153 auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2154 if (!TII->isXDLWMMA(I))
2155 return false;
2156
2157 Category = getWMMAHazardInstInCategory(I, TII, TSchedModel, ST);
2158 return hasWMMAToWMMARegOverlap(I, *MI);
2159 };
2160
2161 auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2162 if (!TII->isXDLWMMA(I))
2163 return false;
2164
2165 Category = getWMMAHazardInstInCategory(I, TII, TSchedModel, ST);
2166 return hasWMMAToVALURegOverlap(I, *MI);
2167 };
2168
2169 auto GetWaitStatesFn = [](const MachineInstr &I) {
2170 return SIInstrInfo::isVALU(I) ? 1 : 0;
2171 };
2172
2173 int WaitStatesNeeded = -1;
2174 int ExistingVALUs = 0; // Existing number of VALU ops in between.
2175 bool IsLowestRateWMMA = ST.hasGFX125xLowestRateWMMA();
2176
2177 // getWaitStatesSince checks for a hazard between instruction 'I' and 'MI':
2178 // - If a hazard exists: returns the number of VALUs in between and sets
2179 // 'Category' via IsWMMAHazardFn/IsVALUHazardFn for instruction 'I'.
2180 // - If no hazard exists: returns INT_MAX, making WaitStatesNeeded negative,
2181 // so no V_NOP insertion is needed.
2182 if (TII->isXDLWMMA(*MI)) {
2183 // Maximum of MMAWaitStates.
2184 const int WMMAWaitsLimit = IsLowestRateWMMA ? 17 : 9;
2185 ExistingVALUs =
2186 getWaitStatesSince(IsWMMAHazardFn, WMMAWaitsLimit, GetWaitStatesFn);
2187 WaitStatesNeeded = WMMAWaitStates[Category] - ExistingVALUs;
2188 } else { // Must be a co-executable VALU.
2189 // Maximum of VALUWaitStates.
2190 const int VALUWaitsLimit = IsLowestRateWMMA ? 16 : 8;
2191 ExistingVALUs =
2192 getWaitStatesSince(IsVALUHazardFn, VALUWaitsLimit, GetWaitStatesFn);
2193 WaitStatesNeeded = VALUWaitStates[Category] - ExistingVALUs;
2194 }
2195
2196 return WaitStatesNeeded;
2197}
2198
2199bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2200 const MachineInstr &WMMA, const MachineInstr &MI) const {
2201 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2202 Register A1 = TII.getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
2203 Register B1 = TII.getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();
2204
2205 // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2206 if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
2207 return true;
2208
2210 Register Idx1 = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2211 if (TRI.regsOverlap(D0, Idx1))
2212 return true;
2213 }
2214 return false;
2215}
2216
2217bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2218 const MachineInstr &WMMA, const MachineInstr &MI) const {
2219 // WMMA writes, VALU reads.
2220 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2221 for (const MachineOperand &ValuUse : MI.explicit_uses()) {
2222 if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
2223 return true;
2224 }
2225
2226 // WMMA reads or writes, VALU writes.
2227 Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
2228 Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
2229 SmallVector<Register, 4> WMMARegs({D0, A0, B0});
2230
2231 if (SIInstrInfo::isSWMMAC(WMMA)) {
2232 Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
2233 WMMARegs.push_back(Idx0);
2234 }
2235
2236 for (const MachineOperand &ValuDef : MI.defs()) {
2237 Register VDstReg = ValuDef.getReg();
2238 for (Register WMMAReg : WMMARegs) {
2239 if (TRI.regsOverlap(VDstReg, WMMAReg))
2240 return true;
2241 }
2242 }
2243 return false;
2244}
2245
2246bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
2247 const MachineInstr &MI) const {
2248 // I is the potential WMMA hazard source, MI is the instruction being checked
2249 // for hazard.
2250 if (!TII.isXDLWMMA(I))
2251 return false;
2252
2253 // Dispatch based on MI type
2254 if (TII.isXDLWMMA(MI))
2255 return hasWMMAToWMMARegOverlap(I, MI);
2257 return hasWMMAToVALURegOverlap(I, MI);
2258
2259 return false;
2260}
2261
2262bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
2263 bool IncludeSubloops) {
2264 // Scan loop for any WMMA that hazards MI.
2265 // TODO: Avoid full loop scan when WMMA is beyond VALU distance.
2266 for (MachineBasicBlock *MBB : L->getBlocks()) {
2267 if (!IncludeSubloops && MLI->getLoopFor(MBB) != L)
2268 continue;
2269 for (MachineInstr &I : *MBB) {
2270 if (&I == MI)
2271 continue;
2272 if (isCoexecutionHazardFor(I, *MI))
2273 return true;
2274 }
2275 }
2276 return false;
2277}
2278
2279bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
2280 int WaitStatesNeeded) {
2281 if (!MLI)
2282 return false;
2283
2284 MachineLoop *L = MLI->getLoopFor(MI->getParent());
2285 if (!L) {
2286 ++NumWMMAHoistingBailed;
2287 return false;
2288 }
2289
2290 // If innermost loop has WMMA hazard, we can't hoist at all
2291 if (hasWMMAHazardInLoop(L, MI)) {
2292 ++NumWMMAHoistingBailed;
2293 return false;
2294 }
2295
2296 // Find outermost loop with no internal hazard
2297 MachineLoop *TargetLoop = L;
2298 while (MachineLoop *Parent = TargetLoop->getParentLoop()) {
2299 if (hasWMMAHazardInLoop(Parent, MI, false))
2300 break; // Parent has hazard in its own blocks, stop here
2301 TargetLoop = Parent; // Safe to hoist further out
2302 }
2303
2304 // Need valid preheader to insert V_NOPs
2305 MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
2306 if (!Preheader) {
2307 ++NumWMMAHoistingBailed;
2308 return false;
2309 }
2310
2311 LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
2312 << " V_NOPs from loop to " << printMBBReference(*Preheader)
2313 << "\n");
2314
2315 emitVNops(*Preheader, Preheader->getFirstTerminator(), WaitStatesNeeded,
2316 /*IsHoisting=*/true);
2317 NumWMMANopsHoisted += WaitStatesNeeded;
2318 return true;
2319}
2320
2321bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
2322 int WaitStatesNeeded = checkWMMACoexecutionHazards(MI);
2323 if (WaitStatesNeeded <= 0)
2324 return false;
2325
2326 if (EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
2327 return true;
2328
2329 emitVNops(*MI->getParent(), MI->getIterator(), WaitStatesNeeded);
2330 return true;
2331}
2332
// Works around the 64-bit shift bug reported by ST.hasShift64HighRegBug() for
// V_LSHLREV_B64 / V_LSHRREV_B64 / V_ASHRREV_I64 when the shift amount (src0)
// is a VGPR that is the last register of an 8-register block.
// The shift amount is moved to a different register, either with a V_MOV into
// the low half of the destination or with V_SWAP_B32 pairs around MI.
// Returns true iff MI or its surroundings were modified.
2333bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
2334 if (!ST.hasShift64HighRegBug())
2335 return false;
2336 assert(!ST.hasExtendedWaitCounts());
2337
2338 switch (MI->getOpcode()) {
2339 default:
2340 return false;
2341 case AMDGPU::V_LSHLREV_B64_e64:
2342 case AMDGPU::V_LSHRREV_B64_e64:
2343 case AMDGPU::V_ASHRREV_I64_e64:
2344 break;
2345 }
2346
// For the *REV shifts src0 is the shift amount.
2347 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
2348 if (!Amt->isReg())
2349 return false;
2350
2351 Register AmtReg = Amt->getReg();
2352 const MachineRegisterInfo &MRI = MF.getRegInfo();
2353 // Check if this is a last VGPR in the allocation block.
2354 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2355 return false;
2356
// Nothing to do when the register following the amount is already in use
// (VGPR255 has no following register, so it always falls through).
2357 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
2358 return false;
2359
2360 assert(ST.needsAlignedVGPRs());
// The AmtReg +/- 1 arithmetic below relies on consecutive VGPR numbering.
2361 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2362
2363 const DebugLoc &DL = MI->getDebugLoc();
2364 MachineBasicBlock *MBB = MI->getParent();
2365 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
2366
2367 // In:
2368 //
2369 // Dst = shiftrev64 Amt, Src1
2370 //
2371 // if Dst!=Src1 then avoid the bug with:
2372 //
2373 // Dst.sub0 = Amt
2374 // Dst = shift64 Dst.sub0, Src1
2375
2376 Register DstReg = MI->getOperand(0).getReg();
2377 if (!Src1->isReg() || Src1->getReg() != DstReg) {
2378 Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
// The inserted V_MOV is itself run through the hazard recognizer.
2379 runOnInstruction(
2380 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo).add(*Amt));
2381 Amt->setReg(DstLo);
2382 Amt->setIsKill(true);
2383 return true;
2384 }
2385
// Dst == Src1: pick a replacement register that MI neither reads nor writes.
// When MI also writes AmtReg ("Overlapped"), a 64-bit aligned pair is picked
// so that both the amount and the register below it can be swapped out.
// NOTE(review): if the loop finds no candidate, NewReg stays
// default-constructed and is used below unchecked -- confirm this cannot
// happen in practice.
2386 bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);
2387 Register NewReg;
2388 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2389 : AMDGPU::VGPR_32RegClass) {
2390 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
2391 NewReg = Reg;
2392 break;
2393 }
2394 }
2395
// In the overlapped case the amount goes into the high half (sub1) of the new
// pair and AmtReg - 1 into the low half (sub0).
2396 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2397 : NewReg;
2398 Register NewAmtLo;
2399
2400 if (Overlapped)
2401 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2402
2403 // Insert a full wait count because found register might be pending a wait.
2404 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
2405 .addImm(0);
2406
2407 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
2408 if (Overlapped)
2409 runOnInstruction(
2410 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
2411 .addDef(AmtReg - 1)
2412 .addReg(AmtReg - 1, RegState::Undef)
2413 .addReg(NewAmtLo, RegState::Undef));
2414 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
2415 .addDef(AmtReg)
2416 .addReg(AmtReg, RegState::Undef)
2417 .addReg(NewAmt, RegState::Undef));
2418
2419 // Instructions emitted after the current instruction will be processed by the
2420 // parent loop of the hazard recognizer in a natural way.
// These swaps after MI restore the original register contents.
2421 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2422 AmtReg)
2423 .addDef(NewAmt)
2424 .addReg(NewAmt)
2425 .addReg(AmtReg);
2426 if (Overlapped)
2427 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2428 AmtReg - 1)
2429 .addDef(NewAmtLo)
2430 .addReg(NewAmtLo)
2431 .addReg(AmtReg - 1);
2432
2433 // Re-running hazard recognizer on the modified instruction is not necessary,
2434 // inserted V_SWAP_B32 has already both read and write new registers so
2435 // hazards related to these register has already been handled.
2436 Amt->setReg(NewAmt);
2437 Amt->setIsKill(false);
2438 // We do not update liveness, so verifier may see it as undef.
2439 Amt->setIsUndef();
// In the overlapped case MI's destination and src1 are redirected to the new
// register pair as well.
2440 if (Overlapped) {
2441 MI->getOperand(0).setReg(NewReg);
2442 Src1->setReg(NewReg);
2443 Src1->setIsKill(false);
2444 Src1->setIsUndef();
2445 }
2446
2447 return true;
2448}
2449
// Returns the number of wait states still required before MI because of the
// NSA-to-VMEM bug (ST.hasNSAtoVMEMBug()): 1 minus the distance to a preceding
// MIMG instruction using the GFX10 NSA encoding with an encoded size of at
// least 16 bytes. Only instructions whose `offset` immediate has bit 1 or 2
// set are affected.
2450int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) const {
2451 int NSAtoVMEMWaitStates = 1;
2452
2453 if (!ST.hasNSAtoVMEMBug())
2454 return 0;
2455
// NOTE(review): source line 2456 (the condition guarding this early return) is
// absent from this listing; restore it from the upstream file.
2457 return 0;
2458
2459 const SIInstrInfo *TII = ST.getInstrInfo();
2460 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
// Mask 6 selects bits 1 and 2 of the offset immediate.
2461 if (!Offset || (Offset->getImm() & 6) == 0)
2462 return 0;
2463
2464 auto IsHazardFn = [TII](const MachineInstr &I) {
2465 if (!SIInstrInfo::isMIMG(I))
2466 return false;
2467 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2468 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2469 TII->getInstSizeInBytes(I) >= 16;
2470 };
2471
// The search is limited to a single wait state.
2472 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2473}
2474
2475int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2476 MachineInstr *MI) const {
2477 int FPAtomicToDenormModeWaitStates = 3;
2478
2479 if (!ST.hasFPAtomicToDenormModeHazard())
2480 return 0;
2481 assert(!ST.hasExtendedWaitCounts());
2482
2483 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2484 return 0;
2485
2486 auto IsHazardFn = [](const MachineInstr &I) {
2487 if (!SIInstrInfo::isVMEM(I))
2488 return false;
2489 return SIInstrInfo::isFPAtomic(I);
2490 };
2491
2492 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2493 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2494 return true;
2495
2496 return SIInstrInfo::isWaitcnt(MI.getOpcode());
2497 };
2498
2499 return FPAtomicToDenormModeWaitStates -
2500 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2501}
2502
// Dispatches the MAI hazard check for MI to checkMAIHazards90A() when the
// subtarget has GFX90A instructions and to checkMAIHazards908() otherwise.
2503int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) const {
// NOTE(review): source line 2504 is absent from this listing; check the
// upstream file for the statement that belongs here.
2505
2506 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2507}
2508
2509int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) const {
2510 // Early exit if no padding is requested.
2511 if (MFMAPaddingRatio == 0)
2512 return 0;
2513
2514 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2515 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2516 return 0;
2517
2518 int NeighborMFMALatency = 0;
2519 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2520 this](const MachineInstr &MI) {
2521 if (!SIInstrInfo::isMFMA(MI))
2522 return false;
2523
2524 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2525 return true;
2526 };
2527
2528 const int MaxMFMAPipelineWaitStates = 16;
2529 int WaitStatesSinceNeighborMFMA =
2530 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2531
2532 int NeighborMFMAPaddingNeeded =
2533 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2534 WaitStatesSinceNeighborMFMA;
2535
2536 return std::max(0, NeighborMFMAPaddingNeeded);
2537}
2538
// Computes the number of wait states required before the MAI instruction MI
// (MFMA, v_accvgpr_read or v_accvgpr_write) on subtargets without GFX90A
// instructions. The result is the maximum over several hazards:
//  - a VALU (or inline asm) write of EXEC or of a VGPR that MI reads,
//  - an MFMA write of an AGPR that overlaps one of MI's AGPR operands,
//  - a v_accvgpr_write of an AGPR that overlaps one of MI's AGPR operands,
//  - for v_accvgpr_write: an earlier MFMA reading the written AGPR as src2,
//  - the optional MFMA padding from checkMFMAPadding().
2539int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) const {
2540 int WaitStatesNeeded = 0;
2541 unsigned Opc = MI->getOpcode();
2542
// Inline asm is treated like a VALU here.
2543 auto IsVALUFn = [](const MachineInstr &MI) {
2544 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2545 };
2546
2547 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2548 const int LegacyVALUWritesVGPRWaitStates = 2;
2549 const int VALUWritesExecWaitStates = 4;
2550 const int MaxWaitStates = 4;
2551
2552 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2553 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2554 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2555
// The VGPR-use check can require at most 2 wait states, so it is skipped when
// the EXEC hazard already demands the maximum.
2556 if (WaitStatesNeeded < MaxWaitStates) {
2557 for (const MachineOperand &Use : MI->explicit_uses()) {
// Shadows the outer MaxWaitStates (4) within this loop body.
2558 const int MaxWaitStates = 2;
2559
2560 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2561 continue;
2562
2563 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2564 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2565 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2566
2567 if (WaitStatesNeeded == MaxWaitStates)
2568 break;
2569 }
2570 }
2571 }
2572
// AGPR operand hazards. Definitions are only considered for v_accvgpr_write.
2573 for (const MachineOperand &Op : MI->explicit_operands()) {
2574 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2575 continue;
2576
2577 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2578 continue;
2579
2580 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2581 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2582 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2583 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2584 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2585 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2586 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2587 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2588 const int MaxWaitStates = 18;
2589 Register Reg = Op.getReg();
2590 unsigned HazardDefLatency = 0;
2591
// Matches an MFMA whose destination overlaps Reg without being exactly Reg.
// HazardDefLatency is raised for every such MFMA that is examined, including
// ones whose destination turns out not to overlap.
2592 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2593 this](const MachineInstr &MI) {
2594 if (!SIInstrInfo::isMFMA(MI))
2595 return false;
2596 Register DstReg = MI.getOperand(0).getReg();
2597 if (DstReg == Reg)
2598 return false;
2599 HazardDefLatency =
2600 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2601 return TRI.regsOverlap(DstReg, Reg);
2602 };
2603
2604 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2605 MaxWaitStates);
// Default requirement is the SrcA/B value; it is overridden for src2 and for
// v_accvgpr_read / v_accvgpr_write, where it depends on the MFMA latency.
2606 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2607 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2608 int OpNo = Op.getOperandNo();
2609 if (OpNo == SrcCIdx) {
2610 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2611 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
// Latencies 2 / 8 / 16 select the 4x4 / 16x16 / 32x32 constants; any other
// latency falls back to the 32x32 value.
2612 switch (HazardDefLatency) {
2613 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2614 break;
2615 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2616 break;
2617 case 16: [[fallthrough]];
2618 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2619 break;
2620 }
2621 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2622 switch (HazardDefLatency) {
2623 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2624 break;
2625 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2626 break;
2627 case 16: [[fallthrough]];
2628 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2629 break;
2630 }
2631 }
2632
2633 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2634 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2635
2636 if (WaitStatesNeeded == MaxWaitStates)
2637 return WaitStatesNeeded; // Early exit.
2638
// Second hazard for the same operand: an earlier v_accvgpr_write whose
// destination overlaps Reg.
2639 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2640 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2641 return false;
2642 Register DstReg = MI.getOperand(0).getReg();
2643 return TRI.regsOverlap(Reg, DstReg);
2644 };
2645
2646 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2647 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2648 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2649 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2650 if (OpNo == SrcCIdx)
2651 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2652 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2653 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2654
2655 WaitStatesNeededForUse = NeedWaitStates -
2656 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2657 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2658
2659 if (WaitStatesNeeded == MaxWaitStates)
2660 return WaitStatesNeeded; // Early exit.
2661 }
2662
// WAR: v_accvgpr_write overwriting an AGPR that an earlier MFMA reads as src2.
2663 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2664 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2665 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2666 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2667 const int MaxWaitStates = 13;
2668 Register DstReg = MI->getOperand(0).getReg();
2669 unsigned HazardDefLatency = 0;
2670
2671 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2672 this](const MachineInstr &MI) {
2673 if (!SIInstrInfo::isMFMA(MI))
2674 return false;
2675 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2676 HazardDefLatency =
2677 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2678 return TRI.regsOverlap(Reg, DstReg);
2679 };
2680
2681 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2682 int NeedWaitStates;
2683 switch (HazardDefLatency) {
2684 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2685 break;
2686 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2687 break;
2688 case 16: [[fallthrough]];
2689 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2690 break;
2691 }
2692
2693 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2694 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2695 }
2696
2697 // Pad neighboring MFMA with noops for better inter-wave performance.
2698 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2699
2700 return WaitStatesNeeded;
2701}
2702
// Wait-state helper: returns NumPasses + 1, plus one more when IsGFX950 is
// set, as tabulated in the body.
// NOTE(review): source line 2704 (the function name and its first parameter,
// which the body refers to as NumPasses) is absent from this listing.
2703static int
2705 bool IsGFX950) {
2706 // xdl def cycles | gfx940 | gfx950
2707 // 2 pass | 3 4
2708 // 4 pass | 5 6
2709 // 8 pass | 9 10
2710 // 16 pass | 17 18
2711 return NumPasses + 1 + IsGFX950;
2712}
2713
// Wait-state helper: returns NumPasses + 1, plus one more when IsGFX950 is set
// and NumPasses is not 2, as tabulated in the body.
// NOTE(review): source line 2715 (the function name and its first parameter,
// which the body refers to as NumPasses) is absent from this listing.
2714static int
2716 bool IsGFX950) {
2717 // xdl def cycles | gfx940 | gfx950
2718 // 2 pass | 3 3
2719 // 4 pass | 5 6
2720 // 8 pass | 9 10
2721 // 16 pass | 17 18
2722 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2723}
2724
// Wait-state helper: the required wait states equal the number of passes.
// NOTE(review): source line 2726 (the function name and parameter list, which
// the body refers to as NumPasses) is absent from this listing.
2725static int
2727 // 2 pass -> 2
2728 // 4 pass -> 4
2729 // 8 pass -> 8
2730 // 16 pass -> 16
2731 return NumPasses;
2732}
2733
// Wait-state helper: the required wait states are the number of passes plus 2.
// NOTE(review): source line 2735 (the function name and parameter list, which
// the body refers to as NumPasses) is absent from this listing.
2734static int
2736 // 2 pass -> 4
2737 // 4 pass -> 6
2738 // 8 pass -> 10
2739 // 16 pass -> 18
2740 return NumPasses + 2;
2741}
2742
2744 bool IsGFX950) {
// NOTE(review): source line 2743 (return type, function name and the first
// parameter, which the body refers to as NumPasses) is absent from this
// listing.
// Returns NumPasses + 3, plus one more when IsGFX950 is set and NumPasses is
// not 2, as tabulated below.
2745 // xdl def cycles | gfx942 | gfx950
2746 // 2 pass | 5 5
2747 // 4 pass | 7 8
2748 // 8 pass | 11 12
2749 // 16 pass | 19 20
2750 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2751}
2752
// Computes the number of wait states required before the MFMA instruction MI
// on subtargets with GFX90A instructions. Non-MFMA instructions need none.
// The result is the maximum over: a VALU write of EXEC, a VALU write of a
// register MI reads, an earlier MFMA whose destination overlaps a register MI
// reads (the requirement depends on which operand overlaps and on the kind of
// the earlier MFMA), and the optional padding from checkMFMAPadding().
// NOTE(review): several source lines of this function are absent from this
// listing (each gap is marked below); restore them from the upstream file.
2753int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) const {
2754 int WaitStatesNeeded = 0;
2755 unsigned Opc = MI->getOpcode();
2756
2757 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
// NOTE(review): source line 2758 (the lambda body) is absent.
2759 };
2760
2761 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
// NOTE(review): source lines 2762-2763 (the lambda body) are absent.
2764 };
2765
2766 if (!SIInstrInfo::isMFMA(*MI))
2767 return WaitStatesNeeded;
2768
2769 const int VALUWritesExecWaitStates = 4;
2770 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2771 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2772 VALUWritesExecWaitStates);
2773 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2774
2775 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2776
2777 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2778 for (const MachineOperand &Use : MI->explicit_uses()) {
2779 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2780 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2781 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2782 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2783 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2784 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2785 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2786 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2787 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2788 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2789 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2790 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2791 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2792 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2793 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2794 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2795 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2796 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2797 const int MaxWaitStates = 19;
2798
2799 if (!Use.isReg())
2800 continue;
2801 Register Reg = Use.getReg();
// FullReg and MI1 are outputs of IsOverlappedMFMAFn; they are only read after
// the search below has reported a hit.
2802 bool FullReg;
2803 const MachineInstr *MI1;
2804
// Matches an earlier MFMA whose destination overlaps Reg; records that MFMA
// in MI1 and whether its destination is exactly Reg in FullReg.
2805 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2806 this](const MachineInstr &MI) {
2807 if (!SIInstrInfo::isMFMA(MI))
2808 return false;
2809 Register DstReg = MI.getOperand(0).getReg();
2810 FullReg = (DstReg == Reg);
2811 MI1 = &MI;
2812 return TRI.regsOverlap(DstReg, Reg);
2813 };
2814
2815 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2816 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2817 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2818
// INT_MAX means no overlapping MFMA definition was found for this operand.
2819 int NumWaitStates =
2820 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2821 if (NumWaitStates == std::numeric_limits<int>::max())
2822 continue;
2823
2824 int OpNo = Use.getOperandNo();
2825 unsigned Opc1 = MI1->getOpcode();
2826 int NeedWaitStates = 0;
// The overlapping operand is src2 of MI.
2827 if (OpNo == SrcCIdx) {
2828 if (!SIInstrInfo::isDGEMM(Opc) &&
2829 (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opc1))) {
2830 NeedWaitStates = 0;
2831 } else if (FullReg) {
2832 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2833 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2834 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2835 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2836 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2837 else if (ST.hasGFX940Insts() &&
2838 TSchedModel.computeInstrLatency(MI1) == 2)
2839 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2840 } else {
2841 switch (Opc1) {
2842 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2843 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2844 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2845 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2846 if (!TII.isXDL(*MI))
2847 NeedWaitStates =
2848 ST.hasGFX950Insts()
2849 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2850 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2851 break;
2852 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2853 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2854 if (!TII.isXDL(*MI))
2855 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2856 break;
2857 default:
// The scheduling-model latency of the earlier MFMA is used as its number of
// passes.
2858 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2859 if (ST.hasGFX940Insts()) {
2860 if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
2861 break;
2862
// NOTE(review): source lines 2866, 2868 and 2870 (the names of the helper
// functions called in this expression) are absent.
2863 NeedWaitStates =
2864 TII.isXDL(*MI1)
2865 ? (TII.isXDL(*MI)
2867 NumPasses, ST.hasGFX950Insts())
2869 NumPasses, ST.hasGFX950Insts()))
2871 NumPasses);
2872 break;
2873 }
2874
2875 switch (NumPasses) {
2876 case 2:
// NOTE(review): source line 2878 (the condition of this conditional
// expression) is absent; the same applies to lines 2884 and 2890 below.
2877 NeedWaitStates =
2879 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2880 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2881 break;
2882 case 8:
2883 NeedWaitStates =
2885 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2886 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2887 break;
2888 case 16:
2889 NeedWaitStates =
2891 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2892 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2893 break;
2894 default:
2895 llvm_unreachable("unexpected number of passes");
2896 }
2897 }
2898 }
2899 } else {
// The overlapping operand is not src2 of MI.
2900 switch (Opc1) {
2901 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2902 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2903 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2904 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2905 NeedWaitStates =
2906 ST.hasGFX950Insts()
2907 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2908 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2909 break;
2910 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2911 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2912 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2913 break;
2914 default:
2915 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2916
2917 if (ST.hasGFX940Insts()) {
// NOTE(review): source lines 2920 and 2922 (the names of the helper functions
// called in this expression) are absent.
2918 NeedWaitStates =
2919 TII.isXDL(*MI1)
2921 NumPasses, ST.hasGFX950Insts())
2923 NumPasses);
2924 break;
2925 }
2926
2927 switch (NumPasses) {
2928 case 2:
2929 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2930 break;
2931 case 4:
2932 llvm_unreachable("unexpected number of passes for mfma");
2933 case 8:
2934 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2935 break;
2936 case 16:
2937 default:
2938 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2939 }
2940 }
2941 }
// This hazard cannot raise the result if the requirement is already covered.
2942 if (WaitStatesNeeded >= NeedWaitStates)
2943 continue;
2944
2945 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2946 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2947
2948 if (WaitStatesNeeded == MaxWaitStates)
2949 break;
2950 }
2951
2952 // Pad neighboring MFMA with noops for better inter-wave performance.
2953 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2954
2955 return WaitStatesNeeded;
2956}
2957
// Computes the number of wait states required before MI because of VGPRs it
// reads that were produced by v_accvgpr_read, on subtargets with MAI
// instructions but without GFX90A instructions. Returns 0 on all other
// subtargets.
2958int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) const {
2959 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2960 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2961 return 0;
2962
2963 int WaitStatesNeeded = 0;
2964
2965 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2966 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2967 };
2968
// Only explicit VGPR uses of MI are examined.
2969 for (const MachineOperand &Op : MI->explicit_uses()) {
2970 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2971 continue;
2972
2973 Register Reg = Op.getReg();
2974
2975 const int AccVgprReadLdStWaitStates = 2;
2976 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2977 const int MaxWaitStates = 2;
2978
// Hazard 1: Reg was defined by a v_accvgpr_read fewer than 2 wait states ago.
2979 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2980 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2981 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2982
2983 if (WaitStatesNeeded == MaxWaitStates)
2984 return WaitStatesNeeded; // Early exit.
2985
// Hazard 2: a v_accvgpr_read/write precedes MI and the nested search for a
// definition of Reg matching IsVALUFn succeeds within 2 wait states.
2986 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2987 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2988 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2989 return false;
2990 auto IsVALUFn = [](const MachineInstr &MI) {
// NOTE(review): source line 2991 (the lambda body) is absent from this
// listing; restore it from the upstream file.
2992 };
2993 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2994 std::numeric_limits<int>::max();
2995 };
2996
2997 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2998 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2999 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3000 }
3001
3002 return WaitStatesNeeded;
3003}
3004
3005int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) const {
3006 assert(!ST.hasVcmpxPermlaneHazard() &&
3007 "this is a different vcmpx+permlane hazard");
3008 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3009 const SIInstrInfo *TII = ST.getInstrInfo();
3010
3011 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
3012 return isVCmpXWritesExec(*TII, *TRI, MI);
3013 };
3014
3015 auto IsVALUFn = [](const MachineInstr &MI) {
3016 return SIInstrInfo::isVALU(MI);
3017 };
3018
3019 const int VCmpXWritesExecWaitStates = 4;
3020 const int VALUWritesVDstWaitStates = 2;
3021 int WaitStatesNeeded = 0;
3022
3023 for (const MachineOperand &Op : MI->explicit_uses()) {
3024 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
3025 continue;
3026 Register Reg = Op.getReg();
3027
3028 int WaitStatesSinceDef =
3029 VALUWritesVDstWaitStates -
3030 getWaitStatesSinceDef(Reg, IsVALUFn,
3031 /*MaxWaitStates=*/VALUWritesVDstWaitStates);
3032 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
3033 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
3034 break;
3035 }
3036
3037 int VCmpXHazardWaits =
3038 VCmpXWritesExecWaitStates -
3039 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
3040
3041 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
3042 return WaitStatesNeeded;
3043}
3044
/// Maps the number of passes of an SMFMA to the wait-state count listed in
/// the table below (NumPasses + 2).
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
3052
/// Maps the number of passes of an XDL instruction to the wait-state count
/// listed in the table below. On gfx950 every entry except the 2-pass one is
/// one wait state larger.
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass         |  5        5
  // 4 pass         |  7        8
  // 8 pass         |  11       12
  // 16 pass        |  19       20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
3062
/// Maps the number of passes of an XDL instruction to the wait-state count
/// listed in the table below. On gfx950 every entry except the 2-pass one is
/// one wait state larger.
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
                                                       bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass         |  5        5
  // 4 pass         |  7        8
  // 8 pass         |  11       12
  // 16 pass        |  19       20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
3072
/// Maps the number of passes of an SMFMA to the wait-state count listed in
/// the table below (NumPasses + 2).
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
3080
3081int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) const {
3082 if (!ST.hasGFX90AInsts())
3083 return 0;
3084
3085 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
3086 return SIInstrInfo::isDGEMM(MI.getOpcode());
3087 };
3088
3089 // This is checked in checkMAIHazards90A()
3090 if (SIInstrInfo::isMFMA(*MI))
3091 return 0;
3092
3093 const MachineRegisterInfo &MRI = MF.getRegInfo();
3094
3095 int WaitStatesNeeded = 0;
3096
3097 bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
3098 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
3099 bool IsVALU = SIInstrInfo::isVALU(*MI);
3100
3101 const MachineInstr *MFMA = nullptr;
3102 unsigned Reg;
3103 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3104 if (!SIInstrInfo::isMFMA(MI) ||
3105 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
3106 return false;
3107 MFMA = &MI;
3108 return true;
3109 };
3110
3111 const MachineInstr *DOT = nullptr;
3112 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
3113 if (!SIInstrInfo::isDOT(MI) ||
3114 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
3115 return false;
3116 DOT = &MI;
3117 return true;
3118 };
3119
3120 bool DGEMMAfterVALUWrite = false;
3121 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
3122 // Found DGEMM on reverse traversal to def.
3123 if (SIInstrInfo::isDGEMM(MI.getOpcode()))
3124 DGEMMAfterVALUWrite = true;
3125
3126 // Only hazard if register is defined by a VALU and a DGEMM is found after
3127 // after the def.
3128 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
3129 return false;
3130
3131 return true;
3132 };
3133
3134 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
3135 AMDGPU::OpName::src2);
3136
3137 if (IsMemOrExport || IsVALU) {
3138 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
3139 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
3140 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3141 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3142 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3143 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3144 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3145 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3146 const int DotWriteSameDotReadSrcAB = 3;
3147 const int DotWriteDifferentVALURead = 3;
3148 const int DMFMABetweenVALUWriteVMEMRead = 2;
3149 const int MaxWaitStates = 19;
3150
3151 for (const MachineOperand &Use : MI->explicit_uses()) {
3152 if (!Use.isReg())
3153 continue;
3154 Reg = Use.getReg();
3155
3156 DOT = nullptr;
3157 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3158 MaxWaitStates);
3159 if (DOT) {
3160 int NeedWaitStates = 0;
3161 if (DOT->getOpcode() == MI->getOpcode()) {
3162 if (&Use - &MI->getOperand(0) != SrcCIdx)
3163 NeedWaitStates = DotWriteSameDotReadSrcAB;
3164 } else {
3165 NeedWaitStates = DotWriteDifferentVALURead;
3166 }
3167
3168 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3169 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3170 }
3171
3172 // Workaround for HW data hazard bug observed only in GFX90A. When there
3173 // is a DGEMM instruction in-between a VALU and a VMEM instruction it
3174 // causes the SQ to incorrectly not insert two wait states between the two
3175 // instructions needed to avoid data hazard.
3176 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3177 DGEMMAfterVALUWrite = false;
3178 if (TRI.isVectorRegister(MRI, Reg)) {
3179 int WaitStatesNeededForUse =
3180 DMFMABetweenVALUWriteVMEMRead -
3181 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
3182 DMFMABetweenVALUWriteVMEMRead);
3183
3184 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3185 }
3186 }
3187
3188 MFMA = nullptr;
3189 WaitStatesSinceDef =
3190 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3191 if (!MFMA)
3192 continue;
3193
3194 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3195 int NumPasses = HazardDefLatency;
3196 int NeedWaitStates = MaxWaitStates;
3197
3198 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3199 switch (HazardDefLatency) {
3200 case 4:
3201 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3202 : DMFMA4x4WriteVgprVALUReadWaitStates;
3203 break;
3204 case 8:
3205 case 16:
3206 NeedWaitStates =
3207 IsMemOrExport
3208 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3209 : (ST.hasGFX950Insts()
3210 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3211 : DMFMA16x16WriteVgprVALUReadWaitStates);
3212 break;
3213 default:
3214 llvm_unreachable("unexpected dgemm");
3215 }
3216 } else if (ST.hasGFX940Insts()) {
3217 NeedWaitStates =
3218 TII.isXDL(*MFMA)
3220 NumPasses, ST.hasGFX950Insts())
3222 NumPasses);
3223 } else {
3224 switch (HazardDefLatency) {
3225 case 2:
3226 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3227 break;
3228 case 8:
3229 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3230 break;
3231 case 16:
3232 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3233 break;
3234 default:
3235 llvm_unreachable("unexpected number of passes for mfma");
3236 }
3237 }
3238
3239 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3240 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3241
3242 if (WaitStatesNeeded == MaxWaitStates)
3243 break;
3244 }
3245 }
3246
3247 unsigned Opc = MI->getOpcode();
3248 const int DMFMAToFMA64WaitStates = 2;
3249 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
3250 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
3251 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3252 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3253 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3254 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3255 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3256 }
3257
3258 if (!IsVALU && !IsMemOrExport)
3259 return WaitStatesNeeded;
3260
3261 for (const MachineOperand &Def : MI->defs()) {
3262 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3263 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3264 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3265 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3266 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3267 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3268 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3269 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3270 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3271 const int DotWriteDifferentVALUWrite = 3;
3272 const int MaxWaitStates = 19;
3273 const int MaxWarWaitStates = 15;
3274
3275 Reg = Def.getReg();
3276
3277 DOT = nullptr;
3278 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3279 MaxWaitStates);
3280 if (DOT && DOT->getOpcode() != MI->getOpcode())
3281 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3282 WaitStatesSinceDef);
3283
3284 MFMA = nullptr;
3285 WaitStatesSinceDef =
3286 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3287 if (MFMA) {
3288 int NeedWaitStates = MaxWaitStates;
3289 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
3290
3291 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3292 switch (NumPasses) {
3293 case 4:
3294 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3295 break;
3296 case 8:
3297 case 16:
3298 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3299 break;
3300 default:
3301 llvm_unreachable("unexpected number of cycles for dgemm");
3302 }
3303 } else if (ST.hasGFX940Insts()) {
3304 NeedWaitStates =
3305 TII.isXDL(*MFMA)
3307 NumPasses, ST.hasGFX950Insts())
3309 } else {
3310 switch (NumPasses) {
3311 case 2:
3312 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3313 break;
3314 case 8:
3315 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3316 break;
3317 case 16:
3318 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3319 break;
3320 default:
3321 llvm_unreachable("Unexpected number of passes for mfma");
3322 }
3323 }
3324
3325 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3326 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3327
3328 if (WaitStatesNeeded == MaxWaitStates)
3329 break;
3330 }
3331
3332 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3333 if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
3334 !MI.readsRegister(Reg, &TRI))
3335 return false;
3336
3337 if (ST.hasGFX940Insts() && !TII.isXDL(MI))
3338 return false;
3339
3340 const MachineOperand *SrcC =
3341 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
3342 assert(SrcC);
3343 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
3344 return false;
3345
3346 MFMA = &MI;
3347 return true;
3348 };
3349
3350 MFMA = nullptr;
3351 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3352 MaxWarWaitStates);
3353 if (!MFMA)
3354 continue;
3355
3356 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3357 int NeedWaitStates = MaxWaitStates;
3358 switch (HazardDefLatency) {
3359 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3360 break;
3361 case 4: assert(ST.hasGFX940Insts());
3362 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3363 break;
3364 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3365 break;
3366 case 16: [[fallthrough]];
3367 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3368 break;
3369 }
3370
3371 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3372 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3373 }
3374
3375 return WaitStatesNeeded;
3376}
3377
3379 if (!SU->isInstr())
3380 return false;
3381
3382 const MachineInstr *MAI = nullptr;
3383
3384 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3385 MAI = nullptr;
3387 MAI = &MI;
3388 return MAI != nullptr;
3389 };
3390
3391 MachineInstr *MI = SU->getInstr();
3392 if (IsMFMAFn(*MI)) {
3393 int W = getWaitStatesSince(IsMFMAFn, 16);
3394 if (MAI)
3395 return W < (int)TSchedModel.computeInstrLatency(MAI);
3396 }
3397
3398 return false;
3399}
3400
3401// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3402// insertion of a new instruction.
3403static void updateGetPCBundle(MachineInstr *NewMI) {
3404 if (!NewMI->isBundled())
3405 return;
3406
3407 // Find start of bundle.
3408 auto I = NewMI->getIterator();
3409 while (I->isBundledWithPred())
3410 I--;
3411 if (I->isBundle())
3412 I++;
3413
3414 // Bail if this is not an S_GETPC bundle.
3415 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3416 return;
3417
3418 // Update offsets of any references in the bundle.
3419 const unsigned NewBytes = 4;
3420 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3421 "Unexpected instruction insertion in bundle");
3422 auto NextMI = std::next(NewMI->getIterator());
3423 auto End = NewMI->getParent()->end();
3424 while (NextMI != End && NextMI->isBundledWithPred()) {
3425 for (auto &Operand : NextMI->operands()) {
3426 if (Operand.isGlobal())
3427 Operand.setOffset(Operand.getOffset() + NewBytes);
3428 }
3429 NextMI++;
3430 }
3431}
3432
3433bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
3434 if (!ST.hasVALUMaskWriteHazard())
3435 return false;
3436 assert(!ST.hasExtendedWaitCounts());
3437
3438 if (!ST.isWave64())
3439 return false;
3440
3441 const bool IsSALU = SIInstrInfo::isSALU(*MI);
3442 const bool IsVALU = SIInstrInfo::isVALU(*MI);
3443 if (!IsSALU && !IsVALU)
3444 return false;
3445
3446 // The hazard sequence is three instructions:
3447 // 1. VALU reads SGPR as mask
3448 // 2. VALU/SALU writes SGPR
3449 // 3. VALU/SALU reads SGPR
3450 // The hazard can expire if the distance between 2 and 3 is sufficient,
3451 // or (2) is VALU and (3) is SALU.
3452 // In practice this happens <10% of the time, hence always assume the hazard
3453 // exists if (1) and (2) are present to avoid searching all SGPR reads.
3454
3455 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3456 const MachineRegisterInfo &MRI = MF.getRegInfo();
3457
3458 auto IgnoreableSGPR = [](const Register Reg) {
3459 switch (Reg) {
3460 case AMDGPU::EXEC:
3461 case AMDGPU::EXEC_LO:
3462 case AMDGPU::EXEC_HI:
3463 case AMDGPU::M0:
3464 case AMDGPU::SGPR_NULL:
3465 case AMDGPU::SGPR_NULL64:
3466 case AMDGPU::SCC:
3467 return true;
3468 default:
3469 return false;
3470 }
3471 };
3472 auto IsVCC = [](const Register Reg) {
3473 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
3474 };
3475
3476 struct StateType {
3477 SmallSet<Register, 2> HazardSGPRs;
3478
3479 static unsigned getHashValue(const StateType &State) {
3480 return hash_combine_range(State.HazardSGPRs);
3481 }
3482 static bool isEqual(const StateType &LHS, const StateType &RHS) {
3483 return LHS.HazardSGPRs == RHS.HazardSGPRs;
3484 }
3485 };
3486
3487 SmallVector<const MachineInstr *> WaitInstrs;
3488 bool HasSGPRRead = false;
3489 StateType InitialState;
3490
3491 // Look for SGPR write.
3492 MachineOperand *HazardDef = nullptr;
3493 for (MachineOperand &Op : MI->operands()) {
3494 if (!Op.isReg())
3495 continue;
3496 if (Op.isDef() && HazardDef)
3497 continue;
3498
3499 Register Reg = Op.getReg();
3500 if (IgnoreableSGPR(Reg))
3501 continue;
3502 if (!IsVCC(Reg)) {
3503 if (Op.isImplicit())
3504 continue;
3505 if (!TRI->isSGPRReg(MRI, Reg))
3506 continue;
3507 }
3508 // Also check for SGPR reads.
3509 if (Op.isUse()) {
3510 HasSGPRRead = true;
3511 continue;
3512 }
3513
3514 assert(!HazardDef);
3515 HazardDef = &Op;
3516 }
3517
3518 if (!HazardDef)
3519 return false;
3520
3521 // Setup to track writes to individual SGPRs
3522 const Register HazardReg = HazardDef->getReg();
3523 if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
3524 InitialState.HazardSGPRs.insert(HazardReg);
3525 } else {
3526 assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
3527 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
3528 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
3529 }
3530
3531 auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
3532 if (State.HazardSGPRs.empty())
3533 return HazardExpired;
3534
3535 switch (I.getOpcode()) {
3536 case AMDGPU::V_ADDC_U32_e32:
3537 case AMDGPU::V_ADDC_U32_dpp:
3538 case AMDGPU::V_CNDMASK_B16_t16_e32:
3539 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3540 case AMDGPU::V_CNDMASK_B16_t16_dpp:
3541 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3542 case AMDGPU::V_CNDMASK_B32_e32:
3543 case AMDGPU::V_CNDMASK_B32_dpp:
3544 case AMDGPU::V_DIV_FMAS_F32_e64:
3545 case AMDGPU::V_DIV_FMAS_F64_e64:
3546 case AMDGPU::V_SUBB_U32_e32:
3547 case AMDGPU::V_SUBB_U32_dpp:
3548 case AMDGPU::V_SUBBREV_U32_e32:
3549 case AMDGPU::V_SUBBREV_U32_dpp: {
3550 // These implicitly read VCC as mask source.
3551 return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
3552 }
3553 case AMDGPU::V_ADDC_U32_e64:
3554 case AMDGPU::V_ADDC_U32_e64_dpp:
3555 case AMDGPU::V_CNDMASK_B16_t16_e64:
3556 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3557 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3558 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3559 case AMDGPU::V_CNDMASK_B32_e64:
3560 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3561 case AMDGPU::V_SUBB_U32_e64:
3562 case AMDGPU::V_SUBB_U32_e64_dpp:
3563 case AMDGPU::V_SUBBREV_U32_e64:
3564 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3565 // Only check mask register overlaps.
3566 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
3567 assert(SSRCOp);
3568 bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
3569 return Result ? HazardFound : NoHazardFound;
3570 }
3571 default:
3572 return NoHazardFound;
3573 }
3574 };
3575
3576 const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
3578 0),
3579 0);
3580 auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
3581 switch (I.getOpcode()) {
3582 case AMDGPU::S_WAITCNT_DEPCTR:
3583 // Record mergable waits within region of instructions free of SGPR reads.
3584 if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
3585 (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
3586 WaitInstrs.push_back(&I);
3587 break;
3588 default:
3589 // Update tracking of SGPR reads and writes.
3590 for (auto &Op : I.operands()) {
3591 if (!Op.isReg())
3592 continue;
3593
3594 Register Reg = Op.getReg();
3595 if (IgnoreableSGPR(Reg))
3596 continue;
3597 if (!IsVCC(Reg)) {
3598 if (Op.isImplicit())
3599 continue;
3600 if (!TRI->isSGPRReg(MRI, Reg))
3601 continue;
3602 }
3603 if (Op.isUse()) {
3604 HasSGPRRead = true;
3605 continue;
3606 }
3607
3608 // Stop tracking any SGPRs with writes on the basis that they will
3609 // already have an appropriate wait inserted afterwards.
3611 for (Register SGPR : State.HazardSGPRs) {
3612 if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
3613 Found.push_back(SGPR);
3614 }
3615 for (Register SGPR : Found)
3616 State.HazardSGPRs.erase(SGPR);
3617 }
3618 break;
3619 }
3620 };
3621
3622 // Check for hazard
3623 if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
3624 MI->getParent(),
3625 std::next(MI->getReverseIterator())))
3626 return false;
3627
3628 // Compute counter mask
3629 unsigned DepCtr =
3630 IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
3631 : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
3632 : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);
3633
3634 // Try to merge previous waits into this one for regions with no SGPR reads.
3635 if (!WaitInstrs.empty()) {
3636 // Note: WaitInstrs contains const pointers, so walk backward from MI to
3637 // obtain a mutable pointer to each instruction to be merged.
3638 // This is expected to be a very short walk within the same block.
3639 SmallVector<MachineInstr *> ToErase;
3640 unsigned Found = 0;
3641 for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
3642 End = MI->getParent()->rend();
3643 Found < WaitInstrs.size() && It != End; ++It) {
3644 MachineInstr *WaitMI = &*It;
3645 // Find next wait instruction.
3646 if (std::as_const(WaitMI) != WaitInstrs[Found])
3647 continue;
3648 Found++;
3649 unsigned WaitMask = WaitMI->getOperand(0).getImm();
3650 assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
3651 DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
3652 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
3653 AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
3654 DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
3655 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
3656 AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
3657 DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
3658 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
3659 AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
3660 ToErase.push_back(WaitMI);
3661 }
3662 assert(Found == WaitInstrs.size());
3663 for (MachineInstr *WaitMI : ToErase)
3664 WaitMI->eraseFromParent();
3665 }
3666
3667 // Add s_waitcnt_depctr after SGPR write.
3668 auto NextMI = std::next(MI->getIterator());
3669 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3670 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3671 .addImm(DepCtr);
3672
3673 // SALU write may be s_getpc in a bundle.
3674 updateGetPCBundle(NewMI);
3675
3676 return true;
3677}
3678
3679static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3680 const SIInstrInfo &TII) {
3681 MachineBasicBlock &EntryMBB = MF->front();
3682 if (EntryMBB.begin() != EntryMBB.end()) {
3683 auto &EntryMI = *EntryMBB.begin();
3684 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3685 EntryMI.getOperand(0).getImm() >= Priority)
3686 return false;
3687 }
3688
3689 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3690 .addImm(Priority);
3691 return true;
3692}
3693
// Export-priority workaround for subtargets reporting
// hasRequiredExportPriority(). Depending on the opcode of MI this either
//  - ensures an entry S_SETPRIO for functions with calls (program-end opcodes),
//  - raises the immediate of an S_SETPRIO that is below NormalPriority, or
//  - after the last export of a run of exports, inserts
//    S_SETPRIO PostExportPriority, an S_WAITCNT_EXPCNT, two S_NOPs and an
//    S_SETPRIO NormalPriority (the wait and the final S_SETPRIO are omitted
//    when the next instruction is S_ENDPGM).
// Returns true if any instruction was inserted or modified.
3694bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3695 if (!ST.hasRequiredExportPriority())
3696 return false;
3697
3698 // Assume the following shader types will never have exports,
3699 // and avoid adding or adjusting S_SETPRIO.
3700 MachineBasicBlock *MBB = MI->getParent();
3701 MachineFunction *MF = MBB->getParent();
3702 auto CC = MF->getFunction().getCallingConv();
// NOTE(review): the `case` labels of this switch (listing lines 3704-3707)
// are not visible in this view; only their shared `return false` is.
3703 switch (CC) {
3708 return false;
3709 default:
3710 break;
3711 }
3712
3713 const int MaxPriority = 3;
3714 const int NormalPriority = 2;
3715 const int PostExportPriority = 0;
3716
3717 auto It = MI->getIterator();
3718 switch (MI->getOpcode()) {
3719 case AMDGPU::S_ENDPGM:
3720 case AMDGPU::S_ENDPGM_SAVED:
3721 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3722 case AMDGPU::SI_RETURN_TO_EPILOG:
3723 // Ensure shader with calls raises priority at entry.
3724 // This ensures correct priority if exports exist in callee.
3725 if (MF->getFrameInfo().hasCalls())
3726 return ensureEntrySetPrio(MF, NormalPriority, TII);
3727 return false;
3728 case AMDGPU::S_SETPRIO: {
3729 // Raise minimum priority unless in workaround.
3730 auto &PrioOp = MI->getOperand(0);
3731 int Prio = PrioOp.getImm();
// "In workaround" means: priority PostExportPriority directly after an export.
3732 bool InWA = (Prio == PostExportPriority) &&
3733 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3734 if (InWA || Prio >= NormalPriority)
3735 return false;
// Shift the requested priority up by NormalPriority, capped at MaxPriority.
3736 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3737 return true;
3738 }
3739 default:
// Everything below only applies to export instructions.
3740 if (!TII.isEXP(*MI))
3741 return false;
3742 break;
3743 }
3744
3745 // Check entry priority at each export (as there will only be a few).
3746 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3747 bool Changed = false;
// NOTE(review): listing line 3748 is not visible here; presumably it is the
// calling-convention guard the comment above refers to - confirm.
3749 Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3750
3751 auto NextMI = std::next(It);
3752 bool EndOfShader = false;
3753 if (NextMI != MBB->end()) {
3754 // Only need WA at end of sequence of exports.
3755 if (TII.isEXP(*NextMI))
3756 return Changed;
3757 // Assume appropriate S_SETPRIO after export means WA already applied.
3758 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3759 NextMI->getOperand(0).getImm() == PostExportPriority)
3760 return Changed;
3761 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3762 }
3763
3764 const DebugLoc &DL = MI->getDebugLoc();
3765
// All instructions below are inserted before NextMI, i.e. directly after the
// export, in the order they are built.
3766 // Lower priority.
3767 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3768 .addImm(PostExportPriority);
3769
3770 if (!EndOfShader) {
3771 // Wait for exports to complete.
3772 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3773 .addReg(AMDGPU::SGPR_NULL)
3774 .addImm(0);
3775 }
3776
3777 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3778 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3779
3780 if (!EndOfShader) {
3781 // Return to normal (higher) priority.
3782 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3783 .addImm(NormalPriority);
3784 }
3785
3786 return true;
3787}
3788
// For an instruction accepted by isSGetReg() that reads one of a fixed set of
// hardware registers, inserts `S_WAITCNT_DEPCTR 0` immediately before it.
// Returns true if the wait was inserted, false otherwise.
3789bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
3790 if (!isSGetReg(MI->getOpcode()))
3791 return false;
3792
3793 const SIInstrInfo *TII = ST.getInstrInfo();
// NOTE(review): the `case` labels naming the affected hardware registers
// (listing lines 3797-3800) are not visible in this view; only the `default`
// (no fix needed) and the shared `break` are.
3794 switch (getHWReg(TII, *MI)) {
3795 default:
3796 return false;
3801 break;
3802 }
3803
3804 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3805 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3806 .addImm(0);
3807 return true;
3808}
3809
// Surrounds a DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 with two S_WAITCNT_DEPCTR
// instructions: one inserted directly before MI and one directly after it.
// Returns true if MI had that opcode (and the waits were inserted).
3810bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
3811 if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3812 return false;
3813
3814 const SIInstrInfo *TII = ST.getInstrInfo();
// NOTE(review): the operand lines of both BuildMI calls (listing lines 3817
// and 3820) are not visible in this view, so the wait immediates cannot be
// stated here.
3815 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3816 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3818 BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
3819 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3821
3822 return true;
3823}
3824
// Inserts an S_WAITCNT_DEPCTR before MI when MI reads
// SRC_FLAT_SCRATCH_BASE_LO/HI (directly, or through an s_getreg of the
// corresponding hardware register) and SGPR102 / SGPR103 respectively was
// modified within the preceding FlatScrBaseWaitStates SGPR writes without an
// intervening qualifying S_WAITCNT_DEPCTR.
// Returns true if the wait was inserted.
3825bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
3826 // No reason to check this in pre-RA scheduling, SGPRs have to be allocated
3827 // for hazard to trigger.
3828 if (!IsHazardRecognizerMode)
3829 return false;
3830
3831 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3832 const SIInstrInfo *TII = ST.getInstrInfo();
3833 // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
3834 const int FlatScrBaseWaitStates = 10;
3835
3836 bool ReadsFlatScrLo =
3837 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3838 bool ReadsFlatScrHi =
3839 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3840 if (isSGetReg(MI->getOpcode())) {
// NOTE(review): the two `case` labels (listing lines 3844 and 3847) are not
// visible in this view; presumably they name the flat-scratch lo/hi hardware
// registers - confirm.
3841 switch (getHWReg(TII, *MI)) {
3842 default:
3843 break;
3845 ReadsFlatScrLo = true;
3846 break;
3848 ReadsFlatScrHi = true;
3849 break;
3850 }
3851 }
3852
3853 const MachineRegisterInfo &MRI = MF.getRegInfo();
3854
// Returns true if Reg is modified within the search window before MI.
3855 auto IsRegDefHazard = [&](Register Reg) -> bool {
3856 DenseSet<const MachineBasicBlock *> Visited;
3857 auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
3858 return MI.modifiesRegister(Reg, TRI);
3859 };
3860
3861 // This literally abuses the idea of waitstates. Instead of waitstates it
3862 // returns 1 for SGPR written and 0 otherwise.
3863 auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
3864 if (!TII->isSALU(MI) && !TII->isVALU(MI))
3865 return 0;
3866 for (const MachineOperand &MO : MI.all_defs()) {
3867 if (TRI->isSGPRReg(MRI, MO.getReg()))
3868 return 1;
3869 }
3870 return 0;
3871 };
3872
// The search stops once enough SGPR writes have been counted, or at an
// S_WAITCNT_DEPCTR.
// NOTE(review): the condition on `Wait` guarding `return true` (listing lines
// 3876-3877) is not visible in this view.
3873 auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
3874 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3875 unsigned Wait = MI.getOperand(0).getImm();
3878 return true;
3879 }
3880 return SgprWrites >= FlatScrBaseWaitStates;
3881 };
3882
3883 return ::getWaitStatesSince(
3884 IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
3885 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
3886 };
3887
// No fix is needed when, for both halves, the half is either not read, its
// SGPR is a constant physical register, or no recent definition was found.
3888 if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
3889 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3890 (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
3891 !IsRegDefHazard(AMDGPU::SGPR103)))
3892 return false;
3893
// NOTE(review): the operand of this S_WAITCNT_DEPCTR (listing lines
// 3896-3897) is not visible in this view.
3894 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3895 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3898 return true;
3899}
3900
3901bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3902 if (!isSSetReg(MI->getOpcode()) ||
3903 MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
3904 return false;
3905
3906 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3907 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3908 return true;
3909}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static cl::opt< bool > EnableWMMAVnopHoisting("amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden, cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"))
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand with a Dest sel fowarding issue Dst .
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isLdsDma(const MachineInstr &MI)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950)
static unsigned getWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII, const TargetSchedModel &SchedModel, const GCNSubtarget &ST)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static bool hasHazard(StateT InitialState, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *InitialMBB, MachineBasicBlock::const_reverse_instr_iterator InitialI)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool isCoexecutableVALUInst(const MachineInstr &MI)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden, cl::desc("Insert a s_nop x before every instruction"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static llvm::Error parse(GsymDataExtractor &Data, uint64_t BaseAddr, LineEntryCallback const &Callback)
Definition LineTable.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:484
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
unsigned get(InstCounterType T) const
BitVector & set()
Set all bits in the bitvector.
Definition BitVector.h:366
A debug info location.
Definition DebugLoc.h:124
std::pair< iterator, bool > insert_as(std::pair< KeyT, ValueT > &&KV, const LookupKeyT &Val)
Alternate version of insert() which allows a different, and possibly less expensive,...
Definition DenseMap.h:319
Implements a dense probed hash-table based set.
Definition DenseSet.h:289
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function...
Definition Function.h:272
unsigned getHazardWaitStates(MachineInstr *MI) const
Returns the number of wait states until all hazards for MI are resolved.
unsigned PreEmitNoopsCommon(MachineInstr *) const
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be scheduled.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
bool ShouldPreferAnother(SUnit *SU) const override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
GCNHazardRecognizer(const MachineFunction &MF, MachineLoopInfo *MLI=nullptr)
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
Instructions::const_reverse_iterator const_reverse_instr_iterator
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
Instructions::iterator instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction is part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool isConstantPhysReg(MCRegister PhysReg) const
Returns true if PhysReg is unallocatable and constant throughout the function.
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isWaitcnt(unsigned Opcode)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isMAI(const MCInstrDesc &Desc)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:490
Provide an instruction scheduling machine model to CodeGen passes.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
A range adaptor for a pair of iterators.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc)
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
@ Entry
Definition COFF.h:862
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
This namespace contains all of the command line option processing machinery.
Definition MCSchedule.h:35
initializer< Ty > init(const Ty &Val)
constexpr double e
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Wait
Definition Threading.h:60
constexpr RegState getDeadRegState(bool B)
Op::Description Desc
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
DWARFExpression::Operation Op
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:305
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:285
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
An information struct used to provide DenseMap with the various necessary components for a given valu...