1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
22
23using namespace llvm;
24
25namespace {
26
27struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
28 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
29
30 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
31 if (Arg.getAsInteger(0, Value))
32 return O.error("'" + Arg + "' value invalid for uint argument!");
33
34 if (Value > 100)
35 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
36
37 return false;
38 }
39};
40
41} // end anonymous namespace
42
43static cl::opt<unsigned, false, MFMAPaddingRatioParser>
44 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
45 cl::desc("Fill a percentage of the latency between "
46 "neighboring MFMA with s_nops."));
47
49 "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
50 cl::desc("Maximum function size for exhausive hazard search"));
51
52//===----------------------------------------------------------------------===//
53// Hazard Recognizer Implementation
54//===----------------------------------------------------------------------===//
55
56static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
57 const GCNSubtarget &ST);
58
59GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
60 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
61 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
62 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
63 UseVALUReadHazardExhaustiveSearch(false),
64 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
65 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
66 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
67}
68
69void GCNHazardRecognizer::Reset() {
70 EmittedInstrs.clear();
71}
72
73void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
74 EmitInstruction(SU->getInstr());
75}
76
77void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
78 CurrCycleInstr = MI;
79}
80
81static bool isDivFMas(unsigned Opcode) {
82 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
83}
84
85static bool isSGetReg(unsigned Opcode) {
86 return Opcode == AMDGPU::S_GETREG_B32;
87}
88
89static bool isSSetReg(unsigned Opcode) {
90 switch (Opcode) {
91 case AMDGPU::S_SETREG_B32:
92 case AMDGPU::S_SETREG_B32_mode:
93 case AMDGPU::S_SETREG_IMM32_B32:
94 case AMDGPU::S_SETREG_IMM32_B32_mode:
95 return true;
96 }
97 return false;
98}
99
100static bool isRWLane(unsigned Opcode) {
101 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
102}
103
104static bool isRFE(unsigned Opcode) {
105 return Opcode == AMDGPU::S_RFE_B64;
106}
107
108static bool isSMovRel(unsigned Opcode) {
109 switch (Opcode) {
110 case AMDGPU::S_MOVRELS_B32:
111 case AMDGPU::S_MOVRELS_B64:
112 case AMDGPU::S_MOVRELD_B32:
113 case AMDGPU::S_MOVRELD_B64:
114 return true;
115 default:
116 return false;
117 }
118}
119
120static bool isDGEMM(unsigned Opcode) {
121 return AMDGPU::getMAIIsDGEMM(Opcode);
122}
123
124static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
125 unsigned Opcode = MI.getOpcode();
126
127 if (!SIInstrInfo::isMAI(MI) ||
128 isDGEMM(Opcode) ||
129 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
130 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
131 return false;
132
133 if (!ST.hasGFX940Insts())
134 return true;
135
136 return AMDGPU::getMAIIsGFX940XDL(Opcode);
137}
138
139static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
140 const MachineInstr &MI) {
141 if (TII.isAlwaysGDS(MI.getOpcode()))
142 return true;
143
144 switch (MI.getOpcode()) {
145 case AMDGPU::S_SENDMSG:
146 case AMDGPU::S_SENDMSGHALT:
147 case AMDGPU::S_TTRACEDATA:
148 return true;
149 // These DS opcodes don't support GDS.
150 case AMDGPU::DS_NOP:
151 case AMDGPU::DS_PERMUTE_B32:
152 case AMDGPU::DS_BPERMUTE_B32:
153 return false;
154 default:
155 if (TII.isDS(MI.getOpcode())) {
156 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
157 AMDGPU::OpName::gds);
158 if (MI.getOperand(GDS).getImm())
159 return true;
160 }
161 return false;
162 }
163}
164
165static bool isPermlane(const MachineInstr &MI) {
166 unsigned Opcode = MI.getOpcode();
167 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE64_B32 ||
169 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
170 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
171 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
173 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
175 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;
176}
177
178static bool isLdsDma(const MachineInstr &MI) {
179 return SIInstrInfo::isVALU(MI) &&
180 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
181}
182
183static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
184 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
185 AMDGPU::OpName::simm16);
186 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
187}
188
189ScheduleHazardRecognizer::HazardType
190GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
191 MachineInstr *MI = SU->getInstr();
192 // If we are not in "HazardRecognizerMode" and therefore not being run from
193 // the scheduler, track possible stalls from hazards but don't insert noops.
194 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
195
196 if (MI->isBundle())
197 return NoHazard;
198
199 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
200 return HazardType;
201
202 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
203 return HazardType;
204
205 if (checkFPAtomicToDenormModeHazard(MI) > 0)
206 return HazardType;
207
208 if (ST.hasNoDataDepHazard())
209 return NoHazard;
210
211 // FIXME: Should flat be considered vmem?
212 if ((SIInstrInfo::isVMEM(*MI) ||
214 && checkVMEMHazards(MI) > 0)
215 return HazardType;
216
217 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
218 return HazardType;
219
220 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
221 return HazardType;
222
223 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
224 return HazardType;
225
226 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
227 return HazardType;
228
231 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
232 return HazardType;
233
234 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
235 return HazardType;
236
237 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
238 return HazardType;
239
240 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
241 return HazardType;
242
243 if (((ST.hasReadM0MovRelInterpHazard() &&
244 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
245 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
246 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
247 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
248 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
249 (ST.hasReadM0LdsDirectHazard() &&
250 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
251 checkReadM0Hazards(MI) > 0)
252 return HazardType;
253
254 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
255 return HazardType;
256
257 if ((SIInstrInfo::isVMEM(*MI) ||
258 SIInstrInfo::isFLAT(*MI) ||
259 SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
260 return HazardType;
261
262 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
263 return HazardType;
264
265 return NoHazard;
266}
267
268static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
269 unsigned Quantity) {
270 while (Quantity > 0) {
271 unsigned Arg = std::min(Quantity, 8u);
272 Quantity -= Arg;
273 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
274 .addImm(Arg - 1);
275 }
276}
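// Worked example (illustrative): a request for 10 wait states is split
// greedily into chunks of at most 8, so the helper above emits two
// instructions into the bundle:
//
//   s_nop 7    // immediate is (chunk - 1); covers 8 wait states
//   s_nop 1    // covers the remaining 2 wait states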
277
278unsigned
279GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
280 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
281 assert(TSchedModel.getWriteProcResBegin(SC) !=
282 TSchedModel.getWriteProcResEnd(SC));
283 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
284}
285
286void GCNHazardRecognizer::processBundle() {
287 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
288 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
289 // Check bundled MachineInstr's for hazards.
290 for (; MI != E && MI->isInsideBundle(); ++MI) {
291 CurrCycleInstr = &*MI;
292 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
293
294 if (IsHazardRecognizerMode) {
295 fixHazards(CurrCycleInstr);
296
297 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
298 }
299
300 // It's unnecessary to track more than MaxLookAhead instructions. Since we
301 // include the bundled MI directly after, only add a maximum of
302 // (MaxLookAhead - 1) noops to EmittedInstrs.
303 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
304 EmittedInstrs.push_front(nullptr);
305
306 EmittedInstrs.push_front(CurrCycleInstr);
307 EmittedInstrs.resize(MaxLookAhead);
308 }
309 CurrCycleInstr = nullptr;
310}
311
312void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
313 assert(IsHazardRecognizerMode);
314
315 unsigned NumPreNoops = PreEmitNoops(MI);
316 EmitNoops(NumPreNoops);
317 if (MI->isInsideBundle())
318 insertNoopsInBundle(MI, TII, NumPreNoops);
319 else
320 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
321 NumPreNoops);
323 AdvanceCycle();
324}
325
326unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
327 IsHazardRecognizerMode = true;
328 CurrCycleInstr = MI;
329 unsigned W = PreEmitNoopsCommon(MI);
330 fixHazards(MI);
331 CurrCycleInstr = nullptr;
332 return W;
333}
334
335unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
336 if (MI->isBundle())
337 return 0;
338
339 int WaitStates = 0;
340
341 if (SIInstrInfo::isSMRD(*MI))
342 return std::max(WaitStates, checkSMRDHazards(MI));
343
344 if (ST.hasNSAtoVMEMBug())
345 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
346
347 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
348
349 if (ST.hasNoDataDepHazard())
350 return WaitStates;
351
352 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
353 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
354
355 if (SIInstrInfo::isVALU(*MI))
356 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
357
358 if (SIInstrInfo::isDPP(*MI))
359 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
360
361 if (isDivFMas(MI->getOpcode()))
362 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
363
364 if (isRWLane(MI->getOpcode()))
365 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
366
369 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
370 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
371
372 if (MI->isInlineAsm())
373 return std::max(WaitStates, checkInlineAsmHazards(MI));
374
375 if (isSGetReg(MI->getOpcode()))
376 return std::max(WaitStates, checkGetRegHazards(MI));
377
378 if (isSSetReg(MI->getOpcode()))
379 return std::max(WaitStates, checkSetRegHazards(MI));
380
381 if (isRFE(MI->getOpcode()))
382 return std::max(WaitStates, checkRFEHazards(MI));
383
384 if ((ST.hasReadM0MovRelInterpHazard() &&
385 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
386 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
387 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
388 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
389 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
390 (ST.hasReadM0LdsDirectHazard() &&
391 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
392 return std::max(WaitStates, checkReadM0Hazards(MI));
393
394 if (SIInstrInfo::isMAI(*MI))
395 return std::max(WaitStates, checkMAIHazards(MI));
396
397 if (SIInstrInfo::isVMEM(*MI) ||
398 SIInstrInfo::isFLAT(*MI) ||
399 SIInstrInfo::isDS(*MI))
400 return std::max(WaitStates, checkMAILdStHazards(MI));
401
402 if (ST.hasGFX950Insts() && isPermlane(*MI))
403 return std::max(WaitStates, checkPermlaneHazards(MI));
404
405 return WaitStates;
406}
407
408void GCNHazardRecognizer::EmitNoop() {
409 EmittedInstrs.push_front(nullptr);
410}
411
412void GCNHazardRecognizer::AdvanceCycle() {
413 // When the scheduler detects a stall, it will call AdvanceCycle() without
414 // emitting any instructions.
415 if (!CurrCycleInstr) {
416 EmittedInstrs.push_front(nullptr);
417 return;
418 }
419
420 if (CurrCycleInstr->isBundle()) {
421 processBundle();
422 return;
423 }
424
425 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
426 if (!NumWaitStates) {
427 CurrCycleInstr = nullptr;
428 return;
429 }
430
431 // Keep track of emitted instructions
432 EmittedInstrs.push_front(CurrCycleInstr);
433
434 // Add a nullptr for each additional wait state after the first. Make sure
435 // not to add more than getMaxLookAhead() items to the list, since we
436 // truncate the list to that size right after this loop.
437 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
438 i < e; ++i) {
439 EmittedInstrs.push_front(nullptr);
440 }
441
442 // getMaxLookahead() is the largest number of wait states we will ever need
443 // to insert, so there is no point in keeping track of more than that many
444 // wait states.
445 EmittedInstrs.resize(getMaxLookAhead());
446
447 CurrCycleInstr = nullptr;
448}
449
450void GCNHazardRecognizer::RecedeCycle() {
451 llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
452}
453
454//===----------------------------------------------------------------------===//
455// Helper Functions
456//===----------------------------------------------------------------------===//
457
458using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
459
460using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
461using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
462
463// Search for a hazard in a block and its predecessors.
464template <typename StateT>
465static bool
466hasHazard(StateT State,
467 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
468 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
469 const MachineBasicBlock *MBB,
470 MachineBasicBlock::const_reverse_instr_iterator I,
471 DenseSet<const MachineBasicBlock *> &Visited) {
472 for (auto E = MBB->instr_rend(); I != E; ++I) {
473 // No need to look at parent BUNDLE instructions.
474 if (I->isBundle())
475 continue;
476
477 switch (IsHazard(State, *I)) {
478 case HazardFound:
479 return true;
480 case HazardExpired:
481 return false;
482 default:
483 // Continue search
484 break;
485 }
486
487 if (I->isInlineAsm() || I->isMetaInstruction())
488 continue;
489
490 UpdateState(State, *I);
491 }
492
493 for (MachineBasicBlock *Pred : MBB->predecessors()) {
494 if (!Visited.insert(Pred).second)
495 continue;
496
497 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
498 Visited))
499 return true;
500 }
501
502 return false;
503}
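// Note: hasHazard() walks the block in reverse from the given iterator and
// then recurses into every unvisited predecessor, passing StateT by value, so
// each control-flow path is evaluated with its own copy of the state until
// IsHazard reports either HazardFound or HazardExpired.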
504
505// Returns a minimum wait states since \p I walking all predecessors.
506// Only scans until \p IsExpired does not return true.
507// Can only be run in a hazard recognizer mode.
508static int getWaitStatesSince(
509 GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
510 MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
511 IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
512 GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
513 for (auto E = MBB->instr_rend(); I != E; ++I) {
514 // Don't add WaitStates for parent BUNDLE instructions.
515 if (I->isBundle())
516 continue;
517
518 if (IsHazard(*I))
519 return WaitStates;
520
521 if (I->isInlineAsm())
522 continue;
523
524 WaitStates += GetNumWaitStates(*I);
525
526 if (IsExpired(*I, WaitStates))
527 return std::numeric_limits<int>::max();
528 }
529
530 int MinWaitStates = std::numeric_limits<int>::max();
531 for (MachineBasicBlock *Pred : MBB->predecessors()) {
532 if (!Visited.insert(Pred).second)
533 continue;
534
535 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
536 IsExpired, Visited, GetNumWaitStates);
537
538 MinWaitStates = std::min(MinWaitStates, W);
539 }
540
541 return MinWaitStates;
542}
543
544static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
545 const MachineInstr *MI, IsExpiredFn IsExpired) {
546 DenseSet<const MachineBasicBlock *> Visited;
547 return getWaitStatesSince(IsHazard, MI->getParent(),
548 std::next(MI->getReverseIterator()),
549 0, IsExpired, Visited);
550}
551
552int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
553 if (IsHazardRecognizerMode) {
554 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
555 return WaitStates >= Limit;
556 };
557 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
558 }
559
560 int WaitStates = 0;
561 for (MachineInstr *MI : EmittedInstrs) {
562 if (MI) {
563 if (IsHazard(*MI))
564 return WaitStates;
565
566 if (MI->isInlineAsm())
567 continue;
568 }
569 ++WaitStates;
570
571 if (WaitStates >= Limit)
572 break;
573 }
574 return std::numeric_limits<int>::max();
575}
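// Sketch of how the check* routines below use this helper (illustrative):
// a rule that requires N wait states after a matching instruction computes
//
//   int WaitStatesNeeded = N - getWaitStatesSince(IsHazardFn, N);
//
// A non-positive result means intervening instructions already cover the
// hazard; a positive result is the number of wait states (s_nops) that still
// need to be inserted.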
576
577int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
578 IsHazardFn IsHazardDef,
579 int Limit) {
580 const SIRegisterInfo *TRI = ST.getRegisterInfo();
581
582 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
583 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
584 };
585
586 return getWaitStatesSince(IsHazardFn, Limit);
587}
588
589int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
590 int Limit) {
591 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
592 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
593 };
594
595 return getWaitStatesSince(IsHazardFn, Limit);
596}
597
598//===----------------------------------------------------------------------===//
599// No-op Hazard Detection
600//===----------------------------------------------------------------------===//
601
602static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
603 MCRegister Reg) {
604 for (MCRegUnit Unit : TRI.regunits(Reg))
605 BV.set(Unit);
606}
607
608static void addRegsToSet(const SIRegisterInfo &TRI,
609 iterator_range<MachineInstr::const_mop_iterator> Ops,
610 BitVector &DefSet, BitVector &UseSet) {
611 for (const MachineOperand &Op : Ops) {
612 if (Op.isReg())
613 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
614 }
615}
616
617void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
618 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
619}
620
621static bool breaksSMEMSoftClause(MachineInstr *MI) {
622 return !SIInstrInfo::isSMRD(*MI);
623}
624
625static bool breaksVMEMSoftClause(MachineInstr *MI) {
626 return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
627}
628
629int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
630 // SMEM soft clauses are only present on VI+, and only matter if xnack is
631 // enabled.
632 if (!ST.isXNACKEnabled())
633 return 0;
634
635 bool IsSMRD = TII.isSMRD(*MEM);
636
637 resetClause();
638
639 // A soft-clause is any group of consecutive SMEM instructions. The
640 // instructions in this group may return out of order and/or may be
641 // replayed (i.e. the same instruction issued more than once).
642 //
643 // In order to handle these situations correctly we need to make sure that
644 // when a clause has more than one instruction, no instruction in the clause
645 // writes to a register that is read by another instruction in the clause
646 // (including itself). If we encounter this situation, we need to break the
647 // clause by inserting a non SMEM instruction.
648
649 for (MachineInstr *MI : EmittedInstrs) {
650 // When we hit a non-SMEM instruction then we have passed the start of the
651 // clause and we can stop.
652 if (!MI)
653 break;
654
655 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
656 break;
657
658 addClauseInst(*MI);
659 }
660
661 if (ClauseDefs.none())
662 return 0;
663
664 // We need to make sure not to put loads and stores in the same clause if they
665 // use the same address. For now, just start a new clause whenever we see a
666 // store.
667 if (MEM->mayStore())
668 return 1;
669
670 addClauseInst(*MEM);
671
672 // If the set of defs and uses intersect then we cannot add this instruction
673 // to the clause, so we have a hazard.
674 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
675}
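// Illustrative example (not from the upstream file) of a soft-clause break on
// an XNACK-enabled subtarget:
//
//   s_load_dwordx2 s[4:5], s[0:1], 0x0   // clause defines s4-s5
//   s_load_dword   s6,     s[4:5], 0x8   // same clause, uses s4-s5
//
// The second load reads registers written inside the clause, so ClauseDefs and
// ClauseUses intersect and checkSoftClauseHazards() reports one wait state,
// which the caller materializes as an s_nop to break the clause.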
676
677int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
678 int WaitStatesNeeded = 0;
679
680 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
681
682 // This SMRD hazard only affects SI.
683 if (!ST.hasSMRDReadVALUDefHazard())
684 return WaitStatesNeeded;
685
686 // A read of an SGPR by SMRD instruction requires 4 wait states when the
687 // SGPR was written by a VALU instruction.
688 int SmrdSgprWaitStates = 4;
689 auto IsHazardDefFn = [this](const MachineInstr &MI) {
690 return TII.isVALU(MI);
691 };
692 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
693 return TII.isSALU(MI);
694 };
695
696 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
697
698 for (const MachineOperand &Use : SMRD->uses()) {
699 if (!Use.isReg())
700 continue;
701 int WaitStatesNeededForUse =
702 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
703 SmrdSgprWaitStates);
704 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
705
706 // This fixes what appears to be undocumented hardware behavior in SI where
707 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
708 // needs some number of nops in between. We don't know how many we need, but
709 // let's use 4. This wasn't discovered before probably because the only
710 // case when this happens is when we expand a 64-bit pointer into a full
711 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
712 // probably never encountered in the closed-source land.
713 if (IsBufferSMRD) {
714 int WaitStatesNeededForUse =
715 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
716 IsBufferHazardDefFn,
717 SmrdSgprWaitStates);
718 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
719 }
720 }
721
722 return WaitStatesNeeded;
723}
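// Worked example (illustrative) for the SI-only SMRD hazard handled above:
//
//   v_readfirstlane_b32 s4, v0      // VALU writes s4
//   s_nop 0                         // one wait state already present
//   s_load_dword s6, s[4:5], 0x0    // SMRD reads s4
//
// Here getWaitStatesSinceDef() returns 1, so SmrdSgprWaitStates (4) - 1 = 3
// additional wait states must still be inserted before the load.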
724
725int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
726 if (!ST.hasVMEMReadSGPRVALUDefHazard())
727 return 0;
728
729 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
730
731 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
732 // SGPR was written by a VALU Instruction.
733 const int VmemSgprWaitStates = 5;
734 auto IsHazardDefFn = [this](const MachineInstr &MI) {
735 return TII.isVALU(MI);
736 };
737 for (const MachineOperand &Use : VMEM->uses()) {
738 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
739 continue;
740
741 int WaitStatesNeededForUse =
742 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
743 VmemSgprWaitStates);
744 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
745 }
746 return WaitStatesNeeded;
747}
748
749int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
750 const SIRegisterInfo *TRI = ST.getRegisterInfo();
751 const SIInstrInfo *TII = ST.getInstrInfo();
752
753 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
754 int DppVgprWaitStates = 2;
755 int DppExecWaitStates = 5;
756 int WaitStatesNeeded = 0;
757 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
758 return TII->isVALU(MI);
759 };
760
761 for (const MachineOperand &Use : DPP->uses()) {
762 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
763 continue;
764 int WaitStatesNeededForUse =
765 DppVgprWaitStates - getWaitStatesSinceDef(
766 Use.getReg(),
767 [](const MachineInstr &) { return true; },
768 DppVgprWaitStates);
769 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
770 }
771
772 WaitStatesNeeded = std::max(
773 WaitStatesNeeded,
774 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
775 DppExecWaitStates));
776
777 return WaitStatesNeeded;
778}
779
780int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
781 const SIInstrInfo *TII = ST.getInstrInfo();
782
783 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
784 // instruction.
785 const int DivFMasWaitStates = 4;
786 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
787 return TII->isVALU(MI);
788 };
789 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
790 DivFMasWaitStates);
791
792 return DivFMasWaitStates - WaitStatesNeeded;
793}
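// Worked example (illustrative):
//
//   v_cmp_gt_f32   vcc, v0, v1        // VALU writes VCC
//   v_div_fmas_f32 v2, v3, v4, v5     // reads VCC implicitly
//
// getWaitStatesSinceDef() returns 0, so 4 - 0 = 4 wait states (an "s_nop 3")
// are required between the compare and v_div_fmas.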
794
795int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
796 const SIInstrInfo *TII = ST.getInstrInfo();
797 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
798
799 const int GetRegWaitStates = 2;
800 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
801 return GetRegHWReg == getHWReg(TII, MI);
802 };
803 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
804
805 return GetRegWaitStates - WaitStatesNeeded;
806}
807
808int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
809 const SIInstrInfo *TII = ST.getInstrInfo();
810 unsigned HWReg = getHWReg(TII, *SetRegInstr);
811
812 const int SetRegWaitStates = ST.getSetRegWaitStates();
813 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
814 return HWReg == getHWReg(TII, MI);
815 };
816 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
817 return SetRegWaitStates - WaitStatesNeeded;
818}
819
820int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
821 if (!MI.mayStore())
822 return -1;
823
824 const SIInstrInfo *TII = ST.getInstrInfo();
825 unsigned Opcode = MI.getOpcode();
826 const MCInstrDesc &Desc = MI.getDesc();
827
828 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
829 int VDataRCID = -1;
830 if (VDataIdx != -1)
831 VDataRCID = Desc.operands()[VDataIdx].RegClass;
832
833 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
834 // There is no hazard if the instruction does not use vector regs
835 // (like wbinvl1)
836 if (VDataIdx == -1)
837 return -1;
838 // For MUBUF/MTBUF instructions this hazard only exists if the
839 // instruction is not using a register in the soffset field.
840 const MachineOperand *SOffset =
841 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
842 // If we have no soffset operand, then assume this field has been
843 // hardcoded to zero.
844 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
845 (!SOffset || !SOffset->isReg()))
846 return VDataIdx;
847 }
848
849 // MIMG instructions create a hazard if they don't use a 256-bit T# and
850 // the store size is greater than 8 bytes and they have more than two bits
851 // of their dmask set.
852 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
853 if (TII->isMIMG(MI)) {
854 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
855 assert(SRsrcIdx != -1 &&
856 AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
857 (void)SRsrcIdx;
858 }
859
860 if (TII->isFLAT(MI)) {
861 int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
862 if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
863 return DataIdx;
864 }
865
866 return -1;
867}
868
869int
870GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
871 const MachineRegisterInfo &MRI) {
872 // Helper to check for the hazard where VMEM instructions that store more than
873 // 8 bytes can have their store data overwritten by the next instruction.
874 const SIRegisterInfo *TRI = ST.getRegisterInfo();
875
876 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
877 int WaitStatesNeeded = 0;
878
879 if (!TRI->isVectorRegister(MRI, Def.getReg()))
880 return WaitStatesNeeded;
881 Register Reg = Def.getReg();
882 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
883 int DataIdx = createsVALUHazard(MI);
884 return DataIdx >= 0 &&
885 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
886 };
887
888 int WaitStatesNeededForDef =
889 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
890 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
891
892 return WaitStatesNeeded;
893}
894
895/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
896/// pack the computed value into correct bit position of the dest register. This
897/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
898/// dst_sel that is not aligned to the register. This function analyzes the \p
899/// MI and \returns an operand with dst forwarding issue, or nullptr if
900/// none exists.
901static const MachineOperand *
902getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
903 if (!SIInstrInfo::isVALU(MI))
904 return nullptr;
905
906 const SIInstrInfo *TII = ST.getInstrInfo();
907
908 unsigned Opcode = MI.getOpcode();
909
910 // There are three different types of instructions
911 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
912 // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
913 // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
914 // op_sel[3:2]
915 // != 0
916 if (SIInstrInfo::isSDWA(MI)) {
917 // Type 1: SDWA with dst_sel != DWORD
918 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
919 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
920 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
921 }
922
923 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
924 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
925 // Type 2: VOP3 which write the hi bits
926 if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
928 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
929
930 // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
931 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
932 (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
934 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
935 }
936
937 // Special case: nop is required for all the opsel values for fp4 sr variant
938 // cvt scale instructions
939 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
940 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
941
942 return nullptr;
943}
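// Example of a producer this helper flags (illustrative SDWA syntax):
//
//   v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE
//
// Only the high 16 bits of v1 are produced, so the result must be swizzled
// into place and an immediate consumer of v1 is exposed to the dest-sel
// forwarding hazard; the vdst operand (v1) is returned for such instructions.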
944
945/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
946/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
947/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
948static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
949 const MachineOperand *Dst,
950 const SIRegisterInfo *TRI) {
951 // We must consider implicit reads of the VALU. SDWA with dst_sel and
952 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
953 // and we must account for that hazard.
954 // We also must account for WAW hazards. In particular, WAW with dest
955 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
956 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
957 // check for ECC. Without accounting for this hazard, the ECC will be
958 // wrong.
959 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
960 // complete zeroesHigh16BitsOfDest)
961 for (auto &Operand : VALU->operands()) {
962 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
963 return true;
964 }
965 }
966 return false;
967}
968
969int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
970 int WaitStatesNeeded = 0;
971
973 const int TransDefWaitstates = 1;
974
975 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
976 if (!SIInstrInfo::isTRANS(MI))
977 return false;
978 const SIRegisterInfo *TRI = ST.getRegisterInfo();
979 const SIInstrInfo *TII = ST.getInstrInfo();
980 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
981
982 for (const MachineOperand &Use : VALU->explicit_uses()) {
983 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
984 return true;
985 }
986
987 return false;
988 };
989
990 int WaitStatesNeededForDef =
991 TransDefWaitstates -
992 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
993 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
994 }
995
997 const int Shift16DefWaitstates = 1;
998
999 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1000 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1001 const MachineOperand *ForwardedDst =
1002 getDstSelForwardingOperand(ProducerMI, ST);
1003 if (ForwardedDst) {
1004 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
1005 }
1006
1007 if (ProducerMI.isInlineAsm()) {
1008 // Assume inline asm has dst forwarding hazard
1009 for (auto &Def : ProducerMI.all_defs()) {
1010 if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1011 return true;
1012 }
1013 }
1014
1015 return false;
1016 };
1017
1018 int WaitStatesNeededForDef =
1019 Shift16DefWaitstates -
1020 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1021 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1022 }
1023
1024 if (ST.hasVDecCoExecHazard()) {
1025 const int VALUWriteSGPRVALUReadWaitstates = 2;
1026 const int VALUWriteEXECRWLane = 4;
1027 const int VALUWriteVGPRReadlaneRead = 1;
1028
1029 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1030 const MachineRegisterInfo &MRI = MF.getRegInfo();
1031 Register UseReg;
1032 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1033 if (!SIInstrInfo::isVALU(MI))
1034 return false;
1035 return MI.modifiesRegister(UseReg, TRI);
1036 };
1037
1038 for (const MachineOperand &Use : VALU->explicit_uses()) {
1039 if (!Use.isReg())
1040 continue;
1041
1042 UseReg = Use.getReg();
1043 if (TRI->isSGPRReg(MRI, UseReg)) {
1044 int WaitStatesNeededForDef =
1045 VALUWriteSGPRVALUReadWaitstates -
1046 getWaitStatesSince(IsVALUDefSGPRFn,
1047 VALUWriteSGPRVALUReadWaitstates);
1048 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1049 }
1050 }
1051
1052 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1053 UseReg = AMDGPU::VCC;
1054 int WaitStatesNeededForDef =
1055 VALUWriteSGPRVALUReadWaitstates -
1056 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1057 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1058 }
1059
1060 switch (VALU->getOpcode()) {
1061 case AMDGPU::V_READLANE_B32:
1062 case AMDGPU::V_READFIRSTLANE_B32: {
1063 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1064 UseReg = Src->getReg();
1065 int WaitStatesNeededForDef =
1066 VALUWriteVGPRReadlaneRead -
1067 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1068 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1069 }
1070 [[fallthrough]];
1071 case AMDGPU::V_WRITELANE_B32: {
1072 UseReg = AMDGPU::EXEC;
1073 int WaitStatesNeededForDef =
1074 VALUWriteEXECRWLane -
1075 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1076 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1077 break;
1078 }
1079 default:
1080 break;
1081 }
1082 }
1083
1084 // This checks for the hazard where VMEM instructions that store more than
1085 // 8 bytes can have their store data overwritten by the next instruction.
1086 if (!ST.has12DWordStoreHazard())
1087 return WaitStatesNeeded;
1088
1089 const MachineRegisterInfo &MRI = MF.getRegInfo();
1090
1091 for (const MachineOperand &Def : VALU->defs()) {
1092 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1093 }
1094
1095 return WaitStatesNeeded;
1096}
1097
1098int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1099 // This checks for hazards associated with inline asm statements.
1100 // Since inline asms can contain just about anything, we use this
1101 // to call/leverage other check*Hazard routines. Note that
1102 // this function doesn't attempt to address all possible inline asm
1103 // hazards (good luck), but is a collection of what has been
1104 // problematic thus far.
1105
1106 // see checkVALUHazards()
1107 if (!ST.has12DWordStoreHazard() &&
1108 !ST.hasDstSelForwardingHazard())
1109 return 0;
1110
1111 const MachineRegisterInfo &MRI = MF.getRegInfo();
1112 int WaitStatesNeeded = 0;
1113
1114 for (const MachineOperand &Op :
1115 llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
1116 if (Op.isReg() && Op.isDef()) {
1117 if (!TRI.isVectorRegister(MRI, Op.getReg()))
1118 continue;
1119
1120 if (ST.has12DWordStoreHazard()) {
1121 WaitStatesNeeded =
1122 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1123 }
1124 }
1125 }
1126
1127 if (ST.hasDstSelForwardingHazard()) {
1128 const int Shift16DefWaitstates = 1;
1129
1130 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1131 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1132 // Assume inline asm reads the dst
1133 if (Dst)
1134 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1135 IA->readsRegister(Dst->getReg(), &TRI);
1136
1137 if (ProducerMI.isInlineAsm()) {
1138 // If MI is inline asm, assume it has dst forwarding hazard
1139 for (auto &Def : ProducerMI.all_defs()) {
1140 if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1141 IA->readsRegister(Def.getReg(), &TRI)) {
1142 return true;
1143 }
1144 }
1145 }
1146
1147 return false;
1148 };
1149
1150 int WaitStatesNeededForDef =
1151 Shift16DefWaitstates -
1152 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1153 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1154 }
1155
1156 return WaitStatesNeeded;
1157}
1158
1159int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1160 const SIInstrInfo *TII = ST.getInstrInfo();
1161 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1162 const MachineRegisterInfo &MRI = MF.getRegInfo();
1163
1164 const MachineOperand *LaneSelectOp =
1165 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1166
1167 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1168 return 0;
1169
1170 Register LaneSelectReg = LaneSelectOp->getReg();
1171 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1172
1173 const int RWLaneWaitStates = 4;
1174 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1175 RWLaneWaitStates);
1176 return RWLaneWaitStates - WaitStatesSince;
1177}
1178
1179int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1180 if (!ST.hasRFEHazards())
1181 return 0;
1182
1183 const SIInstrInfo *TII = ST.getInstrInfo();
1184
1185 const int RFEWaitStates = 1;
1186
1187 auto IsHazardFn = [TII](const MachineInstr &MI) {
1188 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1189 };
1190 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1191 return RFEWaitStates - WaitStatesNeeded;
1192}
1193
1194int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1195 const SIInstrInfo *TII = ST.getInstrInfo();
1196 const int ReadM0WaitStates = 1;
1197 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1198 return ReadM0WaitStates -
1199 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1200}
1201
1202void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1203 fixVMEMtoScalarWriteHazards(MI);
1204 fixVcmpxPermlaneHazards(MI);
1205 fixSMEMtoVectorWriteHazards(MI);
1206 fixVcmpxExecWARHazard(MI);
1207 fixLdsBranchVmemWARHazard(MI);
1208 if (ST.hasLdsDirect()) {
1209 fixLdsDirectVALUHazard(MI);
1210 fixLdsDirectVMEMHazard(MI);
1211 }
1212 fixVALUPartialForwardingHazard(MI);
1213 fixVALUTransUseHazard(MI);
1214 fixWMMAHazards(MI);
1215 fixShift64HighRegBug(MI);
1216 fixVALUMaskWriteHazard(MI);
1217 fixVALUReadSGPRHazard(MI);
1218 fixRequiredExportPriority(MI);
1219}
1220
1221static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1222 const MachineInstr &MI) {
1223 return (TII.isVOPC(MI) ||
1224 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1225 MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1226}
1227
1228bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1229 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1230 return false;
1231
1232 const SIInstrInfo *TII = ST.getInstrInfo();
1233 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1234 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1235 return isVCmpXWritesExec(*TII, *TRI, MI);
1236 };
1237
1238 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1239 unsigned Opc = MI.getOpcode();
1240 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1241 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1242 };
1243
1244 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1245 std::numeric_limits<int>::max())
1246 return false;
1247
1248 // V_NOP will be discarded by SQ.
1249 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1250 // which is always a VGPR and available.
1251 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1252 Register Reg = Src0->getReg();
1253 bool IsUndef = Src0->isUndef();
1254 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1255 TII->get(AMDGPU::V_MOV_B32_e32))
1256 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1257 .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1258
1259 return true;
1260}
1261
1262bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1263 if (!ST.hasVMEMtoScalarWriteHazard())
1264 return false;
1266
1267 if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1268 return false;
1269
1270 if (MI->getNumDefs() == 0)
1271 return false;
1272
1273 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1274
1275 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1276 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1277 !SIInstrInfo::isFLAT(I))
1278 return false;
1279
1280 for (const MachineOperand &Def : MI->defs()) {
1281 const MachineOperand *Op =
1282 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1283 if (!Op)
1284 continue;
1285 return true;
1286 }
1287 return false;
1288 };
1289
1290 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1291 return SIInstrInfo::isVALU(MI) ||
1292 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1293 !MI.getOperand(0).getImm()) ||
1294 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1295 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1296 };
1297
1298 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1299 std::numeric_limits<int>::max())
1300 return false;
1301
1302 const SIInstrInfo *TII = ST.getInstrInfo();
1303 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1304 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1305 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1306 return true;
1307}
1308
1309bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1310 if (!ST.hasSMEMtoVectorWriteHazard())
1311 return false;
1313
1314 if (!SIInstrInfo::isVALU(*MI))
1315 return false;
1316
1317 unsigned SDSTName;
1318 switch (MI->getOpcode()) {
1319 case AMDGPU::V_READLANE_B32:
1320 case AMDGPU::V_READFIRSTLANE_B32:
1321 SDSTName = AMDGPU::OpName::vdst;
1322 break;
1323 default:
1324 SDSTName = AMDGPU::OpName::sdst;
1325 break;
1326 }
1327
1328 const SIInstrInfo *TII = ST.getInstrInfo();
1329 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1330 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1331 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1332 if (!SDST) {
1333 for (const auto &MO : MI->implicit_operands()) {
1334 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1335 SDST = &MO;
1336 break;
1337 }
1338 }
1339 }
1340
1341 if (!SDST)
1342 return false;
1343
1344 const Register SDSTReg = SDST->getReg();
1345 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1346 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1347 };
1348
1349 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1350 if (TII->isSALU(MI)) {
1351 switch (MI.getOpcode()) {
1352 case AMDGPU::S_SETVSKIP:
1353 case AMDGPU::S_VERSION:
1354 case AMDGPU::S_WAITCNT_VSCNT:
1355 case AMDGPU::S_WAITCNT_VMCNT:
1356 case AMDGPU::S_WAITCNT_EXPCNT:
1357 // These instructions cannot mitigate the hazard.
1358 return false;
1359 case AMDGPU::S_WAITCNT_LGKMCNT:
1360 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1361 return (MI.getOperand(1).getImm() == 0) &&
1362 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1363 case AMDGPU::S_WAITCNT: {
1364 const int64_t Imm = MI.getOperand(0).getImm();
1365 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1366 // DsCnt corresponds to LGKMCnt here.
1367 return (Decoded.DsCnt == 0);
1368 }
1369 default:
1370 // SOPP instructions cannot mitigate the hazard.
1371 if (TII->isSOPP(MI))
1372 return false;
1373 // At this point the SALU can be assumed to mitigate the hazard
1374 // because either:
1375 // (a) it is independent of the at risk SMEM (breaking chain),
1376 // or
1377 // (b) it is dependent on the SMEM, in which case an appropriate
1378 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1379 // SMEM instruction.
1380 return true;
1381 }
1382 }
1383 return false;
1384 };
1385
1386 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1387 std::numeric_limits<int>::max())
1388 return false;
1389
1390 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1391 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1392 .addImm(0);
1393 return true;
1394}
1395
1396bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1397 if (!ST.hasVcmpxExecWARHazard())
1398 return false;
1400
1401 if (!SIInstrInfo::isVALU(*MI))
1402 return false;
1403
1404 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1405 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1406 return false;
1407
1408 auto IsHazardFn = [TRI](const MachineInstr &I) {
1409 if (SIInstrInfo::isVALU(I))
1410 return false;
1411 return I.readsRegister(AMDGPU::EXEC, TRI);
1412 };
1413
1414 const SIInstrInfo *TII = ST.getInstrInfo();
1415 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1416 if (SIInstrInfo::isVALU(MI)) {
1417 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1418 return true;
1419 for (auto MO : MI.implicit_operands())
1420 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1421 return true;
1422 }
1423 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1424 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1425 return true;
1426 return false;
1427 };
1428
1429 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1430 std::numeric_limits<int>::max())
1431 return false;
1432
1433 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1434 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1435 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1436 return true;
1437}
1438
1439static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1440 const GCNSubtarget &ST) {
1441 if (!ST.hasLdsBranchVmemWARHazard())
1442 return false;
1443
1444 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1445 // instructions need to appear in the same function.
1446 bool HasLds = false;
1447 bool HasVmem = false;
1448 for (auto &MBB : MF) {
1449 for (auto &MI : MBB) {
1450 HasLds |= SIInstrInfo::isDS(MI);
1451 HasVmem |=
1452 SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1453 if (HasLds && HasVmem)
1454 return true;
1455 }
1456 }
1457 return false;
1458}
1459
1460static bool isStoreCountWaitZero(const MachineInstr &I) {
1461 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1462 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1463 !I.getOperand(1).getImm();
1464}
1465
1466bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1467 if (!RunLdsBranchVmemWARHazardFixup)
1468 return false;
1469
1472
1473 auto IsHazardInst = [](const MachineInstr &MI) {
1474 if (SIInstrInfo::isDS(MI))
1475 return 1;
1476 if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1477 return 2;
1478 return 0;
1479 };
1480
1481 auto InstType = IsHazardInst(*MI);
1482 if (!InstType)
1483 return false;
1484
1485 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1486 return IsHazardInst(I) || isStoreCountWaitZero(I);
1487 };
1488
1489 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1490 if (!I.isBranch())
1491 return false;
1492
1493 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1494 auto InstType2 = IsHazardInst(I);
1495 return InstType2 && InstType != InstType2;
1496 };
1497
1498 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1499 auto InstType2 = IsHazardInst(I);
1500 if (InstType == InstType2)
1501 return true;
1502
1503 return isStoreCountWaitZero(I);
1504 };
1505
1506 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1507 std::numeric_limits<int>::max();
1508 };
1509
1510 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1511 std::numeric_limits<int>::max())
1512 return false;
1513
1514 const SIInstrInfo *TII = ST.getInstrInfo();
1515 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1516 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1517 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1518 .addImm(0);
1519
1520 return true;
1521}
1522
1523bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1524 if (!SIInstrInfo::isLDSDIR(*MI))
1525 return false;
1526
1527 const int NoHazardWaitStates = 15;
1528 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1529 const Register VDSTReg = VDST->getReg();
1530
1531 bool VisitedTrans = false;
1532 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1533 if (!SIInstrInfo::isVALU(I))
1534 return false;
1535 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1536 // Cover both WAR and WAW
1537 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1538 };
1539 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1540 if (WaitStates >= NoHazardWaitStates)
1541 return true;
1542 // Instructions which cause va_vdst==0 expire hazard
1543 return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1544 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
1545 };
1546 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1547 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1548 };
1549
1551 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1552 std::next(MI->getReverseIterator()), 0,
1553 IsExpiredFn, Visited, GetWaitStatesFn);
1554
1555 // Transcendentals can execute in parallel to other VALUs.
1556 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1557 if (VisitedTrans)
1558 Count = 0;
1559
1560 MachineOperand *WaitVdstOp =
1561 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1562 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1563
1564 return true;
1565}
1566
1567bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1568 if (!SIInstrInfo::isLDSDIR(*MI))
1569 return false;
1570
1571 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1572 const Register VDSTReg = VDST->getReg();
1573
1574 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1575 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
1576 !SIInstrInfo::isDS(I))
1577 return false;
1578 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1579 };
1580 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1581 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1582 // according to the type of VMEM instruction.
1583 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1585 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1586 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1587 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1588 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1589 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1590 };
1591
1592 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1593 std::numeric_limits<int>::max())
1594 return false;
1595
1596 if (LdsdirCanWait) {
1597 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1598 } else {
1599 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1600 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1601 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1602 }
1603
1604 return true;
1605}
1606
1607bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1608 if (!ST.hasVALUPartialForwardingHazard())
1609 return false;
1611
1612 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1613 return false;
1614
1616
1617 for (const MachineOperand &Use : MI->explicit_uses()) {
1618 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1619 SrcVGPRs.insert(Use.getReg());
1620 }
1621
1622 // Only applies with >= 2 unique VGPR sources
1623 if (SrcVGPRs.size() <= 1)
1624 return false;
1625
1626 // Look for the following pattern:
1627 // Va <- VALU [PreExecPos]
1628 // intv1
1629 // Exec <- SALU [ExecPos]
1630 // intv2
1631 // Vb <- VALU [PostExecPos]
1632 // intv3
1633 // MI Va, Vb (WaitState = 0)
1634 //
1635 // Where:
1636 // intv1 + intv2 <= 2 VALUs
1637 // intv3 <= 4 VALUs
1638 //
1639 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1640
1641 const int Intv1plus2MaxVALUs = 2;
1642 const int Intv3MaxVALUs = 4;
1643 const int IntvMaxVALUs = 6;
1644 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1645
1646 struct StateType {
1647 SmallDenseMap<Register, int, 4> DefPos;
1648 int ExecPos = std::numeric_limits<int>::max();
1649 int VALUs = 0;
1650 };
1651
1652 StateType State;
1653
1654 // This overloads expiry testing with all the hazard detection
1655 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1656 // Too many VALU states have passed
1657 if (State.VALUs > NoHazardVALUWaitStates)
1658 return HazardExpired;
1659
1660 // Instructions which cause va_vdst==0 expire hazard
1661 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1662 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1663 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1664 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1665 return HazardExpired;
1666
1667 // Track registers writes
1668 bool Changed = false;
1669 if (SIInstrInfo::isVALU(I)) {
1670 for (Register Src : SrcVGPRs) {
1671 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1672 State.DefPos[Src] = State.VALUs;
1673 Changed = true;
1674 }
1675 }
1676 } else if (SIInstrInfo::isSALU(I)) {
1677 if (State.ExecPos == std::numeric_limits<int>::max()) {
1678 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1679 State.ExecPos = State.VALUs;
1680 Changed = true;
1681 }
1682 }
1683 }
1684
1685 // Early expiration: too many VALUs in intv3
1686 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1687 return HazardExpired;
1688
1689 // Only evaluate state if something changed
1690 if (!Changed)
1691 return NoHazardFound;
1692
1693 // Determine positions of VALUs pre/post exec change
1694 if (State.ExecPos == std::numeric_limits<int>::max())
1695 return NoHazardFound;
1696
1697 int PreExecPos = std::numeric_limits<int>::max();
1698 int PostExecPos = std::numeric_limits<int>::max();
1699
1700 for (auto Entry : State.DefPos) {
1701 int DefVALUs = Entry.second;
1702 if (DefVALUs != std::numeric_limits<int>::max()) {
1703 if (DefVALUs >= State.ExecPos)
1704 PreExecPos = std::min(PreExecPos, DefVALUs);
1705 else
1706 PostExecPos = std::min(PostExecPos, DefVALUs);
1707 }
1708 }
1709
1710 // Need a VALUs post exec change
1711 if (PostExecPos == std::numeric_limits<int>::max())
1712 return NoHazardFound;
1713
1714 // Too many VALUs in intv3?
1715 int Intv3VALUs = PostExecPos;
1716 if (Intv3VALUs > Intv3MaxVALUs)
1717 return HazardExpired;
1718
1719 // Too many VALUs in intv2?
1720 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1721 if (Intv2VALUs > Intv1plus2MaxVALUs)
1722 return HazardExpired;
1723
1724 // Need a VALUs pre exec change
1725 if (PreExecPos == std::numeric_limits<int>::max())
1726 return NoHazardFound;
1727
1728 // Too many VALUs in intv1?
1729 int Intv1VALUs = PreExecPos - State.ExecPos;
1730 if (Intv1VALUs > Intv1plus2MaxVALUs)
1731 return HazardExpired;
1732
1733 // Too many VALUs in intv1 + intv2
1734 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1735 return HazardExpired;
1736
1737 return HazardFound;
1738 };
1739 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1740 if (SIInstrInfo::isVALU(MI))
1741 State.VALUs += 1;
1742 };
1743
1745 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1746 std::next(MI->getReverseIterator()), Visited))
1747 return false;
1748
1749 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1750 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1751 .addImm(0x0fff);
1752
1753 return true;
1754}
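// Illustrative shape of the pattern searched for above (wave64, not from the
// upstream file):
//
//   v_mov_b32 v0, 1          // Va  <- VALU   [PreExecPos]
//   s_mov_b64 exec, -1       // Exec <- SALU  [ExecPos]
//   v_mov_b32 v1, 2          // Vb  <- VALU   [PostExecPos]
//   v_add_f32 v2, v0, v1     // MI consumes both Va and Vb
//
// With the intervals inside the limits above, the fix inserts
// "s_waitcnt_depctr 0x0fff" immediately before the consuming instruction.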
1755
1756bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1757 if (!ST.hasVALUTransUseHazard())
1758 return false;
1760
1761 if (!SIInstrInfo::isVALU(*MI))
1762 return false;
1763
1764 SmallSet<Register, 4> SrcVGPRs;
1765
1766 for (const MachineOperand &Use : MI->explicit_uses()) {
1767 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1768 SrcVGPRs.insert(Use.getReg());
1769 }
1770
1771 // Look for the following pattern:
1772 // Va <- TRANS VALU
1773 // intv
1774 // MI Va (WaitState = 0)
1775 //
1776 // Where:
1777 // intv <= 5 VALUs / 1 TRANS
1778 //
1779 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1780
1781 const int IntvMaxVALUs = 5;
1782 const int IntvMaxTRANS = 1;
1783
1784 struct StateType {
1785 int VALUs = 0;
1786 int TRANS = 0;
1787 };
1788
1789 StateType State;
1790
1791 // This overloads expiry testing with all the hazard detection
1792 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1793 // Too many VALU states have passed
1794 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1795 return HazardExpired;
1796
1797 // Instructions which cause va_vdst==0 expire hazard
1798 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1799 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1800 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1801 I.getOperand(0).getImm() == 0x0fff))
1802 return HazardExpired;
1803
1804 // Track registers writes
1805 if (SIInstrInfo::isTRANS(I)) {
1806 for (Register Src : SrcVGPRs) {
1807 if (I.modifiesRegister(Src, &TRI)) {
1808 return HazardFound;
1809 }
1810 }
1811 }
1812
1813 return NoHazardFound;
1814 };
1815 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1816 if (SIInstrInfo::isVALU(MI))
1817 State.VALUs += 1;
1818 if (SIInstrInfo::isTRANS(MI))
1819 State.TRANS += 1;
1820 };
1821
1822 DenseSet<const MachineBasicBlock *> Visited;
1823 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1824 std::next(MI->getReverseIterator()), Visited))
1825 return false;
1826
1827 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1828 // avoided.
1829 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1830 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1831 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1832
1833 return true;
1834}
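// Illustrative shape of the hazard handled above (not from the upstream file):
//
//   v_exp_f32 v0, v1         // Va <- TRANS VALU
//   v_add_f32 v2, v0, v3     // consumer of Va within 5 VALUs / 1 TRANS
//
// When the pattern is found, a wait forcing va_vdst==0 (S_WAITCNT_DEPCTR) is
// inserted before the consumer, as described in the comment above.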
1835
1836bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1837 if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1838 return false;
1839
1840 const SIInstrInfo *TII = ST.getInstrInfo();
1841 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1842
1843 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1844 if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1845 return false;
1846
1847 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
1848 // with the dest(matrix D) of the previous wmma.
1849 const Register CurSrc0Reg =
1850 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1851 const Register CurSrc1Reg =
1852 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1853
1854 const Register PrevDstReg =
1855 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1856
1857 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1858 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1859 return true;
1860 }
1861
1862 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1863 // but Index can't overlap with PrevDstReg.
1864 if (AMDGPU::isGFX12Plus(ST)) {
1865 if (SIInstrInfo::isSWMMAC(*MI)) {
1866 const Register CurIndex =
1867 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1868 if (TRI->regsOverlap(PrevDstReg, CurIndex))
1869 return true;
1870 }
1871 return false;
1872 }
1873
1874 return false;
1875 };
1876
1877 auto IsExpiredFn = [](const MachineInstr &I, int) {
1878 return SIInstrInfo::isVALU(I);
1879 };
1880
1881 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1882 std::numeric_limits<int>::max())
1883 return false;
1884
1885 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1886
1887 return true;
1888}
1889
1890bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1891 if (!ST.hasShift64HighRegBug())
1892 return false;
1894
1895 switch (MI->getOpcode()) {
1896 default:
1897 return false;
1898 case AMDGPU::V_LSHLREV_B64_e64:
1899 case AMDGPU::V_LSHRREV_B64_e64:
1900 case AMDGPU::V_ASHRREV_I64_e64:
1901 break;
1902 }
1903
1904 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1905 if (!Amt->isReg())
1906 return false;
1907
1908 Register AmtReg = Amt->getReg();
1909 const MachineRegisterInfo &MRI = MF.getRegInfo();
 1910 // Check if this is the last VGPR in an allocation block.
1911 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1912 return false;
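// Worked example of the check above: ((AmtReg - VGPR0) & 7) == 7 holds exactly
// for v7, v15, v23, ..., i.e. the shift amount lives in the last VGPR of an
// eight-register block, which is the situation this workaround targets.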
1913
1914 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1915 return false;
1916
1917 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1918 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1919 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1920 bool Overlapped = OverlappedSrc || OverlappedDst;
1921
1922 assert(!OverlappedDst || !OverlappedSrc ||
1923 Src1->getReg() == MI->getOperand(0).getReg());
 1924 assert(ST.needsAlignedVGPRs());
 1925 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1926
1927 Register NewReg;
1928 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1929 : AMDGPU::VGPR_32RegClass) {
1930 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1931 NewReg = Reg;
1932 break;
1933 }
1934 }
1935
1936 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1937 : NewReg;
1938 Register NewAmtLo;
1939
1940 if (Overlapped)
1941 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1942
1943 DebugLoc DL = MI->getDebugLoc();
1944 MachineBasicBlock *MBB = MI->getParent();
 1945 // Insert a full wait count because the found register might be pending a wait.
1946 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1947 .addImm(0);
1948
1949 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1950 if (Overlapped)
1951 runOnInstruction(
1952 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1953 .addDef(AmtReg - 1)
1954 .addReg(AmtReg - 1, RegState::Undef)
1955 .addReg(NewAmtLo, RegState::Undef));
1956 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1957 .addDef(AmtReg)
1958 .addReg(AmtReg, RegState::Undef)
1959 .addReg(NewAmt, RegState::Undef));
1960
1961 // Instructions emitted after the current instruction will be processed by the
1962 // parent loop of the hazard recognizer in a natural way.
1963 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1964 AmtReg)
1965 .addDef(NewAmt)
1966 .addReg(NewAmt)
1967 .addReg(AmtReg);
1968 if (Overlapped)
1969 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1970 AmtReg - 1)
1971 .addDef(NewAmtLo)
1972 .addReg(NewAmtLo)
1973 .addReg(AmtReg - 1);
1974
 1975 // Re-running the hazard recognizer on the modified instruction is not needed:
 1976 // the inserted V_SWAP_B32 instructions have already both read and written the
 1977 // new registers, so hazards related to these registers have already been handled.
1978 Amt->setReg(NewAmt);
1979 Amt->setIsKill(false);
1980 // We do not update liveness, so verifier may see it as undef.
1981 Amt->setIsUndef();
1982 if (OverlappedDst)
1983 MI->getOperand(0).setReg(NewReg);
1984 if (OverlappedSrc) {
1985 Src1->setReg(NewReg);
1986 Src1->setIsKill(false);
1987 Src1->setIsUndef();
1988 }
1989
1990 return true;
1991}
1992
1993int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1994 int NSAtoVMEMWaitStates = 1;
1995
1996 if (!ST.hasNSAtoVMEMBug())
1997 return 0;
1998
 1999 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
 2000 return 0;
2001
2002 const SIInstrInfo *TII = ST.getInstrInfo();
2003 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2004 if (!Offset || (Offset->getImm() & 6) == 0)
2005 return 0;
2006
2007 auto IsHazardFn = [TII](const MachineInstr &I) {
2008 if (!SIInstrInfo::isMIMG(I))
2009 return false;
2010 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2011 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2012 TII->getInstSizeInBytes(I) >= 16;
2013 };
2014
2015 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2016}
2017
2018int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
2019 int FPAtomicToDenormModeWaitStates = 3;
2020
 2021 if (!ST.hasFPAtomicToDenormModeHazard())
 2022 return 0;
 2023 assert(!ST.hasExtendedWaitCounts());
 2024
2025 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2026 return 0;
2027
2028 auto IsHazardFn = [](const MachineInstr &I) {
 2029 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
 2030 return false;
2031 return SIInstrInfo::isFPAtomic(I);
2032 };
2033
2034 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2035 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2036 return true;
2037
2038 switch (MI.getOpcode()) {
2039 case AMDGPU::S_WAITCNT:
2040 case AMDGPU::S_WAITCNT_VSCNT:
2041 case AMDGPU::S_WAITCNT_VMCNT:
2042 case AMDGPU::S_WAITCNT_EXPCNT:
2043 case AMDGPU::S_WAITCNT_LGKMCNT:
2044 case AMDGPU::S_WAIT_IDLE:
2045 return true;
2046 default:
2047 break;
2048 }
2049
2050 return false;
2051 };
2052
2053 return FPAtomicToDenormModeWaitStates -
2054 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2055}
2056
2057int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2059
2060 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2061}
2062
2063int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2064 // Early exit if no padding is requested.
2065 if (MFMAPaddingRatio == 0)
2066 return 0;
2067
 2068 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
 2069 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2070 return 0;
2071
2072 int NeighborMFMALatency = 0;
2073 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2074 this](const MachineInstr &MI) {
2075 if (!SIInstrInfo::isMFMA(MI))
2076 return false;
2077
2078 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2079 return true;
2080 };
2081
2082 const int MaxMFMAPipelineWaitStates = 16;
2083 int WaitStatesSinceNeighborMFMA =
2084 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2085
2086 int NeighborMFMAPaddingNeeded =
2087 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2088 WaitStatesSinceNeighborMFMA;
2089
2090 return std::max(0, NeighborMFMAPaddingNeeded);
2091}
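// Worked example of the padding formula above (hypothetical values): with a
// neighboring MFMA latency of 16 cycles, amdgpu-mfma-padding-ratio=50 and 3 wait
// states already elapsed, the padding required is 16 * 50 / 100 - 3 = 5 cycles.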
2092
2093int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2094 int WaitStatesNeeded = 0;
2095 unsigned Opc = MI->getOpcode();
2096
2097 auto IsVALUFn = [](const MachineInstr &MI) {
2098 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2099 };
2100
2101 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2102 const int LegacyVALUWritesVGPRWaitStates = 2;
2103 const int VALUWritesExecWaitStates = 4;
2104 const int MaxWaitStates = 4;
2105
2106 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2107 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2108 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2109
2110 if (WaitStatesNeeded < MaxWaitStates) {
2111 for (const MachineOperand &Use : MI->explicit_uses()) {
2112 const int MaxWaitStates = 2;
2113
2114 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2115 continue;
2116
2117 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2118 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2119 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2120
2121 if (WaitStatesNeeded == MaxWaitStates)
2122 break;
2123 }
2124 }
2125 }
2126
2127 for (const MachineOperand &Op : MI->explicit_operands()) {
2128 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2129 continue;
2130
2131 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2132 continue;
2133
2134 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2135 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2136 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2137 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2138 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2139 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2140 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2141 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2142 const int MaxWaitStates = 18;
2143 Register Reg = Op.getReg();
2144 unsigned HazardDefLatency = 0;
2145
2146 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2147 this](const MachineInstr &MI) {
2148 if (!SIInstrInfo::isMFMA(MI))
2149 return false;
2150 Register DstReg = MI.getOperand(0).getReg();
2151 if (DstReg == Reg)
2152 return false;
2153 HazardDefLatency =
2154 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2155 return TRI.regsOverlap(DstReg, Reg);
2156 };
2157
2158 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2159 MaxWaitStates);
2160 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2161 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2162 int OpNo = Op.getOperandNo();
2163 if (OpNo == SrcCIdx) {
2164 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2165 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2166 switch (HazardDefLatency) {
2167 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2168 break;
2169 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2170 break;
2171 case 16: [[fallthrough]];
2172 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2173 break;
2174 }
2175 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2176 switch (HazardDefLatency) {
2177 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2178 break;
2179 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2180 break;
2181 case 16: [[fallthrough]];
2182 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2183 break;
2184 }
2185 }
2186
2187 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2188 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2189
2190 if (WaitStatesNeeded == MaxWaitStates)
2191 return WaitStatesNeeded; // Early exit.
2192
2193 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2194 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2195 return false;
2196 Register DstReg = MI.getOperand(0).getReg();
2197 return TRI.regsOverlap(Reg, DstReg);
2198 };
2199
2200 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2201 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2202 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2203 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2204 if (OpNo == SrcCIdx)
2205 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2206 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2207 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2208
2209 WaitStatesNeededForUse = NeedWaitStates -
2210 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2211 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2212
2213 if (WaitStatesNeeded == MaxWaitStates)
2214 return WaitStatesNeeded; // Early exit.
2215 }
2216
2217 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2218 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2219 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2220 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2221 const int MaxWaitStates = 13;
2222 Register DstReg = MI->getOperand(0).getReg();
2223 unsigned HazardDefLatency = 0;
2224
2225 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2226 this](const MachineInstr &MI) {
2227 if (!SIInstrInfo::isMFMA(MI))
2228 return false;
2229 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2230 HazardDefLatency =
2231 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2232 return TRI.regsOverlap(Reg, DstReg);
2233 };
2234
2235 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2236 int NeedWaitStates;
2237 switch (HazardDefLatency) {
2238 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2239 break;
2240 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2241 break;
2242 case 16: [[fallthrough]];
2243 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2244 break;
2245 }
2246
2247 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2248 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2249 }
2250
2251 // Pad neighboring MFMA with noops for better inter-wave performance.
2252 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2253
2254 return WaitStatesNeeded;
2255}
2256
2257static int
2258GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
 2259 bool IsGFX950) {
2260 // xdl def cycles | gfx940 | gfx950
2261 // 2 pass | 3 4
2262 // 4 pass | 5 6
2263 // 8 pass | 9 10
2264 // 16 pass | 17 18
2265 return NumPasses + 1 + IsGFX950;
2266}
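// Sanity check of the formula against the table above: an 8-pass producer gives
// NumPasses + 1 + IsGFX950, i.e. 9 wait states on gfx940 and 10 on gfx950.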
2267
2268static int
2269GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
 2270 bool IsGFX950) {
2271 // xdl def cycles | gfx940 | gfx950
2272 // 2 pass | 3 3
2273 // 4 pass | 5 6
2274 // 8 pass | 9 10
2275 // 16 pass | 17 18
2276 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2277}
2278
2279static int
2280GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
 2281 // 2 pass -> 2
2282 // 4 pass -> 4
2283 // 8 pass -> 8
2284 // 16 pass -> 16
2285 return NumPasses;
2286}
2287
2288static int
2289GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
 2290 // 2 pass -> 4
2291 // 4 pass -> 6
2292 // 8 pass -> 10
2293 // 16 pass -> 18
2294 return NumPasses + 2;
2295}
2296
2297static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
 2298 // 2 pass -> 5
2299 // 4 pass -> 7
2300 // 8 pass -> 11
2301 // 16 pass -> 19
2302 return NumPasses + 3;
2303}
2304
2305int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2306 int WaitStatesNeeded = 0;
2307 unsigned Opc = MI->getOpcode();
2308
2309 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
 2310 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
 2311 };
2312
2313 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
 2314 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
 2315 !SIInstrInfo::isDOT(MI);
 2316 };
2317
2318 if (!SIInstrInfo::isMFMA(*MI))
2319 return WaitStatesNeeded;
2320
2321 const int VALUWritesExecWaitStates = 4;
2322 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2323 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2324 VALUWritesExecWaitStates);
2325 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2326
2327 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2328
2329 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2330 for (const MachineOperand &Use : MI->explicit_uses()) {
2331 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2332 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2333 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2334 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2335 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2336 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2337 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2338 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2339 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2340 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2341 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2342 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2343 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2344 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2345 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2346 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2347 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2348 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2349 const int MaxWaitStates = 19;
2350
2351 if (!Use.isReg())
2352 continue;
2353 Register Reg = Use.getReg();
2354 bool FullReg;
2355 const MachineInstr *MI1;
2356
2357 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2358 this](const MachineInstr &MI) {
2359 if (!SIInstrInfo::isMFMA(MI))
2360 return false;
2361 Register DstReg = MI.getOperand(0).getReg();
2362 FullReg = (DstReg == Reg);
2363 MI1 = &MI;
2364 return TRI.regsOverlap(DstReg, Reg);
2365 };
2366
2367 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2368 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2369 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2370
2371 int NumWaitStates =
2372 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2373 if (NumWaitStates == std::numeric_limits<int>::max())
2374 continue;
2375
2376 int OpNo = Use.getOperandNo();
2377 unsigned Opc1 = MI1->getOpcode();
2378 int NeedWaitStates = 0;
2379 if (OpNo == SrcCIdx) {
2380 if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2381 NeedWaitStates = 0;
2382 } else if (FullReg) {
2383 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2384 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2385 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2386 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2387 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2388 else if (ST.hasGFX940Insts() &&
2389 TSchedModel.computeInstrLatency(MI1) == 2)
2390 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2391 } else {
2392 switch (Opc1) {
2393 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2394 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2395 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2396 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2397 if (!isXDL(ST, *MI))
2398 NeedWaitStates =
2399 ST.hasGFX950Insts()
2400 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2401 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2402 break;
2403 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2404 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2405 if (!isXDL(ST, *MI))
2406 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2407 break;
2408 default:
2409 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2410 if (ST.hasGFX940Insts()) {
2411 if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
2412 break;
2413
2414 NeedWaitStates =
2415 isXDL(ST, *MI1)
2416 ? (isXDL(ST, *MI)
 2417 ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
 2418 NumPasses, ST.hasGFX950Insts())
 2419 : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
 2420 NumPasses, ST.hasGFX950Insts()))
 2421 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
 2422 NumPasses);
2423 break;
2424 }
2425
2426 switch (NumPasses) {
2427 case 2:
2428 NeedWaitStates =
2429 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2430 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2431 break;
2432 case 8:
2433 NeedWaitStates =
2434 isDGEMM(Opc)
2435 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2436 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2437 break;
2438 case 16:
2439 NeedWaitStates =
2440 isDGEMM(Opc)
2441 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2442 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2443 break;
2444 default:
2445 llvm_unreachable("unexpected number of passes");
2446 }
2447 }
2448 }
2449 } else {
2450 switch (Opc1) {
2451 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2452 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2453 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2454 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2455 NeedWaitStates =
2456 ST.hasGFX950Insts()
2457 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2458 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2459 break;
2460 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2461 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2462 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2463 break;
2464 default:
2465 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2466
2467 if (ST.hasGFX940Insts()) {
2468 NeedWaitStates =
2469 isXDL(ST, *MI1)
 2470 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
 2471 NumPasses)
 2472 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
 2473 NumPasses);
2474 break;
2475 }
2476
2477 switch (NumPasses) {
2478 case 2:
2479 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2480 break;
2481 case 4:
2482 llvm_unreachable("unexpected number of passes for mfma");
2483 case 8:
2484 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2485 break;
2486 case 16:
2487 default:
2488 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2489 }
2490 }
2491 }
2492 if (WaitStatesNeeded >= NeedWaitStates)
2493 continue;
2494
2495 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2496 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2497
2498 if (WaitStatesNeeded == MaxWaitStates)
2499 break;
2500 }
2501
2502 // Pad neighboring MFMA with noops for better inter-wave performance.
2503 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2504
2505 return WaitStatesNeeded;
2506}
2507
2508int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2509 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2510 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2511 return 0;
2512
2513 int WaitStatesNeeded = 0;
2514
2515 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2516 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2517 };
2518
2519 for (const MachineOperand &Op : MI->explicit_uses()) {
2520 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2521 continue;
2522
2523 Register Reg = Op.getReg();
2524
2525 const int AccVgprReadLdStWaitStates = 2;
2526 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2527 const int MaxWaitStates = 2;
2528
2529 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2530 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2531 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2532
2533 if (WaitStatesNeeded == MaxWaitStates)
2534 return WaitStatesNeeded; // Early exit.
2535
2536 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2537 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2538 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2539 return false;
2540 auto IsVALUFn = [](const MachineInstr &MI) {
 2541 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
 2542 };
2543 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2544 std::numeric_limits<int>::max();
2545 };
2546
2547 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2548 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2549 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2550 }
2551
2552 return WaitStatesNeeded;
2553}
2554
2555int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
 2556 assert(ST.hasVcmpxPermlaneHazard() &&
 2557 "this is a different vcmpx+permlane hazard");
2558 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2559 const SIInstrInfo *TII = ST.getInstrInfo();
2560
2561 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2562 return isVCmpXWritesExec(*TII, *TRI, MI);
2563 };
2564
2565 auto IsVALUFn = [](const MachineInstr &MI) {
2566 return SIInstrInfo::isVALU(MI);
2567 };
2568
2569 const int VCmpXWritesExecWaitStates = 4;
2570 const int VALUWritesVDstWaitStates = 2;
2571 int WaitStatesNeeded = 0;
2572
2573 for (const MachineOperand &Op : MI->explicit_uses()) {
2574 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2575 continue;
2576 Register Reg = Op.getReg();
2577
2578 int WaitStatesSinceDef =
2579 VALUWritesVDstWaitStates -
2580 getWaitStatesSinceDef(Reg, IsVALUFn,
2581 /*MaxWaitStates=*/VALUWritesVDstWaitStates);
2582 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2583 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2584 break;
2585 }
2586
2587 int VCmpXHazardWaits =
2588 VCmpXWritesExecWaitStates -
2589 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2590
2591 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2592 return WaitStatesNeeded;
2593}
2594
2595static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
 2596 // 2 pass -> 4
2597 // 4 pass -> 6
2598 // 8 pass -> 10
2599 // 16 pass -> 18
2600 return NumPasses + 2;
2601}
2602
2603static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
 2604 // 2 pass -> 5
2605 // 4 pass -> 7
2606 // 8 pass -> 11
2607 // 16 pass -> 19
2608 return NumPasses + 3;
2609}
2610
2611static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
 2612 // 2 pass -> 5
2613 // 4 pass -> 7
2614 // 8 pass -> 11
2615 // 16 pass -> 19
2616 return NumPasses + 3;
2617}
2618
2619static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
 2620 // 2 pass -> 4
2621 // 4 pass -> 6
2622 // 8 pass -> 10
2623 // 16 pass -> 18
2624 return NumPasses + 2;
2625}
2626
2627int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2628 if (!ST.hasGFX90AInsts())
2629 return 0;
2630
2631 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2632 return isDGEMM(MI.getOpcode());
2633 };
2634
2635 // This is checked in checkMAIHazards90A()
2636 if (SIInstrInfo::isMFMA(*MI))
2637 return 0;
2638
2639 const MachineRegisterInfo &MRI = MF.getRegInfo();
2640
2641 int WaitStatesNeeded = 0;
2642
2643 bool IsMem = SIInstrInfo::isVMEM(*MI) ||
 2644 SIInstrInfo::isFLAT(*MI) ||
 2645 SIInstrInfo::isDS(*MI);
 2646 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2647 bool IsVALU = SIInstrInfo::isVALU(*MI);
2648
2649 const MachineInstr *MFMA = nullptr;
2650 unsigned Reg;
2651 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2652 if (!SIInstrInfo::isMFMA(MI) ||
2653 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2654 return false;
2655 MFMA = &MI;
2656 return true;
2657 };
2658
2659 const MachineInstr *DOT = nullptr;
2660 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2661 if (!SIInstrInfo::isDOT(MI) ||
2662 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2663 return false;
2664 DOT = &MI;
2665 return true;
2666 };
2667
2668 bool DGEMMAfterVALUWrite = false;
2669 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2670 // Found DGEMM on reverse traversal to def.
2671 if (isDGEMM(MI.getOpcode()))
2672 DGEMMAfterVALUWrite = true;
2673
 2674 // Only a hazard if the register is defined by a VALU and a DGEMM is found
 2675 // after the def.
2676 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2677 return false;
2678
2679 return true;
2680 };
2681
2682 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2683 AMDGPU::OpName::src2);
2684
2685 if (IsMemOrExport || IsVALU) {
2686 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2687 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2688 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2689 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2690 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2691 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2692 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2693 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2694 const int DotWriteSameDotReadSrcAB = 3;
2695 const int DotWriteDifferentVALURead = 3;
2696 const int DMFMABetweenVALUWriteVMEMRead = 2;
2697 const int MaxWaitStates = 19;
2698
2699 for (const MachineOperand &Use : MI->explicit_uses()) {
2700 if (!Use.isReg())
2701 continue;
2702 Reg = Use.getReg();
2703
2704 DOT = nullptr;
2705 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2706 MaxWaitStates);
2707 if (DOT) {
2708 int NeedWaitStates = 0;
2709 if (DOT->getOpcode() == MI->getOpcode()) {
2710 if (&Use - &MI->getOperand(0) != SrcCIdx)
2711 NeedWaitStates = DotWriteSameDotReadSrcAB;
2712 } else {
2713 NeedWaitStates = DotWriteDifferentVALURead;
2714 }
2715
2716 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2717 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2718 }
2719
2720 // Workaround for HW data hazard bug observed only in GFX90A. When there
2721 // is a DGEMM instruction in-between a VALU and a VMEM instruction it
2722 // causes the SQ to incorrectly not insert two wait states between the two
2723 // instructions needed to avoid data hazard.
2724 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2725 DGEMMAfterVALUWrite = false;
2726 if (TRI.isVectorRegister(MRI, Reg)) {
2727 int WaitStatesNeededForUse =
2728 DMFMABetweenVALUWriteVMEMRead -
2729 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2730 DMFMABetweenVALUWriteVMEMRead);
2731
2732 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2733 }
2734 }
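// Illustrative only (hypothetical sequence) of the GFX90A case described above:
//   v_mov_b32 v4, 0              ; VALU writes v4
//   v_mfma_f64_16x16x4f64 ...    ; DGEMM issued in between
//   flat_store_dword v[0:1], v4  ; VMEM reads v4
// for which up to DMFMABetweenVALUWriteVMEMRead (2) wait states are inserted.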
2735
2736 MFMA = nullptr;
2737 WaitStatesSinceDef =
2738 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2739 if (!MFMA)
2740 continue;
2741
2742 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2743 int NumPasses = HazardDefLatency;
2744 int NeedWaitStates = MaxWaitStates;
2745
2746 if (isDGEMM(MFMA->getOpcode())) {
2747 switch (HazardDefLatency) {
2748 case 4:
2749 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2750 : DMFMA4x4WriteVgprVALUReadWaitStates;
2751 break;
2752 case 8:
2753 case 16:
2754 NeedWaitStates =
2755 IsMemOrExport
2756 ? DMFMA16x16WriteVgprMemExpReadWaitStates
2757 : (ST.hasGFX950Insts()
2758 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
2759 : DMFMA16x16WriteVgprVALUReadWaitStates);
2760 break;
2761 default:
2762 llvm_unreachable("unexpected dgemm");
2763 }
2764 } else if (ST.hasGFX940Insts()) {
2765 NeedWaitStates =
2766 isXDL(ST, *MFMA)
 2767 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
 2768 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
 2769 NumPasses);
2770 } else {
2771 switch (HazardDefLatency) {
2772 case 2:
2773 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2774 break;
2775 case 8:
2776 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2777 break;
2778 case 16:
2779 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2780 break;
2781 default:
2782 llvm_unreachable("unexpected number of passes for mfma");
2783 }
2784 }
2785
2786 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2787 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2788
2789 if (WaitStatesNeeded == MaxWaitStates)
2790 break;
2791 }
2792 }
2793
2794 unsigned Opc = MI->getOpcode();
2795 const int DMFMAToFMA64WaitStates = 2;
2796 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2797 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2798 Opc == AMDGPU::V_FMAC_F64_dpp) &&
2799 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2800 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2801 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2802 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2803 }
2804
2805 if (!IsVALU && !IsMemOrExport)
2806 return WaitStatesNeeded;
2807
2808 for (const MachineOperand &Def : MI->defs()) {
2809 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2810 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2811 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2812 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2813 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2814 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2815 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2816 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2817 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2818 const int DotWriteDifferentVALUWrite = 3;
2819 const int MaxWaitStates = 19;
2820 const int MaxWarWaitStates = 15;
2821
2822 Reg = Def.getReg();
2823
2824 DOT = nullptr;
2825 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2826 MaxWaitStates);
2827 if (DOT && DOT->getOpcode() != MI->getOpcode())
2828 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2829 WaitStatesSinceDef);
2830
2831 MFMA = nullptr;
2832 WaitStatesSinceDef =
2833 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2834 if (MFMA) {
2835 int NeedWaitStates = MaxWaitStates;
2836 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2837
2838 if (isDGEMM(MFMA->getOpcode())) {
2839 switch (NumPasses) {
2840 case 4:
2841 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2842 break;
2843 case 8:
2844 case 16:
2845 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2846 break;
2847 default:
2848 llvm_unreachable("unexpected number of cycles for dgemm");
2849 }
2850 } else if (ST.hasGFX940Insts()) {
2851 NeedWaitStates =
2852 isXDL(ST, *MFMA)
 2853 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
 2854 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
 2855 } else {
2856 switch (NumPasses) {
2857 case 2:
2858 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2859 break;
2860 case 8:
2861 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2862 break;
2863 case 16:
2864 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2865 break;
2866 default:
2867 llvm_unreachable("Unexpected number of passes for mfma");
2868 }
2869 }
2870
2871 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2872 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2873
2874 if (WaitStatesNeeded == MaxWaitStates)
2875 break;
2876 }
2877
2878 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2879 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2880 !MI.readsRegister(Reg, &TRI))
2881 return false;
2882
2883 if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2884 return false;
2885
2886 const MachineOperand *SrcC =
2887 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2888 assert(SrcC);
2889 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2890 return false;
2891
2892 MFMA = &MI;
2893 return true;
2894 };
2895
2896 MFMA = nullptr;
2897 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2898 MaxWarWaitStates);
2899 if (!MFMA)
2900 continue;
2901
2902 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2903 int NeedWaitStates = MaxWaitStates;
2904 switch (HazardDefLatency) {
2905 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2906 break;
2907 case 4: assert(ST.hasGFX940Insts());
2908 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2909 break;
2910 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2911 break;
2912 case 16: [[fallthrough]];
2913 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2914 break;
2915 }
2916
2917 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2918 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2919 }
2920
2921 return WaitStatesNeeded;
2922}
2923
2924bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
 2925 if (!SU->isInstr())
2926 return false;
2927
2928 const MachineInstr *MAI = nullptr;
2929
2930 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2931 MAI = nullptr;
 2932 if (SIInstrInfo::isMFMA(MI))
 2933 MAI = &MI;
2934 return MAI != nullptr;
2935 };
2936
2937 MachineInstr *MI = SU->getInstr();
2938 if (IsMFMAFn(*MI)) {
2939 int W = getWaitStatesSince(IsMFMAFn, 16);
2940 if (MAI)
2941 return W < (int)TSchedModel.computeInstrLatency(MAI);
2942 }
2943
2944 return false;
2945}
2946
2947// Adjust global offsets for instructions bundled with S_GETPC_B64 after
2948// insertion of a new instruction.
2949static void updateGetPCBundle(MachineInstr *NewMI) {
2950 if (!NewMI->isBundled())
2951 return;
2952
2953 // Find start of bundle.
2954 auto I = NewMI->getIterator();
2955 while (I->isBundledWithPred())
2956 I--;
2957 if (I->isBundle())
2958 I++;
2959
2960 // Bail if this is not an S_GETPC bundle.
2961 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
2962 return;
2963
2964 // Update offsets of any references in the bundle.
2965 const unsigned NewBytes = 4;
2966 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2967 "Unexpected instruction insertion in bundle");
2968 auto NextMI = std::next(NewMI->getIterator());
2969 auto End = NewMI->getParent()->end();
2970 while (NextMI != End && NextMI->isBundledWithPred()) {
2971 for (auto &Operand : NextMI->operands()) {
2972 if (Operand.isGlobal())
2973 Operand.setOffset(Operand.getOffset() + NewBytes);
2974 }
2975 NextMI++;
2976 }
2977}
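// Sketch of the bundle being adjusted (hypothetical symbol "sym"):
//   s_getpc_b64 s[0:1]
//   s_add_u32   s0, s0, sym@rel32@lo+4
//   s_addc_u32  s1, s1, sym@rel32@hi+12
// Inserting a 4-byte S_WAITCNT_DEPCTR into the bundle pushes the trailing
// instructions further from the captured PC, so each global-symbol offset is
// bumped by NewBytes (4).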
2978
2979bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2980 if (!ST.hasVALUMaskWriteHazard())
2981 return false;
 2982 assert(!ST.hasExtendedWaitCounts());
 2983
2984 if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2985 return false;
2986
2987 // The hazard sequence is three instructions:
2988 // 1. VALU reads SGPR as mask
2989 // 2. SALU writes SGPR
2990 // 3. SALU reads SGPR
2991 // The hazard can expire if the distance between 2 and 3 is sufficient.
2992 // In practice this happens <10% of the time, hence this always assumes
2993 // the hazard exists if 1 and 2 are present to avoid searching.
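// A concrete (hypothetical) instance of the three-step sequence above:
//   v_cndmask_b32 v0, v1, v2, s[4:5]     ; 1. VALU reads s[4:5] as mask
//   s_mov_b64     s[4:5], exec           ; 2. SALU writes s[4:5]
//   s_and_b64     s[6:7], s[4:5], s[8:9] ; 3. SALU reads s[4:5]
// The fix below appends s_waitcnt_depctr sa_sdst(0) directly after step 2.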
2994
2995 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2996 if (!SDSTOp || !SDSTOp->isReg())
2997 return false;
2998
2999 const Register HazardReg = SDSTOp->getReg();
3000 if (HazardReg == AMDGPU::EXEC ||
3001 HazardReg == AMDGPU::EXEC_LO ||
3002 HazardReg == AMDGPU::EXEC_HI ||
3003 HazardReg == AMDGPU::M0)
3004 return false;
3005
3006 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
3007 switch (I.getOpcode()) {
3008 case AMDGPU::V_ADDC_U32_e32:
3009 case AMDGPU::V_ADDC_U32_dpp:
3010 case AMDGPU::V_CNDMASK_B16_e32:
3011 case AMDGPU::V_CNDMASK_B16_dpp:
3012 case AMDGPU::V_CNDMASK_B32_e32:
3013 case AMDGPU::V_CNDMASK_B32_dpp:
3014 case AMDGPU::V_DIV_FMAS_F32_e64:
3015 case AMDGPU::V_DIV_FMAS_F64_e64:
3016 case AMDGPU::V_SUBB_U32_e32:
3017 case AMDGPU::V_SUBB_U32_dpp:
3018 case AMDGPU::V_SUBBREV_U32_e32:
3019 case AMDGPU::V_SUBBREV_U32_dpp:
3020 // These implicitly read VCC as mask source.
3021 return HazardReg == AMDGPU::VCC ||
3022 HazardReg == AMDGPU::VCC_LO ||
3023 HazardReg == AMDGPU::VCC_HI;
3024 case AMDGPU::V_ADDC_U32_e64:
3025 case AMDGPU::V_ADDC_U32_e64_dpp:
3026 case AMDGPU::V_CNDMASK_B16_e64:
3027 case AMDGPU::V_CNDMASK_B16_e64_dpp:
3028 case AMDGPU::V_CNDMASK_B32_e64:
3029 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3030 case AMDGPU::V_SUBB_U32_e64:
3031 case AMDGPU::V_SUBB_U32_e64_dpp:
3032 case AMDGPU::V_SUBBREV_U32_e64:
3033 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3034 // Only check mask register overlaps.
3035 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
3036 assert(SSRCOp);
3037 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
3038 }
3039 default:
3040 return false;
3041 }
3042 };
3043
3044 const MachineRegisterInfo &MRI = MF.getRegInfo();
3045 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
3046 // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
3047 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3048 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3049 return true;
3050
3051 // VALU access to any SGPR or literal constant other than HazardReg
3052 // mitigates hazard. No need to check HazardReg here as this will
3053 // only be called when !IsHazardFn.
3054 if (!SIInstrInfo::isVALU(I))
3055 return false;
3056 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
3057 const MachineOperand &Op = I.getOperand(OpNo);
3058 if (Op.isReg()) {
3059 Register OpReg = Op.getReg();
3060 // Only consider uses
3061 if (!Op.isUse())
3062 continue;
3063 // Ignore EXEC
3064 if (OpReg == AMDGPU::EXEC ||
3065 OpReg == AMDGPU::EXEC_LO ||
3066 OpReg == AMDGPU::EXEC_HI)
3067 continue;
3068 // Ignore all implicit uses except VCC
3069 if (Op.isImplicit()) {
3070 if (OpReg == AMDGPU::VCC ||
3071 OpReg == AMDGPU::VCC_LO ||
3072 OpReg == AMDGPU::VCC_HI)
3073 return true;
3074 continue;
3075 }
3076 if (TRI.isSGPRReg(MRI, OpReg))
3077 return true;
3078 } else {
3079 const MCInstrDesc &InstDesc = I.getDesc();
3080 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
3081 if (!TII.isInlineConstant(Op, OpInfo))
3082 return true;
3083 }
3084 }
3085 return false;
3086 };
3087
3088 // Check for hazard
3089 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
3090 std::numeric_limits<int>::max())
3091 return false;
3092
3093 auto NextMI = std::next(MI->getIterator());
3094
3095 // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
3096 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3097 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
 3098 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
 3099
3100 // SALU write may be s_getpc in a bundle.
3101 updateGetPCBundle(NewMI);
3102
3103 return true;
3104}
3105
3106// Return the numeric ID 0-63 of a 64b SGPR pair for a given SGPR.
3107// i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc.
3108static std::optional<unsigned> sgprPairNumber(Register Reg,
3109 const SIRegisterInfo &TRI) {
3110 switch (Reg) {
3111 case AMDGPU::M0:
3112 case AMDGPU::EXEC:
3113 case AMDGPU::EXEC_LO:
3114 case AMDGPU::EXEC_HI:
3115 case AMDGPU::SGPR_NULL:
3116 case AMDGPU::SGPR_NULL64:
3117 return {};
3118 default:
3119 break;
3120 }
3121 unsigned RegN = TRI.getEncodingValue(Reg);
3122 if (RegN > 127)
3123 return {};
3124 return (RegN >> 1) & 0x3f;
3125}
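// Worked example, assuming the encoding value of SGPR4 is 4: both SGPR4 and
// SGPR5 give (RegN >> 1) & 0x3f == 2, i.e. both halves of the SGPR4_SGPR5 pair
// map to pair id 2.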
3126
3127// For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
3128void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
3129 assert(MMF == &MF);
3130
3131 // Assume non-empty vector means it has already been computed.
3132 if (!VALUReadHazardSGPRs.empty())
3133 return;
3134
 3135 auto CallingConv = MF.getFunction().getCallingConv();
 3136 bool IsCallFree =
3137 AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
3138
3139 // Exhaustive search is only viable in non-caller/callee functions where
3140 // VALUs will be exposed to the hazard recognizer.
3141 UseVALUReadHazardExhaustiveSearch =
3142 IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
 3143 MF.getInstructionCount() <= MaxExhaustiveHazardSearch;
 3144
3145 // Consider all SGPRs hazards if the shader uses function calls or is callee.
3146 bool UseVALUUseCache =
3147 IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
3148 VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
3149 if (!UseVALUUseCache)
3150 return;
3151
3152 // Perform a post ordered reverse scan to find VALUs which read an SGPR
3153 // before a SALU write to the same SGPR. This provides a reduction in
3154 // hazard insertion when all VALU access to an SGPR occurs after its last
3155 // SALU write, when compared to a linear scan.
3156 const MachineRegisterInfo &MRI = MF.getRegInfo();
3157 BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
 3158 MachineCycleInfo CI;
 3159 CI.compute(*MMF);
3160
3161 for (auto *MBB : post_order(&MF)) {
3162 bool InCycle = CI.getCycle(MBB) != nullptr;
3163 for (auto &MI : reverse(MBB->instrs())) {
3164 bool IsVALU = SIInstrInfo::isVALU(MI);
3165 bool IsSALU = SIInstrInfo::isSALU(MI);
3166 if (!IsVALU && !IsSALU)
3167 continue;
3168
3169 for (const MachineOperand &Op : MI.operands()) {
3170 if (!Op.isReg())
3171 continue;
3172 Register Reg = Op.getReg();
3173 assert(!Op.getSubReg());
3174 // Only consider implicit operands of VCC.
3175 if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
3176 Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
3177 continue;
3178 if (!TRI.isSGPRReg(MRI, Reg))
3179 continue;
3180 auto RegN = sgprPairNumber(Reg, TRI);
3181 if (!RegN)
3182 continue;
3183 if (IsVALU && Op.isUse()) {
3184 // Note: any access within a cycle must be considered a hazard.
3185 if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN]))
3186 VALUReadHazardSGPRs.set(*RegN);
3187 ReadSGPRs.set(*RegN);
3188 } else if (IsSALU) {
3189 if (Op.isDef())
3190 SALUWriteSGPRs.set(*RegN);
3191 else
3192 ReadSGPRs.set(*RegN);
3193 }
3194 }
3195 }
3196 }
3197}
3198
3199bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
3200 if (!ST.hasVALUReadSGPRHazard())
3201 return false;
3202
3203 // The hazard sequence is fundamentally three instructions:
3204 // 1. VALU reads SGPR
3205 // 2. SALU writes SGPR
3206 // 3. VALU/SALU reads SGPR
3207 // Try to avoid searching for (1) because the expiry point of the hazard is
3208 // indeterminate; however, the hazard between (2) and (3) can expire if the
3209 // gap contains sufficient SALU instructions with no usage of SGPR from (1).
3210 // Note: SGPRs must be considered as 64-bit pairs as hazard exists
3211 // even if individual SGPRs are accessed.
3212
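// Sketch of the sequence (hypothetical registers), noting the 64-bit pairing:
//   v_add_nc_u32 v0, s2, v1    ; 1. VALU reads s2 (pair s[2:3])
//   s_mov_b32    s3, 0         ; 2. SALU writes s3 (same pair)
//   s_add_u32    s4, s3, s5    ; 3. SALU reads s3
// The mitigation is an s_wait_alu sa_sdst(0) placed between (2) and (3).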
3213 bool MIIsSALU = SIInstrInfo::isSALU(*MI);
3214 bool MIIsVALU = SIInstrInfo::isVALU(*MI);
3215 if (!(MIIsSALU || MIIsVALU))
3216 return false;
3217
3218 // Avoid expensive search when compile time is priority by
3219 // mitigating every SALU which writes an SGPR.
 3220 if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
 3221 if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
 3222 return false;
3223
3224 const MachineOperand *SDSTOp =
3225 TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
3226 if (!SDSTOp || !SDSTOp->isReg())
3227 return false;
3228
3229 const Register HazardReg = SDSTOp->getReg();
3230 if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
3231 HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
3232 return false;
3233
3234 // Add s_wait_alu sa_sdst(0) after SALU write.
3235 auto NextMI = std::next(MI->getIterator());
3236 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3237 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
 3238 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
 3239
3240 // SALU write may be s_getpc in a bundle.
3241 updateGetPCBundle(NewMI);
3242
3243 return true;
3244 }
3245
3246 // Pre-compute set of SGPR pairs read by VALUs.
3247 // Note: pass mutable pointer to MachineFunction for CycleInfo.
3248 computeVALUHazardSGPRs(MI->getMF());
3249
3250 // If no VALUs hazard SGPRs exist then nothing to do.
3251 if (VALUReadHazardSGPRs.none())
3252 return false;
3253
 3254 // All SGPR writes before a call/return must be flushed as the callee/caller
 3255 // will not see the hazard chain, i.e. (2) to (3) described above.
3256 const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
3257 !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
3258 MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);
3259
3260 // Collect all SGPR sources for MI which are read by a VALU.
3261 const MachineRegisterInfo &MRI = MF.getRegInfo();
3262 SmallSet<Register, 4> SGPRsUsed;
3263
3264 if (!IsSetPC) {
3265 for (const MachineOperand &Op : MI->all_uses()) {
3266 Register OpReg = Op.getReg();
3267
3268 // Only consider VCC implicit uses on VALUs.
3269 // The only expected SALU implicit access is SCC which is no hazard.
3270 if (MIIsSALU && Op.isImplicit())
3271 continue;
3272
3273 if (!TRI.isSGPRReg(MRI, OpReg))
3274 continue;
3275
3276 auto RegN = sgprPairNumber(OpReg, TRI);
3277 if (!RegN)
3278 continue;
3279
3280 if (!VALUReadHazardSGPRs[*RegN])
3281 continue;
3282
3283 SGPRsUsed.insert(OpReg);
3284 }
3285
3286 // No SGPRs -> nothing to do.
3287 if (SGPRsUsed.empty())
3288 return false;
3289 }
3290
3291 // A hazard is any SALU which writes one of the SGPRs read by MI.
3292 auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
3293 if (!SIInstrInfo::isSALU(I))
3294 return false;
3295 // Ensure SGPR flush before call/return by conservatively assuming every
3296 // SALU writes an SGPR.
3297 if (IsSetPC && I.getNumDefs() > 0)
3298 return true;
3299 // Check for any register writes.
3300 return any_of(SGPRsUsed, [this, &I](Register Reg) {
3301 return I.modifiesRegister(Reg, &TRI);
3302 });
3303 };
3304
3305 const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11;
3306 auto IsExpiredFn = [&](const MachineInstr &I, int Count) {
3307 if (Count >= SALUExpiryCount)
3308 return true;
3309 // s_wait_alu sa_sdst(0) on path mitigates hazard.
3310 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3311 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3312 return true;
3313 return false;
3314 };
3315
3316 auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {
3317 // Only count true SALUs as wait states.
 3318 if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
 3319 return 0;
3320 // SALU must be unrelated to any hazard registers.
3321 if (any_of(SGPRsUsed,
3322 [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); }))
3323 return 0;
3324 return 1;
3325 };
3326
3327 // Check for the hazard.
 3328 DenseSet<const MachineBasicBlock *> Visited;
 3329 int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
3330 std::next(MI->getReverseIterator()), 0,
3331 IsExpiredFn, Visited, WaitStatesFn);
3332
3333 if (WaitStates >= SALUExpiryCount)
3334 return false;
3335
3336 // Validate hazard through an exhaustive search.
3337 if (UseVALUReadHazardExhaustiveSearch) {
3338 // A hazard is any VALU which reads one of the paired SGPRs read by MI.
3339 // This is searching for (1) in the hazard description.
3340 auto hazardPair = [this](Register Reg) {
3341 if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
3342 return Register(AMDGPU::VCC);
3343 auto RegN = sgprPairNumber(Reg, TRI);
3344 return Register(AMDGPU::SGPR0_SGPR1 + *RegN);
3345 };
3346 auto SearchHazardFn = [this, hazardPair,
3347 &SGPRsUsed](const MachineInstr &I) {
3348 if (!SIInstrInfo::isVALU(I))
3349 return false;
3350 // Check for any register reads.
3351 return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
3352 return I.readsRegister(hazardPair(Reg), &TRI);
3353 });
3354 };
3355 auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {
3356 return false;
3357 };
3358 if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
3359 std::numeric_limits<int>::max())
3360 return false;
3361 }
3362
3363 // Add s_wait_alu sa_sdst(0) before SALU read.
3364 auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3365 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
 3366 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
 3367
3368 // SALU read may be after s_getpc in a bundle.
3369 updateGetPCBundle(NewMI);
3370
3371 return true;
3372}
3373
3374static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3375 const SIInstrInfo &TII) {
3376 MachineBasicBlock &EntryMBB = MF->front();
3377 if (EntryMBB.begin() != EntryMBB.end()) {
3378 auto &EntryMI = *EntryMBB.begin();
3379 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3380 EntryMI.getOperand(0).getImm() >= Priority)
3381 return false;
3382 }
3383
3384 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3385 .addImm(Priority);
3386 return true;
3387}
3388
3389bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3390 if (!ST.hasRequiredExportPriority())
3391 return false;
3392
3393 // Assume the following shader types will never have exports,
3394 // and avoid adding or adjusting S_SETPRIO.
3395 MachineBasicBlock *MBB = MI->getParent();
3396 MachineFunction *MF = MBB->getParent();
3397 auto CC = MF->getFunction().getCallingConv();
3398 switch (CC) {
 3399 case CallingConv::AMDGPU_CS:
 3400 case CallingConv::AMDGPU_CS_Chain:
 3401 case CallingConv::AMDGPU_CS_ChainPreserve:
 3402 case CallingConv::AMDGPU_KERNEL:
 3403 return false;
3404 default:
3405 break;
3406 }
3407
3408 const int MaxPriority = 3;
3409 const int NormalPriority = 2;
3410 const int PostExportPriority = 0;
3411
3412 auto It = MI->getIterator();
3413 switch (MI->getOpcode()) {
3414 case AMDGPU::S_ENDPGM:
3415 case AMDGPU::S_ENDPGM_SAVED:
3416 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3417 case AMDGPU::SI_RETURN_TO_EPILOG:
3418 // Ensure shader with calls raises priority at entry.
3419 // This ensures correct priority if exports exist in callee.
3420 if (MF->getFrameInfo().hasCalls())
3421 return ensureEntrySetPrio(MF, NormalPriority, TII);
3422 return false;
3423 case AMDGPU::S_SETPRIO: {
3424 // Raise minimum priority unless in workaround.
3425 auto &PrioOp = MI->getOperand(0);
3426 int Prio = PrioOp.getImm();
3427 bool InWA = (Prio == PostExportPriority) &&
3428 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3429 if (InWA || Prio >= NormalPriority)
3430 return false;
3431 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3432 return true;
3433 }
3434 default:
3435 if (!TII.isEXP(*MI))
3436 return false;
3437 break;
3438 }
3439
3440 // Check entry priority at each export (as there will only be a few).
3441 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3442 bool Changed = false;
 3443 if (CC != CallingConv::AMDGPU_Gfx)
 3444 Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3445
3446 auto NextMI = std::next(It);
3447 bool EndOfShader = false;
3448 if (NextMI != MBB->end()) {
3449 // Only need WA at end of sequence of exports.
3450 if (TII.isEXP(*NextMI))
3451 return Changed;
3452 // Assume appropriate S_SETPRIO after export means WA already applied.
3453 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3454 NextMI->getOperand(0).getImm() == PostExportPriority)
3455 return Changed;
3456 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3457 }
3458
3459 const DebugLoc &DL = MI->getDebugLoc();
3460
3461 // Lower priority.
3462 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3463 .addImm(PostExportPriority);
3464
3465 if (!EndOfShader) {
3466 // Wait for exports to complete.
3467 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3468 .addReg(AMDGPU::SGPR_NULL)
3469 .addImm(0);
3470 }
3471
3472 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3473 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3474
3475 if (!EndOfShader) {
3476 // Return to normal (higher) priority.
3477 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3478 .addImm(NormalPriority);
3479 }
3480
3481 return true;
3482}
unsigned const MachineRegisterInfo * MRI
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
bool End
Definition: ELF_riscv.cpp:480
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand with a Dest sel fowarding issue Dst .
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI)
static bool isLdsDma(const MachineInstr &MI)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static std::optional< unsigned > sgprPairNumber(Register Reg, const SIRegisterInfo &TRI)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static bool hasHazard(StateT State, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, DenseSet< const MachineBasicBlock * > &Visited)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static bool isDGEMM(unsigned Opcode)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > MaxExhaustiveHazardSearch("amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden, cl::desc("Maximum function size for exhausive hazard search"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
if(PassOpts->AAPipeline)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static const uint32_t IV[8]
Definition: blake3_impl.h:78
void resize(unsigned N, bool t=false)
resize - Grow or shrink the bitvector.
Definition: BitVector.h:341
bool anyCommon(const BitVector &RHS) const
Test if any common bits are set.
Definition: BitVector.h:489
BitVector & set()
Definition: BitVector.h:351
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:188
bool empty() const
empty - Tests whether there are no bits in this bitvector.
Definition: BitVector.h:156
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
Definition: Function.h:277
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be scheduled.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard state.
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot issue without a hazard.
unsigned PreEmitNoopsCommon(MachineInstr *)
bool ShouldPreferAnother(SUnit *SU) override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
GCNHazardRecognizer(const MachineFunction &MF)
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot issue without a hazard.
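Taken together, the overrides above cover the recognizer's two usage modes: a scheduler queries getHazardType()/ShouldPreferAnother() per SUnit, while a standalone pass runs it in hazard-recognizer mode via PreEmitNoops(). A hypothetical driver loop for the second mode follows; the function name is illustrative, and a MachineInstr* overload of EmitInstruction is assumed alongside the SUnit overload listed above.

#include "GCNHazardRecognizer.h"
#include "SIInstrInfo.h"

using namespace llvm;

static void fixHazardsInBlock(MachineBasicBlock &MBB,
                              GCNHazardRecognizer &HazardRec,
                              const SIInstrInfo &TII) {
  for (MachineInstr &MI : MBB) {
    // Number of wait states that must elapse before MI can safely issue.
    unsigned Noops = HazardRec.PreEmitNoops(&MI);
    if (Noops > 0)
      TII.insertNoops(MBB, MI.getIterator(), Noops);  // emits s_nop(s)
    HazardRec.EmitInstruction(&MI);  // assumed MachineInstr* overload
    HazardRec.AdvanceCycle();
  }
}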
bool hasShift64HighRegBug() const
bool hasFPAtomicToDenormModeHazard() const
bool hasLdsBranchVmemWARHazard() const
bool hasGFX90AInsts() const
bool hasDstSelForwardingHazard() const
bool hasMAIInsts() const
Definition: GCNSubtarget.h:837
bool hasCvtScaleForwardingHazard() const
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:279
bool hasVALUMaskWriteHazard() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasVcmpxExecWARHazard() const
bool hasReadM0MovRelInterpHazard() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:291
bool hasRequiredExportPriority() const
bool hasLdsWaitVMSRC() const
bool hasExtendedWaitCounts() const
bool hasVcmpxPermlaneHazard() const
bool hasGFX950Insts() const
bool has12DWordStoreHazard() const
bool hasVALUPartialForwardingHazard() const
bool hasNoDataDepHazard() const
Definition: GCNSubtarget.h:950
unsigned getSetRegWaitStates() const
Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
Definition: GCNSubtarget.h:519
bool hasTransForwardingHazard() const
bool hasGFX940Insts() const
bool hasReadM0LdsDmaHazard() const
bool hasVALUReadSGPRHazard() const
bool hasSMEMtoVectorWriteHazard() const
bool hasVMEMtoScalarWriteHazard() const
bool hasNSAtoVMEMBug() const
bool hasVDecCoExecHazard() const
bool hasReadM0SendMsgHazard() const
bool hasReadM0LdsDirectHazard() const
bool isXNACKEnabled() const
Definition: GCNSubtarget.h:619
bool hasSMRDReadVALUDefHazard() const
A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR was written by a VALU instruction.
Definition: GCNSubtarget.h:504
bool hasRFEHazards() const
Definition: GCNSubtarget.h:514
bool hasVMEMReadSGPRVALUDefHazard() const
A read of an SGPR by a VMEM instruction requires 5 wait states when the SGPR was written by a VALU instruction.
Definition: GCNSubtarget.h:510
bool isWave64() const
bool hasVALUTransUseHazard() const
bool hasLdsDirect() const
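Most of the GCNSubtarget predicates above gate one checkXXXHazards() routine in this file; when the predicate is false the check is skipped entirely. A simplified illustration using the documented SMRD-read-after-VALU-write hazard (4 wait states, per the entry above); the helper name is hypothetical:

#include "GCNSubtarget.h"

// Targets with this hazard need 4 wait states between a VALU write of an
// SGPR and a later SMRD read of that SGPR; other targets need none.
static int smrdReadVALUDefWaitStates(const llvm::GCNSubtarget &ST) {
  return ST.hasSMRDReadVALUDefHazard() ? 4 : 0;
}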
void compute(FunctionT &F)
Compute the cycle info for a function.
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
This holds information about one operand of a machine instruction, indicating the register class for register operands, etc.
Definition: MCInstrDesc.h:85
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
Instructions::const_reverse_iterator const_reverse_instr_iterator
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< pred_iterator > predecessors()
bool hasCalls() const
Return true if the current function has any function calls.
unsigned getInstructionCount() const
Return the number of MachineInstrs in this MachineFunction.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
const MachineBasicBlock & front() const
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
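These MachineInstrBuilder calls are how the recognizer's fixup paths materialize new instructions. As a hedged sketch, inserting a single s_nop before an instruction looks roughly like this (AMDGPU's s_nop immediate adds that many extra wait cycles, so imm 0..7 covers 1..8 wait states; the wrapper name is illustrative):

#include <cassert>
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

using namespace llvm;

static void emitSNop(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                     const DebugLoc &DL, const SIInstrInfo &TII,
                     unsigned WaitStates) {
  assert(WaitStates >= 1 && WaitStates <= 8 && "one s_nop covers 1..8 states");
  // s_nop N stalls for N + 1 cycles.
  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_NOP)).addImm(WaitStates - 1);
}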
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:347
bool isBundle() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction is part of a bundle.
Definition: MachineInstr.h:472
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:798
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:560
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:438
static bool isSOPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:478
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:550
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:542
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:655
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:414
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:518
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isVINTRP(const MachineInstr &MI)
Definition: SIInstrInfo.h:790
static bool isDOT(const MachineInstr &MI)
Definition: SIInstrInfo.h:811
static bool isSWMMAC(const MachineInstr &MI)
Definition: SIInstrInfo.h:827
static bool isLDSDIR(const MachineInstr &MI)
Definition: SIInstrInfo.h:839
bool isBufferSMRD(const MachineInstr &MI) const
static bool isTRANS(const MachineInstr &MI)
Definition: SIInstrInfo.h:774
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:534
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:624
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:766
static bool isMFMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:806
static bool isFPAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:922
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:586
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:815
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:618
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:422
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:378
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:390
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
Definition: SmallSet.h:132
bool empty() const
Definition: SmallSet.h:168
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:51
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:470
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
ProcResIter getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCSchedClassDesc * resolveSchedClass(const MachineInstr *MI) const
Return the MCSchedClassDesc for this instruction.
ProcResIter getWriteProcResBegin(const MCSchedClassDesc *SC) const
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:31
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:132
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
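These field helpers operate on the immediate of s_waitcnt_depctr, which several of the newer hazard workarounds in this file rewrite. A hedged sketch, assuming the helpers live in the AMDGPU::DepCtr namespace of AMDGPUBaseInfo.h and that a field value of 0 means "wait for all outstanding operations of that kind"; the wrapper name is illustrative:

#include "Utils/AMDGPUBaseInfo.h"

// Tighten an existing depctr immediate so it also waits for VA_VDST = 0
// (VALU VGPR writes done) and VM_VSRC = 0 (VMEM VGPR source reads done).
static unsigned tightenDepCtr(unsigned DepCtrImm) {
  using namespace llvm::AMDGPU::DepCtr;
  DepCtrImm = encodeFieldVaVdst(DepCtrImm, 0);
  DepCtrImm = encodeFieldVmVsrc(DepCtrImm, 0);
  return DepCtrImm;
}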
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values into Vmcnt, Expcnt and Lgkmcnt respectively.
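decodeWaitcnt() is the counterpart used when an existing s_waitcnt already covers part of a hazard. A small sketch, assuming a GCNSubtarget reference is at hand; the wrapper name is illustrative:

#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"

// Extract the VM counter from an s_waitcnt immediate for the current ISA.
static unsigned vmcntOf(const llvm::GCNSubtarget &ST, unsigned WaitcntImm) {
  unsigned Vmcnt, Expcnt, Lgkmcnt;
  llvm::AMDGPU::IsaVersion IV = llvm::AMDGPU::getIsaVersion(ST.getCPU());
  llvm::AMDGPU::decodeWaitcnt(IV, WaitcntImm, Vmcnt, Expcnt, Lgkmcnt);
  return Vmcnt;
}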
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
IsaVersion getIsaVersion(StringRef GPU)
bool getMAIIsGFX940XDL(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool getMAIIsDGEMM(unsigned Opc)
Returns true if MAI operation is a double precision GEMM.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< po_iterator< T > > post_order(const T &G)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
Instruction set architecture version.
Definition: TargetParser.h:130
Represents the counter values to wait for in an s_waitcnt instruction.
Description of the encoding of one expression Op.
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:121
uint16_t ReleaseAtCycle
Cycle at which the resource will be released by an instruction, relative to the cycle in which the instruction is issued.
Definition: MCSchedule.h:71
Definition: regcomp.c:192