LLVM 18.0.0git
GCNHazardRecognizer.cpp
Go to the documentation of this file.
1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
20
21using namespace llvm;
22
23namespace {
24
25struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
26 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
27
28 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
29 if (Arg.getAsInteger(0, Value))
30 return O.error("'" + Arg + "' value invalid for uint argument!");
31
32 if (Value > 100)
33 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
34
35 return false;
36 }
37};
38
39} // end anonymous namespace
40
42 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
43 cl::desc("Fill a percentage of the latency between "
44 "neighboring MFMA with s_nops."));
45
46//===----------------------------------------------------------------------===//
47// Hazard Recognizer Implementation
48//===----------------------------------------------------------------------===//
49
51 const GCNSubtarget &ST);
52
54 IsHazardRecognizerMode(false),
55 CurrCycleInstr(nullptr),
56 MF(MF),
57 ST(MF.getSubtarget<GCNSubtarget>()),
58 TII(*ST.getInstrInfo()),
59 TRI(TII.getRegisterInfo()),
60 ClauseUses(TRI.getNumRegUnits()),
61 ClauseDefs(TRI.getNumRegUnits()) {
62 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
63 TSchedModel.init(&ST);
64 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
65}
66
68 EmittedInstrs.clear();
69}
70
73}
74
76 CurrCycleInstr = MI;
77}
78
79static bool isDivFMas(unsigned Opcode) {
80 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
81}
82
83static bool isSGetReg(unsigned Opcode) {
84 return Opcode == AMDGPU::S_GETREG_B32;
85}
86
87static bool isSSetReg(unsigned Opcode) {
88 switch (Opcode) {
89 case AMDGPU::S_SETREG_B32:
90 case AMDGPU::S_SETREG_B32_mode:
91 case AMDGPU::S_SETREG_IMM32_B32:
92 case AMDGPU::S_SETREG_IMM32_B32_mode:
93 return true;
94 }
95 return false;
96}
97
98static bool isRWLane(unsigned Opcode) {
99 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
100}
101
102static bool isRFE(unsigned Opcode) {
103 return Opcode == AMDGPU::S_RFE_B64;
104}
105
106static bool isSMovRel(unsigned Opcode) {
107 switch (Opcode) {
108 case AMDGPU::S_MOVRELS_B32:
109 case AMDGPU::S_MOVRELS_B64:
110 case AMDGPU::S_MOVRELD_B32:
111 case AMDGPU::S_MOVRELD_B64:
112 return true;
113 default:
114 return false;
115 }
116}
117
118static bool isDGEMM(unsigned Opcode) {
119 return AMDGPU::getMAIIsDGEMM(Opcode);
120}
121
122static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
123 unsigned Opcode = MI.getOpcode();
124
125 if (!SIInstrInfo::isMAI(MI) ||
126 isDGEMM(Opcode) ||
127 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
128 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
129 return false;
130
131 if (!ST.hasGFX940Insts())
132 return true;
133
134 return AMDGPU::getMAIIsGFX940XDL(Opcode);
135}
136
138 const MachineInstr &MI) {
139 if (TII.isAlwaysGDS(MI.getOpcode()))
140 return true;
141
142 switch (MI.getOpcode()) {
143 case AMDGPU::S_SENDMSG:
144 case AMDGPU::S_SENDMSGHALT:
145 case AMDGPU::S_TTRACEDATA:
146 return true;
147 // These DS opcodes don't support GDS.
148 case AMDGPU::DS_NOP:
149 case AMDGPU::DS_PERMUTE_B32:
150 case AMDGPU::DS_BPERMUTE_B32:
151 return false;
152 default:
153 if (TII.isDS(MI.getOpcode())) {
154 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
155 AMDGPU::OpName::gds);
156 if (MI.getOperand(GDS).getImm())
157 return true;
158 }
159 return false;
160 }
161}
162
163static bool isPermlane(const MachineInstr &MI) {
164 unsigned Opcode = MI.getOpcode();
165 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
167}
168
169static bool isLdsDma(const MachineInstr &MI) {
170 return SIInstrInfo::isVALU(MI) &&
172}
173
174static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
175 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
176 AMDGPU::OpName::simm16);
177 return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
178}
179
182 MachineInstr *MI = SU->getInstr();
183 // If we are not in "HazardRecognizerMode" and therefore not being run from
184 // the scheduler, track possible stalls from hazards but don't insert noops.
185 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
186
187 if (MI->isBundle())
188 return NoHazard;
189
190 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
191 return HazardType;
192
193 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
194 return HazardType;
195
196 if (checkFPAtomicToDenormModeHazard(MI) > 0)
197 return HazardType;
198
199 if (ST.hasNoDataDepHazard())
200 return NoHazard;
201
202 // FIXME: Should flat be considered vmem?
203 if ((SIInstrInfo::isVMEM(*MI) ||
205 && checkVMEMHazards(MI) > 0)
206 return HazardType;
207
208 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
209 return HazardType;
210
211 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
212 return HazardType;
213
214 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
215 return HazardType;
216
217 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
218 return HazardType;
219
222 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
223 return HazardType;
224
225 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
226 return HazardType;
227
228 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
229 return HazardType;
230
231 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
232 return HazardType;
233
234 if (((ST.hasReadM0MovRelInterpHazard() &&
235 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
236 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
237 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
239 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
241 MI->readsRegister(AMDGPU::LDS_DIRECT))) &&
242 checkReadM0Hazards(MI) > 0)
243 return HazardType;
244
245 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
246 return HazardType;
247
248 if ((SIInstrInfo::isVMEM(*MI) ||
250 SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
251 return HazardType;
252
253 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
254 return HazardType;
255
256 return NoHazard;
257}
258
260 unsigned Quantity) {
261 while (Quantity > 0) {
262 unsigned Arg = std::min(Quantity, 8u);
263 Quantity -= Arg;
264 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
265 .addImm(Arg - 1);
266 }
267}
268
269unsigned
270GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
271 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
272 assert(TSchedModel.getWriteProcResBegin(SC) !=
273 TSchedModel.getWriteProcResEnd(SC));
274 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
275}
276
277void GCNHazardRecognizer::processBundle() {
278 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
280 // Check bundled MachineInstr's for hazards.
281 for (; MI != E && MI->isInsideBundle(); ++MI) {
282 CurrCycleInstr = &*MI;
283 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
284
285 if (IsHazardRecognizerMode) {
286 fixHazards(CurrCycleInstr);
287
288 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
289 }
290
291 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
292 // include the bundled MI directly after, only add a maximum of
293 // (MaxLookAhead - 1) noops to EmittedInstrs.
294 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
295 EmittedInstrs.push_front(nullptr);
296
297 EmittedInstrs.push_front(CurrCycleInstr);
298 EmittedInstrs.resize(MaxLookAhead);
299 }
300 CurrCycleInstr = nullptr;
301}
302
303void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
304 assert(IsHazardRecognizerMode);
305
306 unsigned NumPreNoops = PreEmitNoops(MI);
307 EmitNoops(NumPreNoops);
308 if (MI->isInsideBundle())
309 insertNoopsInBundle(MI, TII, NumPreNoops);
310 else
311 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
312 NumPreNoops);
314 AdvanceCycle();
315}
316
318 IsHazardRecognizerMode = true;
319 CurrCycleInstr = MI;
320 unsigned W = PreEmitNoopsCommon(MI);
321 fixHazards(MI);
322 CurrCycleInstr = nullptr;
323 return W;
324}
325
327 if (MI->isBundle())
328 return 0;
329
330 int WaitStates = 0;
331
333 return std::max(WaitStates, checkSMRDHazards(MI));
334
335 if (ST.hasNSAtoVMEMBug())
336 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
337
338 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
339
340 if (ST.hasNoDataDepHazard())
341 return WaitStates;
342
344 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
345
347 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
348
350 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
351
352 if (isDivFMas(MI->getOpcode()))
353 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
354
355 if (isRWLane(MI->getOpcode()))
356 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
357
360 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
361 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
362
363 if (MI->isInlineAsm())
364 return std::max(WaitStates, checkInlineAsmHazards(MI));
365
366 if (isSGetReg(MI->getOpcode()))
367 return std::max(WaitStates, checkGetRegHazards(MI));
368
369 if (isSSetReg(MI->getOpcode()))
370 return std::max(WaitStates, checkSetRegHazards(MI));
371
372 if (isRFE(MI->getOpcode()))
373 return std::max(WaitStates, checkRFEHazards(MI));
374
375 if ((ST.hasReadM0MovRelInterpHazard() &&
376 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
377 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
378 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
380 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
381 (ST.hasReadM0LdsDirectHazard() && MI->readsRegister(AMDGPU::LDS_DIRECT)))
382 return std::max(WaitStates, checkReadM0Hazards(MI));
383
385 return std::max(WaitStates, checkMAIHazards(MI));
386
387 if (SIInstrInfo::isVMEM(*MI) ||
390 return std::max(WaitStates, checkMAILdStHazards(MI));
391
392 return WaitStates;
393}
394
396 EmittedInstrs.push_front(nullptr);
397}
398
400 // When the scheduler detects a stall, it will call AdvanceCycle() without
401 // emitting any instructions.
402 if (!CurrCycleInstr) {
403 EmittedInstrs.push_front(nullptr);
404 return;
405 }
406
407 if (CurrCycleInstr->isBundle()) {
408 processBundle();
409 return;
410 }
411
412 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
413 if (!NumWaitStates) {
414 CurrCycleInstr = nullptr;
415 return;
416 }
417
418 // Keep track of emitted instructions
419 EmittedInstrs.push_front(CurrCycleInstr);
420
421 // Add a nullptr for each additional wait state after the first. Make sure
422 // not to add more than getMaxLookAhead() items to the list, since we
423 // truncate the list to that size right after this loop.
424 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
425 i < e; ++i) {
426 EmittedInstrs.push_front(nullptr);
427 }
428
429 // getMaxLookahead() is the largest number of wait states we will ever need
430 // to insert, so there is no point in keeping track of more than that many
431 // wait states.
432 EmittedInstrs.resize(getMaxLookAhead());
433
434 CurrCycleInstr = nullptr;
435}
436
438 llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
439}
440
441//===----------------------------------------------------------------------===//
442// Helper Functions
443//===----------------------------------------------------------------------===//
444
446
447typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
448typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
449
450// Search for a hazard in a block and its predecessors.
451template <typename StateT>
452static bool
453hasHazard(StateT State,
454 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
455 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
456 const MachineBasicBlock *MBB,
459 for (auto E = MBB->instr_rend(); I != E; ++I) {
460 // No need to look at parent BUNDLE instructions.
461 if (I->isBundle())
462 continue;
463
464 switch (IsHazard(State, *I)) {
465 case HazardFound:
466 return true;
467 case HazardExpired:
468 return false;
469 default:
470 // Continue search
471 break;
472 }
473
474 if (I->isInlineAsm() || I->isMetaInstruction())
475 continue;
476
477 UpdateState(State, *I);
478 }
479
480 for (MachineBasicBlock *Pred : MBB->predecessors()) {
481 if (!Visited.insert(Pred).second)
482 continue;
483
484 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
485 Visited))
486 return true;
487 }
488
489 return false;
490}
491
492// Returns a minimum wait states since \p I walking all predecessors.
493// Only scans until \p IsExpired does not return true.
494// Can only be run in a hazard recognizer mode.
500 for (auto E = MBB->instr_rend(); I != E; ++I) {
501 // Don't add WaitStates for parent BUNDLE instructions.
502 if (I->isBundle())
503 continue;
504
505 if (IsHazard(*I))
506 return WaitStates;
507
508 if (I->isInlineAsm())
509 continue;
510
511 WaitStates += GetNumWaitStates(*I);
512
513 if (IsExpired(*I, WaitStates))
514 return std::numeric_limits<int>::max();
515 }
516
517 int MinWaitStates = std::numeric_limits<int>::max();
518 for (MachineBasicBlock *Pred : MBB->predecessors()) {
519 if (!Visited.insert(Pred).second)
520 continue;
521
522 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
523 IsExpired, Visited, GetNumWaitStates);
524
525 MinWaitStates = std::min(MinWaitStates, W);
526 }
527
528 return MinWaitStates;
529}
530
532 const MachineInstr *MI, IsExpiredFn IsExpired) {
534 return getWaitStatesSince(IsHazard, MI->getParent(),
535 std::next(MI->getReverseIterator()),
536 0, IsExpired, Visited);
537}
538
539int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
540 if (IsHazardRecognizerMode) {
541 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
542 return WaitStates >= Limit;
543 };
544 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
545 }
546
547 int WaitStates = 0;
548 for (MachineInstr *MI : EmittedInstrs) {
549 if (MI) {
550 if (IsHazard(*MI))
551 return WaitStates;
552
553 if (MI->isInlineAsm())
554 continue;
555 }
556 ++WaitStates;
557
558 if (WaitStates >= Limit)
559 break;
560 }
561 return std::numeric_limits<int>::max();
562}
563
564int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
565 IsHazardFn IsHazardDef,
566 int Limit) {
567 const SIRegisterInfo *TRI = ST.getRegisterInfo();
568
569 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
570 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
571 };
572
573 return getWaitStatesSince(IsHazardFn, Limit);
574}
575
576int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
577 int Limit) {
578 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
579 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
580 };
581
582 return getWaitStatesSince(IsHazardFn, Limit);
583}
584
585//===----------------------------------------------------------------------===//
586// No-op Hazard Detection
587//===----------------------------------------------------------------------===//
588
589static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
590 MCRegister Reg) {
591 for (MCRegUnit Unit : TRI.regunits(Reg))
592 BV.set(Unit);
593}
594
595static void addRegsToSet(const SIRegisterInfo &TRI,
597 BitVector &DefSet, BitVector &UseSet) {
598 for (const MachineOperand &Op : Ops) {
599 if (Op.isReg())
600 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
601 }
602}
603
604void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
605 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
606}
607
609 return !SIInstrInfo::isSMRD(*MI);
610}
611
614}
615
616int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
617 // SMEM soft clause are only present on VI+, and only matter if xnack is
618 // enabled.
619 if (!ST.isXNACKEnabled())
620 return 0;
621
622 bool IsSMRD = TII.isSMRD(*MEM);
623
624 resetClause();
625
626 // A soft-clause is any group of consecutive SMEM instructions. The
627 // instructions in this group may return out of order and/or may be
628 // replayed (i.e. the same instruction issued more than once).
629 //
630 // In order to handle these situations correctly we need to make sure that
631 // when a clause has more than one instruction, no instruction in the clause
632 // writes to a register that is read by another instruction in the clause
633 // (including itself). If we encounter this situation, we need to break the
634 // clause by inserting a non SMEM instruction.
635
636 for (MachineInstr *MI : EmittedInstrs) {
637 // When we hit a non-SMEM instruction then we have passed the start of the
638 // clause and we can stop.
639 if (!MI)
640 break;
641
643 break;
644
645 addClauseInst(*MI);
646 }
647
648 if (ClauseDefs.none())
649 return 0;
650
651 // We need to make sure not to put loads and stores in the same clause if they
652 // use the same address. For now, just start a new clause whenever we see a
653 // store.
654 if (MEM->mayStore())
655 return 1;
656
657 addClauseInst(*MEM);
658
659 // If the set of defs and uses intersect then we cannot add this instruction
660 // to the clause, so we have a hazard.
661 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
662}
663
664int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
665 int WaitStatesNeeded = 0;
666
667 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
668
669 // This SMRD hazard only affects SI.
670 if (!ST.hasSMRDReadVALUDefHazard())
671 return WaitStatesNeeded;
672
673 // A read of an SGPR by SMRD instruction requires 4 wait states when the
674 // SGPR was written by a VALU instruction.
675 int SmrdSgprWaitStates = 4;
676 auto IsHazardDefFn = [this](const MachineInstr &MI) {
677 return TII.isVALU(MI);
678 };
679 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
680 return TII.isSALU(MI);
681 };
682
683 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
684
685 for (const MachineOperand &Use : SMRD->uses()) {
686 if (!Use.isReg())
687 continue;
688 int WaitStatesNeededForUse =
689 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
690 SmrdSgprWaitStates);
691 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
692
693 // This fixes what appears to be undocumented hardware behavior in SI where
694 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
695 // needs some number of nops in between. We don't know how many we need, but
696 // let's use 4. This wasn't discovered before probably because the only
697 // case when this happens is when we expand a 64-bit pointer into a full
698 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
699 // probably never encountered in the closed-source land.
700 if (IsBufferSMRD) {
701 int WaitStatesNeededForUse =
702 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
703 IsBufferHazardDefFn,
704 SmrdSgprWaitStates);
705 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
706 }
707 }
708
709 return WaitStatesNeeded;
710}
711
712int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
714 return 0;
715
716 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
717
718 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
719 // SGPR was written by a VALU Instruction.
720 const int VmemSgprWaitStates = 5;
721 auto IsHazardDefFn = [this](const MachineInstr &MI) {
722 return TII.isVALU(MI);
723 };
724 for (const MachineOperand &Use : VMEM->uses()) {
725 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
726 continue;
727
728 int WaitStatesNeededForUse =
729 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
730 VmemSgprWaitStates);
731 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
732 }
733 return WaitStatesNeeded;
734}
735
736int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
737 const SIRegisterInfo *TRI = ST.getRegisterInfo();
738 const SIInstrInfo *TII = ST.getInstrInfo();
739
740 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
741 int DppVgprWaitStates = 2;
742 int DppExecWaitStates = 5;
743 int WaitStatesNeeded = 0;
744 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
745 return TII->isVALU(MI);
746 };
747
748 for (const MachineOperand &Use : DPP->uses()) {
749 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
750 continue;
751 int WaitStatesNeededForUse =
752 DppVgprWaitStates - getWaitStatesSinceDef(
753 Use.getReg(),
754 [](const MachineInstr &) { return true; },
755 DppVgprWaitStates);
756 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
757 }
758
759 WaitStatesNeeded = std::max(
760 WaitStatesNeeded,
761 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
762 DppExecWaitStates));
763
764 return WaitStatesNeeded;
765}
766
767int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
768 const SIInstrInfo *TII = ST.getInstrInfo();
769
770 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
771 // instruction.
772 const int DivFMasWaitStates = 4;
773 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
774 return TII->isVALU(MI);
775 };
776 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
777 DivFMasWaitStates);
778
779 return DivFMasWaitStates - WaitStatesNeeded;
780}
781
782int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
783 const SIInstrInfo *TII = ST.getInstrInfo();
784 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
785
786 const int GetRegWaitStates = 2;
787 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
788 return GetRegHWReg == getHWReg(TII, MI);
789 };
790 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
791
792 return GetRegWaitStates - WaitStatesNeeded;
793}
794
795int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
796 const SIInstrInfo *TII = ST.getInstrInfo();
797 unsigned HWReg = getHWReg(TII, *SetRegInstr);
798
799 const int SetRegWaitStates = ST.getSetRegWaitStates();
800 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
801 return HWReg == getHWReg(TII, MI);
802 };
803 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
804 return SetRegWaitStates - WaitStatesNeeded;
805}
806
807int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
808 if (!MI.mayStore())
809 return -1;
810
811 const SIInstrInfo *TII = ST.getInstrInfo();
812 unsigned Opcode = MI.getOpcode();
813 const MCInstrDesc &Desc = MI.getDesc();
814
815 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
816 int VDataRCID = -1;
817 if (VDataIdx != -1)
818 VDataRCID = Desc.operands()[VDataIdx].RegClass;
819
820 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
821 // There is no hazard if the instruction does not use vector regs
822 // (like wbinvl1)
823 if (VDataIdx == -1)
824 return -1;
825 // For MUBUF/MTBUF instructions this hazard only exists if the
826 // instruction is not using a register in the soffset field.
827 const MachineOperand *SOffset =
828 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
829 // If we have no soffset operand, then assume this field has been
830 // hardcoded to zero.
831 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
832 (!SOffset || !SOffset->isReg()))
833 return VDataIdx;
834 }
835
836 // MIMG instructions create a hazard if they don't use a 256-bit T# and
837 // the store size is greater than 8 bytes and they have more than two bits
838 // of their dmask set.
839 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
840 if (TII->isMIMG(MI)) {
841 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
842 assert(SRsrcIdx != -1 &&
843 AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
844 (void)SRsrcIdx;
845 }
846
847 if (TII->isFLAT(MI)) {
848 int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
849 if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
850 return DataIdx;
851 }
852
853 return -1;
854}
855
856int
857GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
858 const MachineRegisterInfo &MRI) {
859 // Helper to check for the hazard where VMEM instructions that store more than
860 // 8 bytes can have there store data over written by the next instruction.
861 const SIRegisterInfo *TRI = ST.getRegisterInfo();
862
863 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
864 int WaitStatesNeeded = 0;
865
866 if (!TRI->isVectorRegister(MRI, Def.getReg()))
867 return WaitStatesNeeded;
868 Register Reg = Def.getReg();
869 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
870 int DataIdx = createsVALUHazard(MI);
871 return DataIdx >= 0 &&
872 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
873 };
874 int WaitStatesNeededForDef =
875 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
876 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
877
878 return WaitStatesNeeded;
879}
880
881int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
882 int WaitStatesNeeded = 0;
883
885 const int TransDefWaitstates = 1;
886
887 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
889 return false;
890 const SIRegisterInfo *TRI = ST.getRegisterInfo();
891 const SIInstrInfo *TII = ST.getInstrInfo();
892 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
893
894 for (const MachineOperand &Use : VALU->explicit_uses()) {
895 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
896 return true;
897 }
898
899 return false;
900 };
901
902 int WaitStatesNeededForDef =
903 TransDefWaitstates -
904 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
905 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
906 }
907
908 if (ST.hasDstSelForwardingHazard()) {
909 const int Shift16DefWaitstates = 1;
910
911 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
913 return false;
914 const SIInstrInfo *TII = ST.getInstrInfo();
915 if (SIInstrInfo::isSDWA(MI)) {
916 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
917 if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
918 return false;
919 } else {
920 if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
921 !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
922 ->getImm() &
924 return false;
925 }
926 const SIRegisterInfo *TRI = ST.getRegisterInfo();
927 if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
928 Register Def = Dst->getReg();
929
930 for (const MachineOperand &Use : VALU->explicit_uses()) {
931 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
932 return true;
933 }
934 }
935
936 return false;
937 };
938
939 int WaitStatesNeededForDef =
940 Shift16DefWaitstates -
941 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
942 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
943 }
944
945 if (ST.hasVDecCoExecHazard()) {
946 const int VALUWriteSGPRVALUReadWaitstates = 2;
947 const int VALUWriteEXECRWLane = 4;
948 const int VALUWriteVGPRReadlaneRead = 1;
949
950 const SIRegisterInfo *TRI = ST.getRegisterInfo();
951 const MachineRegisterInfo &MRI = MF.getRegInfo();
953 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
955 return false;
956 return MI.modifiesRegister(UseReg, TRI);
957 };
958
959 for (const MachineOperand &Use : VALU->explicit_uses()) {
960 if (!Use.isReg())
961 continue;
962
963 UseReg = Use.getReg();
964 if (TRI->isSGPRReg(MRI, UseReg)) {
965 int WaitStatesNeededForDef =
966 VALUWriteSGPRVALUReadWaitstates -
967 getWaitStatesSince(IsVALUDefSGPRFn,
968 VALUWriteSGPRVALUReadWaitstates);
969 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
970 }
971 }
972
973 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
974 UseReg = AMDGPU::VCC;
975 int WaitStatesNeededForDef =
976 VALUWriteSGPRVALUReadWaitstates -
977 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
978 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
979 }
980
981 switch (VALU->getOpcode()) {
982 case AMDGPU::V_READLANE_B32:
983 case AMDGPU::V_READFIRSTLANE_B32: {
984 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
985 UseReg = Src->getReg();
986 int WaitStatesNeededForDef =
987 VALUWriteVGPRReadlaneRead -
988 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
989 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
990 }
991 [[fallthrough]];
992 case AMDGPU::V_WRITELANE_B32: {
993 UseReg = AMDGPU::EXEC;
994 int WaitStatesNeededForDef =
995 VALUWriteEXECRWLane -
996 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
997 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
998 break;
999 }
1000 default:
1001 break;
1002 }
1003 }
1004
1005 // This checks for the hazard where VMEM instructions that store more than
1006 // 8 bytes can have there store data over written by the next instruction.
1007 if (!ST.has12DWordStoreHazard())
1008 return WaitStatesNeeded;
1009
1010 const MachineRegisterInfo &MRI = MF.getRegInfo();
1011
1012 for (const MachineOperand &Def : VALU->defs()) {
1013 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1014 }
1015
1016 return WaitStatesNeeded;
1017}
1018
1019int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1020 // This checks for hazards associated with inline asm statements.
1021 // Since inline asms can contain just about anything, we use this
1022 // to call/leverage other check*Hazard routines. Note that
1023 // this function doesn't attempt to address all possible inline asm
1024 // hazards (good luck), but is a collection of what has been
1025 // problematic thus far.
1026
1027 // see checkVALUHazards()
1028 if (!ST.has12DWordStoreHazard())
1029 return 0;
1030
1031 const MachineRegisterInfo &MRI = MF.getRegInfo();
1032 int WaitStatesNeeded = 0;
1033
1034 for (const MachineOperand &Op :
1036 if (Op.isReg() && Op.isDef()) {
1037 WaitStatesNeeded =
1038 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1039 }
1040 }
1041
1042 return WaitStatesNeeded;
1043}
1044
1045int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1046 const SIInstrInfo *TII = ST.getInstrInfo();
1047 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1048 const MachineRegisterInfo &MRI = MF.getRegInfo();
1049
1050 const MachineOperand *LaneSelectOp =
1051 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1052
1053 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1054 return 0;
1055
1056 Register LaneSelectReg = LaneSelectOp->getReg();
1057 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1058
1059 const int RWLaneWaitStates = 4;
1060 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1061 RWLaneWaitStates);
1062 return RWLaneWaitStates - WaitStatesSince;
1063}
1064
1065int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1066 if (!ST.hasRFEHazards())
1067 return 0;
1068
1069 const SIInstrInfo *TII = ST.getInstrInfo();
1070
1071 const int RFEWaitStates = 1;
1072
1073 auto IsHazardFn = [TII](const MachineInstr &MI) {
1074 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1075 };
1076 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1077 return RFEWaitStates - WaitStatesNeeded;
1078}
1079
1080int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1081 const SIInstrInfo *TII = ST.getInstrInfo();
1082 const int ReadM0WaitStates = 1;
1083 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1084 return ReadM0WaitStates -
1085 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1086}
1087
1088void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1089 fixVMEMtoScalarWriteHazards(MI);
1090 fixVcmpxPermlaneHazards(MI);
1091 fixSMEMtoVectorWriteHazards(MI);
1092 fixVcmpxExecWARHazard(MI);
1093 fixLdsBranchVmemWARHazard(MI);
1094 if (ST.hasLdsDirect()) {
1095 fixLdsDirectVALUHazard(MI);
1096 fixLdsDirectVMEMHazard(MI);
1097 }
1098 fixVALUPartialForwardingHazard(MI);
1099 fixVALUTransUseHazard(MI);
1100 fixWMMAHazards(MI);
1101 fixShift64HighRegBug(MI);
1102 fixVALUMaskWriteHazard(MI);
1103}
1104
1105bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1106 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1107 return false;
1108
1109 const SIInstrInfo *TII = ST.getInstrInfo();
1110 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1111 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1112 return (TII->isVOPC(MI) ||
1113 ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
1114 MI.modifiesRegister(AMDGPU::EXEC, TRI);
1115 };
1116
1117 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1118 unsigned Opc = MI.getOpcode();
1119 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1120 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1121 };
1122
1123 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1124 std::numeric_limits<int>::max())
1125 return false;
1126
1127 // V_NOP will be discarded by SQ.
1128 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1129 // which is always a VGPR and available.
1130 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1131 Register Reg = Src0->getReg();
1132 bool IsUndef = Src0->isUndef();
1133 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1134 TII->get(AMDGPU::V_MOV_B32_e32))
1135 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1136 .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1137
1138 return true;
1139}
1140
1141bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1143 return false;
1144
1146 return false;
1147
1148 if (MI->getNumDefs() == 0)
1149 return false;
1150
1151 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1152
1153 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1156 return false;
1157
1158 for (const MachineOperand &Def : MI->defs()) {
1159 const MachineOperand *Op =
1160 I.findRegisterUseOperand(Def.getReg(), false, TRI);
1161 if (!Op)
1162 continue;
1163 return true;
1164 }
1165 return false;
1166 };
1167
1168 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1169 return SIInstrInfo::isVALU(MI) ||
1170 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1171 !MI.getOperand(0).getImm()) ||
1172 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1173 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1174 };
1175
1176 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1177 std::numeric_limits<int>::max())
1178 return false;
1179
1180 const SIInstrInfo *TII = ST.getInstrInfo();
1181 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1182 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1184 return true;
1185}
1186
1187bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1189 return false;
1190
1191 if (!SIInstrInfo::isVALU(*MI))
1192 return false;
1193
1194 unsigned SDSTName;
1195 switch (MI->getOpcode()) {
1196 case AMDGPU::V_READLANE_B32:
1197 case AMDGPU::V_READFIRSTLANE_B32:
1198 SDSTName = AMDGPU::OpName::vdst;
1199 break;
1200 default:
1201 SDSTName = AMDGPU::OpName::sdst;
1202 break;
1203 }
1204
1205 const SIInstrInfo *TII = ST.getInstrInfo();
1206 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1207 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1208 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1209 if (!SDST) {
1210 for (const auto &MO : MI->implicit_operands()) {
1211 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1212 SDST = &MO;
1213 break;
1214 }
1215 }
1216 }
1217
1218 if (!SDST)
1219 return false;
1220
1221 const Register SDSTReg = SDST->getReg();
1222 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1223 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1224 };
1225
1226 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1227 if (TII->isSALU(MI)) {
1228 switch (MI.getOpcode()) {
1229 case AMDGPU::S_SETVSKIP:
1230 case AMDGPU::S_VERSION:
1231 case AMDGPU::S_WAITCNT_VSCNT:
1232 case AMDGPU::S_WAITCNT_VMCNT:
1233 case AMDGPU::S_WAITCNT_EXPCNT:
1234 // These instructions cannot not mitigate the hazard.
1235 return false;
1236 case AMDGPU::S_WAITCNT_LGKMCNT:
1237 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1238 return (MI.getOperand(1).getImm() == 0) &&
1239 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1240 case AMDGPU::S_WAITCNT: {
1241 const int64_t Imm = MI.getOperand(0).getImm();
1243 return (Decoded.LgkmCnt == 0);
1244 }
1245 default:
1246 // SOPP instructions cannot mitigate the hazard.
1247 if (TII->isSOPP(MI))
1248 return false;
1249 // At this point the SALU can be assumed to mitigate the hazard
1250 // because either:
1251 // (a) it is independent of the at risk SMEM (breaking chain),
1252 // or
1253 // (b) it is dependent on the SMEM, in which case an appropriate
1254 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1255 // SMEM instruction.
1256 return true;
1257 }
1258 }
1259 return false;
1260 };
1261
1262 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1263 std::numeric_limits<int>::max())
1264 return false;
1265
1266 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1267 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1268 .addImm(0);
1269 return true;
1270}
1271
1272bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1274 return false;
1275
1276 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1277 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1278 return false;
1279
1280 auto IsHazardFn = [TRI](const MachineInstr &I) {
1282 return false;
1283 return I.readsRegister(AMDGPU::EXEC, TRI);
1284 };
1285
1286 const SIInstrInfo *TII = ST.getInstrInfo();
1287 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1288 if (SIInstrInfo::isVALU(MI)) {
1289 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1290 return true;
1291 for (auto MO : MI.implicit_operands())
1292 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1293 return true;
1294 }
1295 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1296 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1297 return true;
1298 return false;
1299 };
1300
1301 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1302 std::numeric_limits<int>::max())
1303 return false;
1304
1305 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1306 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1308 return true;
1309}
1310
1312 const GCNSubtarget &ST) {
1313 if (!ST.hasLdsBranchVmemWARHazard())
1314 return false;
1315
1316 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1317 // instructions need to appear in the same function.
1318 bool HasLds = false;
1319 bool HasVmem = false;
1320 for (auto &MBB : MF) {
1321 for (auto &MI : MBB) {
1322 HasLds |= SIInstrInfo::isDS(MI);
1323 HasVmem |=
1325 if (HasLds && HasVmem)
1326 return true;
1327 }
1328 }
1329 return false;
1330}
1331
1333 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1334 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1335 !I.getOperand(1).getImm();
1336}
1337
1338bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1339 if (!RunLdsBranchVmemWARHazardFixup)
1340 return false;
1341
1343
1344 auto IsHazardInst = [](const MachineInstr &MI) {
1345 if (SIInstrInfo::isDS(MI))
1346 return 1;
1348 return 2;
1349 return 0;
1350 };
1351
1352 auto InstType = IsHazardInst(*MI);
1353 if (!InstType)
1354 return false;
1355
1356 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1357 return IsHazardInst(I) || isStoreCountWaitZero(I);
1358 };
1359
1360 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1361 if (!I.isBranch())
1362 return false;
1363
1364 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1365 auto InstType2 = IsHazardInst(I);
1366 return InstType2 && InstType != InstType2;
1367 };
1368
1369 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1370 auto InstType2 = IsHazardInst(I);
1371 if (InstType == InstType2)
1372 return true;
1373
1374 return isStoreCountWaitZero(I);
1375 };
1376
1377 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1378 std::numeric_limits<int>::max();
1379 };
1380
1381 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1382 std::numeric_limits<int>::max())
1383 return false;
1384
1385 const SIInstrInfo *TII = ST.getInstrInfo();
1386 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1387 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1388 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1389 .addImm(0);
1390
1391 return true;
1392}
1393
1394bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1396 return false;
1397
1398 const int NoHazardWaitStates = 15;
1399 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1400 const Register VDSTReg = VDST->getReg();
1401
1402 bool VisitedTrans = false;
1403 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1404 if (!SIInstrInfo::isVALU(I))
1405 return false;
1406 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1407 // Cover both WAR and WAW
1408 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1409 };
1410 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1411 if (WaitStates >= NoHazardWaitStates)
1412 return true;
1413 // Instructions which cause va_vdst==0 expire hazard
1416 };
1417 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1418 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1419 };
1420
1422 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1423 std::next(MI->getReverseIterator()), 0,
1424 IsExpiredFn, Visited, GetWaitStatesFn);
1425
1426 // Transcendentals can execute in parallel to other VALUs.
1427 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1428 if (VisitedTrans)
1429 Count = 0;
1430
1431 MachineOperand *WaitVdstOp =
1432 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1433 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1434
1435 return true;
1436}
1437
1438bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1440 return false;
1441
1442 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1443 const Register VDSTReg = VDST->getReg();
1444
1445 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1448 return false;
1449 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1450 };
1451 auto IsExpiredFn = [](const MachineInstr &I, int) {
1453 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1454 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1455 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0);
1456 };
1457
1458 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1459 std::numeric_limits<int>::max())
1460 return false;
1461
1462 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1463 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1465
1466 return true;
1467}
1468
1469bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1470 if (!ST.isWave64())
1471 return false;
1473 return false;
1474 if (!SIInstrInfo::isVALU(*MI))
1475 return false;
1476
1478
1479 for (const MachineOperand &Use : MI->explicit_uses()) {
1480 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1481 SrcVGPRs.insert(Use.getReg());
1482 }
1483
1484 // Only applies with >= 2 unique VGPR sources
1485 if (SrcVGPRs.size() <= 1)
1486 return false;
1487
1488 // Look for the following pattern:
1489 // Va <- VALU [PreExecPos]
1490 // intv1
1491 // Exec <- SALU [ExecPos]
1492 // intv2
1493 // Vb <- VALU [PostExecPos]
1494 // intv3
1495 // MI Va, Vb (WaitState = 0)
1496 //
1497 // Where:
1498 // intv1 + intv2 <= 2 VALUs
1499 // intv3 <= 4 VALUs
1500 //
1501 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1502
1503 const int Intv1plus2MaxVALUs = 2;
1504 const int Intv3MaxVALUs = 4;
1505 const int IntvMaxVALUs = 6;
1506 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1507
1508 struct StateType {
1510 int ExecPos = std::numeric_limits<int>::max();
1511 int VALUs = 0;
1512 };
1513
1514 StateType State;
1515
1516 // This overloads expiry testing with all the hazard detection
1517 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1518 // Too many VALU states have passed
1519 if (State.VALUs > NoHazardVALUWaitStates)
1520 return HazardExpired;
1521
1522 // Instructions which cause va_vdst==0 expire hazard
1525 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1526 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1527 return HazardExpired;
1528
1529 // Track registers writes
1530 bool Changed = false;
1531 if (SIInstrInfo::isVALU(I)) {
1532 for (Register Src : SrcVGPRs) {
1533 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1534 State.DefPos[Src] = State.VALUs;
1535 Changed = true;
1536 }
1537 }
1538 } else if (SIInstrInfo::isSALU(I)) {
1539 if (State.ExecPos == std::numeric_limits<int>::max()) {
1540 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1541 State.ExecPos = State.VALUs;
1542 Changed = true;
1543 }
1544 }
1545 }
1546
1547 // Early expiration: too many VALUs in intv3
1548 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1549 return HazardExpired;
1550
1551 // Only evaluate state if something changed
1552 if (!Changed)
1553 return NoHazardFound;
1554
1555 // Determine positions of VALUs pre/post exec change
1556 if (State.ExecPos == std::numeric_limits<int>::max())
1557 return NoHazardFound;
1558
1559 int PreExecPos = std::numeric_limits<int>::max();
1560 int PostExecPos = std::numeric_limits<int>::max();
1561
1562 for (auto Entry : State.DefPos) {
1563 int DefVALUs = Entry.second;
1564 if (DefVALUs != std::numeric_limits<int>::max()) {
1565 if (DefVALUs >= State.ExecPos)
1566 PreExecPos = std::min(PreExecPos, DefVALUs);
1567 else if (DefVALUs < State.ExecPos)
1568 PostExecPos = std::min(PostExecPos, DefVALUs);
1569 }
1570 }
1571
1572 // Need a VALUs post exec change
1573 if (PostExecPos == std::numeric_limits<int>::max())
1574 return NoHazardFound;
1575
1576 // Too many VALUs in intv3?
1577 int Intv3VALUs = PostExecPos;
1578 if (Intv3VALUs > Intv3MaxVALUs)
1579 return HazardExpired;
1580
1581 // Too many VALUs in intv2?
1582 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1583 if (Intv2VALUs > Intv1plus2MaxVALUs)
1584 return HazardExpired;
1585
1586 // Need a VALUs pre exec change
1587 if (PreExecPos == std::numeric_limits<int>::max())
1588 return NoHazardFound;
1589
1590 // Too many VALUs in intv1?
1591 int Intv1VALUs = PreExecPos - State.ExecPos;
1592 if (Intv1VALUs > Intv1plus2MaxVALUs)
1593 return HazardExpired;
1594
1595 // Too many VALUs in intv1 + intv2
1596 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1597 return HazardExpired;
1598
1599 return HazardFound;
1600 };
1601 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1603 State.VALUs += 1;
1604 };
1605
1607 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1608 std::next(MI->getReverseIterator()), Visited))
1609 return false;
1610
1611 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1612 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1613 .addImm(0x0fff);
1614
1615 return true;
1616}
1617
1618bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1619 if (!ST.hasVALUTransUseHazard())
1620 return false;
1621 if (!SIInstrInfo::isVALU(*MI))
1622 return false;
1623
1624 SmallSet<Register, 4> SrcVGPRs;
1625
1626 for (const MachineOperand &Use : MI->explicit_uses()) {
1627 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1628 SrcVGPRs.insert(Use.getReg());
1629 }
1630
1631 // Look for the following pattern:
1632 // Va <- TRANS VALU
1633 // intv
1634 // MI Va (WaitState = 0)
1635 //
1636 // Where:
1637 // intv <= 5 VALUs / 1 TRANS
1638 //
1639 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1640
1641 const int IntvMaxVALUs = 5;
1642 const int IntvMaxTRANS = 1;
1643
1644 struct StateType {
1645 int VALUs = 0;
1646 int TRANS = 0;
1647 };
1648
1649 StateType State;
1650
1651 // This overloads expiry testing with all the hazard detection
1652 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1653 // Too many VALU states have passed
1654 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1655 return HazardExpired;
1656
1657 // Instructions which cause va_vdst==0 expire hazard
1660 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1661 I.getOperand(0).getImm() == 0x0fff))
1662 return HazardExpired;
1663
1664 // Track registers writes
1665 if (SIInstrInfo::isTRANS(I)) {
1666 for (Register Src : SrcVGPRs) {
1667 if (I.modifiesRegister(Src, &TRI)) {
1668 return HazardFound;
1669 }
1670 }
1671 }
1672
1673 return NoHazardFound;
1674 };
1675 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1677 State.VALUs += 1;
1679 State.TRANS += 1;
1680 };
1681
1683 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1684 std::next(MI->getReverseIterator()), Visited))
1685 return false;
1686
1687 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1688 // avoided.
1689 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1690 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1692
1693 return true;
1694}
1695
1696bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1697 if (!SIInstrInfo::isWMMA(*MI))
1698 return false;
1699
1700 const SIInstrInfo *TII = ST.getInstrInfo();
1701 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1702
1703 auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1704 if (!SIInstrInfo::isWMMA(I))
1705 return false;
1706
1707 // Src0 or Src1 of the current wmma instruction overlaps with the dest of
1708 // the previous wmma.
1709 const Register CurSrc0Reg =
1710 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1711 const Register CurSrc1Reg =
1712 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1713
1714 const Register PrevDstReg =
1715 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1716
1717 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1718 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1719 return true;
1720 }
1721
1722 // Src2 of the current wmma instruction overlaps with the dest of the
1723 // previous wmma.
1724 const MachineOperand *Src2 =
1725 TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
1726 const Register CurSrc2Reg = Src2->isReg() ? Src2->getReg() : Register();
1727
1728 if (CurSrc2Reg != AMDGPU::NoRegister &&
1729 TRI->regsOverlap(PrevDstReg, CurSrc2Reg)) {
1730
1731 const MachineOperand *Src2Mods =
1732 TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers);
1733 const bool NoSrc2Mods =
1734 (Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0;
1735 // Exception: there is no hazard if the wmma instructions are of the same
1736 // type and there is no input modifier on src2 of the current instruction.
1737 return !(NoSrc2Mods && (TII->pseudoToMCOpcode(I.getOpcode()) ==
1738 TII->pseudoToMCOpcode(MI->getOpcode())));
1739 }
1740
1741 return false;
1742 };
1743
1744 auto IsExpiredFn = [](const MachineInstr &I, int) {
1745 return SIInstrInfo::isVALU(I);
1746 };
1747
1748 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1749 std::numeric_limits<int>::max())
1750 return false;
1751
1752 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1753
1754 return true;
1755}
1756
1757bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1758 if (!ST.hasShift64HighRegBug())
1759 return false;
1760
1761 switch (MI->getOpcode()) {
1762 default:
1763 return false;
1764 case AMDGPU::V_LSHLREV_B64_e64:
1765 case AMDGPU::V_LSHRREV_B64_e64:
1766 case AMDGPU::V_ASHRREV_I64_e64:
1767 break;
1768 }
1769
1770 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1771 if (!Amt->isReg())
1772 return false;
1773
1774 Register AmtReg = Amt->getReg();
1775 const MachineRegisterInfo &MRI = MF.getRegInfo();
1776 // Check if this is a last VGPR in the allocation block.
1777 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1778 return false;
1779
1780 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1781 return false;
1782
1783 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1784 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1785 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1786 bool Overlapped = OverlappedSrc || OverlappedDst;
1787
1788 assert(!OverlappedDst || !OverlappedSrc ||
1789 Src1->getReg() == MI->getOperand(0).getReg());
1791 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1792
1793 Register NewReg;
1794 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1795 : AMDGPU::VGPR_32RegClass) {
1796 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1797 NewReg = Reg;
1798 break;
1799 }
1800 }
1801
1802 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1803 : NewReg;
1804 Register NewAmtLo;
1805
1806 if (Overlapped)
1807 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1808
1809 DebugLoc DL = MI->getDebugLoc();
1810 MachineBasicBlock *MBB = MI->getParent();
1811 // Insert a full wait count because found register might be pending a wait.
1812 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1813 .addImm(0);
1814
1815 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1816 if (Overlapped)
1817 runOnInstruction(
1818 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1819 .addDef(AmtReg - 1)
1820 .addReg(AmtReg - 1, RegState::Undef)
1821 .addReg(NewAmtLo, RegState::Undef));
1822 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1823 .addDef(AmtReg)
1824 .addReg(AmtReg, RegState::Undef)
1825 .addReg(NewAmt, RegState::Undef));
1826
1827 // Instructions emitted after the current instruction will be processed by the
1828 // parent loop of the hazard recognizer in a natural way.
1829 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1830 AmtReg)
1831 .addDef(NewAmt)
1832 .addReg(NewAmt)
1833 .addReg(AmtReg);
1834 if (Overlapped)
1835 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1836 AmtReg - 1)
1837 .addDef(NewAmtLo)
1838 .addReg(NewAmtLo)
1839 .addReg(AmtReg - 1);
1840
1841 // Re-running hazard recognizer on the modified instruction is not necessary,
1842 // inserted V_SWAP_B32 has already both read and write new registers so
1843 // hazards related to these register has already been handled.
1844 Amt->setReg(NewAmt);
1845 Amt->setIsKill(false);
1846 // We do not update liveness, so verifier may see it as undef.
1847 Amt->setIsUndef();
1848 if (OverlappedDst)
1849 MI->getOperand(0).setReg(NewReg);
1850 if (OverlappedSrc) {
1851 Src1->setReg(NewReg);
1852 Src1->setIsKill(false);
1853 Src1->setIsUndef();
1854 }
1855
1856 return true;
1857}
1858
1859int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1860 int NSAtoVMEMWaitStates = 1;
1861
1862 if (!ST.hasNSAtoVMEMBug())
1863 return 0;
1864
1866 return 0;
1867
1868 const SIInstrInfo *TII = ST.getInstrInfo();
1869 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1870 if (!Offset || (Offset->getImm() & 6) == 0)
1871 return 0;
1872
1873 auto IsHazardFn = [TII](const MachineInstr &I) {
1874 if (!SIInstrInfo::isMIMG(I))
1875 return false;
1876 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1877 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1878 TII->getInstSizeInBytes(I) >= 16;
1879 };
1880
1881 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1882}
1883
1884int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1885 int FPAtomicToDenormModeWaitStates = 3;
1886
1888 return 0;
1889
1890 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1891 return 0;
1892
1893 auto IsHazardFn = [](const MachineInstr &I) {
1895 return false;
1896 return SIInstrInfo::isFPAtomic(I);
1897 };
1898
1899 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1900 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1901 return true;
1902
1903 switch (MI.getOpcode()) {
1904 case AMDGPU::S_WAITCNT:
1905 case AMDGPU::S_WAITCNT_VSCNT:
1906 case AMDGPU::S_WAITCNT_VMCNT:
1907 case AMDGPU::S_WAITCNT_EXPCNT:
1908 case AMDGPU::S_WAITCNT_LGKMCNT:
1909 case AMDGPU::S_WAIT_IDLE:
1910 return true;
1911 default:
1912 break;
1913 }
1914
1915 return false;
1916 };
1917
1918 return FPAtomicToDenormModeWaitStates -
1919 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1920}
1921
1922int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1924
1925 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1926}
1927
1928int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
1929 // Early exit if no padding is requested.
1930 if (MFMAPaddingRatio == 0)
1931 return 0;
1932
1934 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
1935 return 0;
1936
1937 int NeighborMFMALatency = 0;
1938 auto IsNeighboringMFMA = [&NeighborMFMALatency,
1939 this](const MachineInstr &MI) {
1940 if (!SIInstrInfo::isMFMA(MI))
1941 return false;
1942
1943 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
1944 return true;
1945 };
1946
1947 const int MaxMFMAPipelineWaitStates = 16;
1948 int WaitStatesSinceNeighborMFMA =
1949 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
1950
1951 int NeighborMFMAPaddingNeeded =
1952 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
1953 WaitStatesSinceNeighborMFMA;
1954
1955 return std::max(0, NeighborMFMAPaddingNeeded);
1956}
1957
1958int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1959 int WaitStatesNeeded = 0;
1960 unsigned Opc = MI->getOpcode();
1961
1962 auto IsVALUFn = [](const MachineInstr &MI) {
1963 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
1964 };
1965
1966 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1967 const int LegacyVALUWritesVGPRWaitStates = 2;
1968 const int VALUWritesExecWaitStates = 4;
1969 const int MaxWaitStates = 4;
1970
1971 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1972 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1973 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1974
1975 if (WaitStatesNeeded < MaxWaitStates) {
1976 for (const MachineOperand &Use : MI->explicit_uses()) {
1977 const int MaxWaitStates = 2;
1978
1979 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1980 continue;
1981
1982 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1983 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1984 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1985
1986 if (WaitStatesNeeded == MaxWaitStates)
1987 break;
1988 }
1989 }
1990 }
1991
1992 for (const MachineOperand &Op : MI->explicit_operands()) {
1993 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1994 continue;
1995
1996 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1997 continue;
1998
1999 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2000 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2001 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2002 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2003 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2004 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2005 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2006 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2007 const int MaxWaitStates = 18;
2008 Register Reg = Op.getReg();
2009 unsigned HazardDefLatency = 0;
2010
2011 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2012 this](const MachineInstr &MI) {
2013 if (!SIInstrInfo::isMFMA(MI))
2014 return false;
2015 Register DstReg = MI.getOperand(0).getReg();
2016 if (DstReg == Reg)
2017 return false;
2018 HazardDefLatency =
2019 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2020 return TRI.regsOverlap(DstReg, Reg);
2021 };
2022
2023 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2024 MaxWaitStates);
2025 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2026 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2027 int OpNo = Op.getOperandNo();
2028 if (OpNo == SrcCIdx) {
2029 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2030 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2031 switch (HazardDefLatency) {
2032 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2033 break;
2034 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2035 break;
2036 case 16: [[fallthrough]];
2037 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2038 break;
2039 }
2040 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2041 switch (HazardDefLatency) {
2042 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2043 break;
2044 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2045 break;
2046 case 16: [[fallthrough]];
2047 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2048 break;
2049 }
2050 }
2051
2052 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2053 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2054
2055 if (WaitStatesNeeded == MaxWaitStates)
2056 return WaitStatesNeeded; // Early exit.
2057
2058 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2059 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2060 return false;
2061 Register DstReg = MI.getOperand(0).getReg();
2062 return TRI.regsOverlap(Reg, DstReg);
2063 };
2064
2065 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2066 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2067 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2068 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2069 if (OpNo == SrcCIdx)
2070 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2071 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2072 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2073
2074 WaitStatesNeededForUse = NeedWaitStates -
2075 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2076 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2077
2078 if (WaitStatesNeeded == MaxWaitStates)
2079 return WaitStatesNeeded; // Early exit.
2080 }
2081
2082 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2083 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2084 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2085 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2086 const int MaxWaitStates = 13;
2087 Register DstReg = MI->getOperand(0).getReg();
2088 unsigned HazardDefLatency = 0;
2089
2090 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2091 this](const MachineInstr &MI) {
2092 if (!SIInstrInfo::isMFMA(MI))
2093 return false;
2094 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2095 HazardDefLatency =
2096 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2097 return TRI.regsOverlap(Reg, DstReg);
2098 };
2099
2100 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2101 int NeedWaitStates;
2102 switch (HazardDefLatency) {
2103 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2104 break;
2105 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2106 break;
2107 case 16: [[fallthrough]];
2108 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2109 break;
2110 }
2111
2112 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2113 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2114 }
2115
2116 // Pad neighboring MFMA with noops for better inter-wave performance.
2117 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2118
2119 return WaitStatesNeeded;
2120}
2121
2122int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2123 int WaitStatesNeeded = 0;
2124 unsigned Opc = MI->getOpcode();
2125
2126 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2128 };
2129
2130 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2133 };
2134
2135 if (!SIInstrInfo::isMFMA(*MI))
2136 return WaitStatesNeeded;
2137
2138 const int VALUWritesExecWaitStates = 4;
2139 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2140 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2141 VALUWritesExecWaitStates);
2142 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2143
2144 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2145
2146 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2147 for (const MachineOperand &Use : MI->explicit_uses()) {
2148 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2149 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2150 const int GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates = 3;
2151 const int GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates = 5;
2152 const int GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates = 4;
2153 const int GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates = 9;
2154 const int GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates = 8;
2155 const int GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates = 17;
2156 const int GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates = 16;
2157 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2158 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2159 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2160 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2161 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2162 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2163 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2164 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2165 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2166 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2167 const int GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates = 4;
2168 const int GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates = 6;
2169 const int GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates = 10;
2170 const int GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates = 18;
2171 const int GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates = 5;
2172 const int GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates = 7;
2173 const int GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates = 11;
2174 const int GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates = 19;
2175 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2176 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2177 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2178 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2179 const int MaxWaitStates = 19;
2180
2181 if (!Use.isReg())
2182 continue;
2183 Register Reg = Use.getReg();
2184 bool FullReg;
2185 const MachineInstr *MI1;
2186
2187 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2188 this](const MachineInstr &MI) {
2189 if (!SIInstrInfo::isMFMA(MI))
2190 return false;
2191 Register DstReg = MI.getOperand(0).getReg();
2192 FullReg = (DstReg == Reg);
2193 MI1 = &MI;
2194 return TRI.regsOverlap(DstReg, Reg);
2195 };
2196
2197 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2198 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2199 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2200
2201 int NumWaitStates =
2202 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2203 if (NumWaitStates == std::numeric_limits<int>::max())
2204 continue;
2205
2206 int OpNo = Use.getOperandNo();
2207 unsigned Opc1 = MI1->getOpcode();
2208 int NeedWaitStates = 0;
2209 if (OpNo == SrcCIdx) {
2210 if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2211 NeedWaitStates = 0;
2212 } else if (FullReg) {
2213 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2214 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2215 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2216 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2217 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2218 else if (ST.hasGFX940Insts() &&
2219 TSchedModel.computeInstrLatency(MI1) == 2)
2220 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2221 } else {
2222 switch (Opc1) {
2223 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2224 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2225 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2226 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2227 if (!isXDL(ST, *MI))
2228 NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2229 break;
2230 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2231 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2232 if (!isXDL(ST, *MI))
2233 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2234 break;
2235 default:
2236 if (ST.hasGFX940Insts() && isXDL(ST, *MI) && !isXDL(ST, *MI1))
2237 break;
2238 switch (TSchedModel.computeInstrLatency(MI1)) {
2239 case 2:
2240 NeedWaitStates = ST.hasGFX940Insts()
2241 ? isXDL(ST, *MI1)
2242 ? GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates
2243 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates
2244 : isDGEMM(Opc)
2245 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2246 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2247 break;
2248 case 4:
2249 assert(ST.hasGFX940Insts());
2250 NeedWaitStates = isXDL(ST, *MI1)
2251 ? GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates
2252 : GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates;
2253 break;
2254 case 8:
2255 NeedWaitStates = ST.hasGFX940Insts()
2256 ? isXDL(ST, *MI1)
2257 ? GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates
2258 : GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates
2259 : isDGEMM(Opc)
2260 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2261 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2262 break;
2263 case 16: [[fallthrough]];
2264 default:
2265 NeedWaitStates = ST.hasGFX940Insts()
2266 ? isXDL(ST, *MI1)
2267 ? GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates
2268 : GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates
2269 : isDGEMM(Opc)
2270 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2271 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2272 }
2273 }
2274 }
2275 } else {
2276 switch (Opc1) {
2277 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2278 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2279 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2280 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2281 NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2282 break;
2283 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2284 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2285 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2286 break;
2287 default:
2288 switch (TSchedModel.computeInstrLatency(MI1)) {
2289 case 2:
2290 NeedWaitStates = ST.hasGFX940Insts()
2291 ? isXDL(ST, *MI1)
2292 ? GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates
2293 : GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates
2294 : SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2295 break;
2296 case 4:
2297 assert(ST.hasGFX940Insts());
2298 NeedWaitStates = isXDL(ST, *MI1)
2299 ? GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates
2300 : GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates;
2301 break;
2302 case 8:
2303 NeedWaitStates = ST.hasGFX940Insts()
2304 ? isXDL(ST, *MI1)
2305 ? GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates
2306 : GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates
2307 : SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2308 break;
2309 case 16: [[fallthrough]];
2310 default:
2311 NeedWaitStates = ST.hasGFX940Insts()
2312 ? isXDL(ST, *MI1)
2313 ? GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates
2314 : GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates
2315 : SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2316 }
2317 }
2318 }
2319 if (WaitStatesNeeded >= NeedWaitStates)
2320 continue;
2321
2322 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2323 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2324
2325 if (WaitStatesNeeded == MaxWaitStates)
2326 break;
2327 }
2328
2329 return WaitStatesNeeded;
2330}
2331
2332int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2333 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2334 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2335 return 0;
2336
2337 int WaitStatesNeeded = 0;
2338
2339 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2340 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2341 };
2342
2343 for (const MachineOperand &Op : MI->explicit_uses()) {
2344 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2345 continue;
2346
2347 Register Reg = Op.getReg();
2348
2349 const int AccVgprReadLdStWaitStates = 2;
2350 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2351 const int MaxWaitStates = 2;
2352
2353 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2354 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2355 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2356
2357 if (WaitStatesNeeded == MaxWaitStates)
2358 return WaitStatesNeeded; // Early exit.
2359
2360 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2361 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2362 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2363 return false;
2364 auto IsVALUFn = [](const MachineInstr &MI) {
2366 };
2367 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2368 std::numeric_limits<int>::max();
2369 };
2370
2371 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2372 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2373 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2374 }
2375
2376 return WaitStatesNeeded;
2377}
2378
2379int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2380 if (!ST.hasGFX90AInsts())
2381 return 0;
2382
2383 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2384 return isDGEMM(MI.getOpcode());
2385 };
2386
2387 // This is checked in checkMAIHazards90A()
2388 if (SIInstrInfo::isMFMA(*MI))
2389 return 0;
2390
2391 const MachineRegisterInfo &MRI = MF.getRegInfo();
2392
2393 int WaitStatesNeeded = 0;
2394
2395 bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2398 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2399 bool IsVALU = SIInstrInfo::isVALU(*MI);
2400
2401 const MachineInstr *MFMA = nullptr;
2402 unsigned Reg;
2403 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2404 if (!SIInstrInfo::isMFMA(MI) ||
2405 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2406 return false;
2407 MFMA = &MI;
2408 return true;
2409 };
2410
2411 const MachineInstr *DOT = nullptr;
2412 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2413 if (!SIInstrInfo::isDOT(MI) ||
2414 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2415 return false;
2416 DOT = &MI;
2417 return true;
2418 };
2419
2420 bool DGEMMAfterVALUWrite = false;
2421 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2422 // Found DGEMM on reverse traversal to def.
2423 if (isDGEMM(MI.getOpcode()))
2424 DGEMMAfterVALUWrite = true;
2425
2426 // Only hazard if register is defined by a VALU and a DGEMM is found after
2427 // after the def.
2428 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2429 return false;
2430
2431 return true;
2432 };
2433
2434 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2435 AMDGPU::OpName::src2);
2436
2437 if (IsMemOrExport || IsVALU) {
2438 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2439 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2440 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2441 const int GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates = 4;
2442 const int GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates = 6;
2443 const int GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates = 10;
2444 const int GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates = 18;
2445 const int GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates = 5;
2446 const int GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates = 7;
2447 const int GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates = 11;
2448 const int GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates = 19;
2449 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2450 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2451 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2452 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2453 const int DotWriteSameDotReadSrcAB = 3;
2454 const int DotWriteDifferentVALURead = 3;
2455 const int DMFMABetweenVALUWriteVMEMRead = 2;
2456 const int MaxWaitStates = 19;
2457
2458 for (const MachineOperand &Use : MI->explicit_uses()) {
2459 if (!Use.isReg())
2460 continue;
2461 Reg = Use.getReg();
2462
2463 DOT = nullptr;
2464 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2465 MaxWaitStates);
2466 if (DOT) {
2467 int NeedWaitStates = 0;
2468 if (DOT->getOpcode() == MI->getOpcode()) {
2469 if (&Use - &MI->getOperand(0) != SrcCIdx)
2470 NeedWaitStates = DotWriteSameDotReadSrcAB;
2471 } else {
2472 NeedWaitStates = DotWriteDifferentVALURead;
2473 }
2474
2475 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2476 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2477 }
2478
2479 // Workaround for HW data hazard bug observed only in GFX90A. When there
2480 // is a DGEMM instruction in-between a VALU and a VMEM instruction it
2481 // causes the SQ to incorrectly not insert two wait states between the two
2482 // instructions needed to avoid data hazard.
2483 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2484 DGEMMAfterVALUWrite = false;
2485 if (TRI.isVectorRegister(MRI, Reg)) {
2486 int WaitStatesNeededForUse =
2487 DMFMABetweenVALUWriteVMEMRead -
2488 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2489 DMFMABetweenVALUWriteVMEMRead);
2490
2491 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2492 }
2493 }
2494
2495 MFMA = nullptr;
2496 WaitStatesSinceDef =
2497 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2498 if (!MFMA)
2499 continue;
2500
2501 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2502 int NeedWaitStates = MaxWaitStates;
2503 switch (HazardDefLatency) {
2504 case 2:
2505 NeedWaitStates =
2506 ST.hasGFX940Insts()
2507 ? isXDL(ST, *MFMA)
2508 ? GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates
2509 : GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates
2510 : SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2511 break;
2512 case 4:
2513 assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
2514 NeedWaitStates =
2515 isDGEMM(MFMA->getOpcode())
2516 ? IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2517 : DMFMA4x4WriteVgprVALUReadWaitStates
2518 : isXDL(ST, *MFMA)
2519 ? GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates
2520 : GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates;
2521 break;
2522 case 8:
2523 NeedWaitStates =
2524 ST.hasGFX940Insts()
2525 ? isXDL(ST, *MFMA)
2526 ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
2527 : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
2528 : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2529 break;
2530 case 16: [[fallthrough]];
2531 default:
2532 NeedWaitStates =
2533 isDGEMM(MFMA->getOpcode())
2534 ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
2535 : DMFMA16x16WriteVgprVALUReadWaitStates
2536 : ST.hasGFX940Insts()
2537 ? isXDL(ST, *MFMA)
2538 ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
2539 : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
2540 : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2541 break;
2542 }
2543
2544 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2545 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2546
2547 if (WaitStatesNeeded == MaxWaitStates)
2548 break;
2549 }
2550 }
2551
2552 unsigned Opc = MI->getOpcode();
2553 const int DMFMAToFMA64WaitStates = 2;
2554 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2555 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2556 Opc == AMDGPU::V_FMAC_F64_dpp) &&
2557 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2558 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2559 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2560 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2561 }
2562
2563 if (!IsVALU && !IsMemOrExport)
2564 return WaitStatesNeeded;
2565
2566 for (const MachineOperand &Def : MI->defs()) {
2567 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2568 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2569 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2570 const int GFX940_SMFMA2PassWriteVgprVALUWawWaitStates = 4;
2571 const int GFX940_SMFMA4PassWriteVgprVALUWawWaitStates = 6;
2572 const int GFX940_SMFMA8PassWriteVgprVALUWawWaitStates = 10;
2573 const int GFX940_SMFMA16PassWriteVgprVALUWawWaitStates = 18;
2574 const int GFX940_XDL2PassWriteVgprVALUWawWaitStates = 5;
2575 const int GFX940_XDL4PassWriteVgprVALUWawWaitStates = 7;
2576 const int GFX940_XDL8PassWriteVgprVALUWawWaitStates = 11;
2577 const int GFX940_XDL16PassWriteVgprVALUWawWaitStates = 19;
2578 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2579 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2580 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2581 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2582 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2583 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2584 const int DotWriteDifferentVALUWrite = 3;
2585 const int MaxWaitStates = 19;
2586 const int MaxWarWaitStates = 15;
2587
2588 Reg = Def.getReg();
2589
2590 DOT = nullptr;
2591 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2592 MaxWaitStates);
2593 if (DOT && DOT->getOpcode() != MI->getOpcode())
2594 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2595 WaitStatesSinceDef);
2596
2597 MFMA = nullptr;
2598 WaitStatesSinceDef =
2599 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2600 if (MFMA) {
2601 int NeedWaitStates = MaxWaitStates;
2602 switch (TSchedModel.computeInstrLatency(MFMA)) {
2603 case 2:
2604 NeedWaitStates = ST.hasGFX940Insts()
2605 ? isXDL(ST, *MFMA)
2606 ? GFX940_XDL2PassWriteVgprVALUWawWaitStates
2607 : GFX940_SMFMA2PassWriteVgprVALUWawWaitStates
2608 : SMFMA4x4WriteVgprVALUWawWaitStates;
2609 break;
2610 case 4:
2611 assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
2612 NeedWaitStates = isDGEMM(MFMA->getOpcode())
2613 ? DMFMA4x4WriteVgprVALUWriteWaitStates
2614 : isXDL(ST, *MFMA)
2615 ? GFX940_XDL4PassWriteVgprVALUWawWaitStates
2616 : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates;
2617 break;
2618 case 8:
2619 NeedWaitStates = ST.hasGFX940Insts()
2620 ? isXDL(ST, *MFMA)
2621 ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
2622 : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
2623 : SMFMA16x16WriteVgprVALUWawWaitStates;
2624 break;
2625 case 16: [[fallthrough]];
2626 default:
2627 NeedWaitStates = isDGEMM(MFMA->getOpcode())
2628 ? DMFMA16x16WriteVgprVALUWriteWaitStates
2629 : ST.hasGFX940Insts()
2630 ? isXDL(ST, *MFMA)
2631 ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
2632 : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
2633 : SMFMA32x32WriteVgprVALUWawWaitStates;
2634 break;
2635 }
2636
2637 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2638 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2639
2640 if (WaitStatesNeeded == MaxWaitStates)
2641 break;
2642 }
2643
2644 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2645 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2646 !MI.readsRegister(Reg, &TRI))
2647 return false;
2648
2649 if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2650 return false;
2651
2652 const MachineOperand *SrcC =
2653 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2654 assert(SrcC);
2655 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2656 return false;
2657
2658 MFMA = &MI;
2659 return true;
2660 };
2661
2662 MFMA = nullptr;
2663 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2664 MaxWarWaitStates);
2665 if (!MFMA)
2666 continue;
2667
2668 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2669 int NeedWaitStates = MaxWaitStates;
2670 switch (HazardDefLatency) {
2671 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2672 break;
2673 case 4: assert(ST.hasGFX940Insts());
2674 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2675 break;
2676 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2677 break;
2678 case 16: [[fallthrough]];
2679 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2680 break;
2681 }
2682
2683 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2684 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2685 }
2686
2687 return WaitStatesNeeded;
2688}
2689
2691 if (!SU->isInstr())
2692 return false;
2693
2694 const MachineInstr *MAI = nullptr;
2695
2696 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2697 MAI = nullptr;
2699 MAI = &MI;
2700 return MAI != nullptr;
2701 };
2702
2703 MachineInstr *MI = SU->getInstr();
2704 if (IsMFMAFn(*MI)) {
2705 int W = getWaitStatesSince(IsMFMAFn, 16);
2706 if (MAI)
2707 return W < (int)TSchedModel.computeInstrLatency(MAI);
2708 }
2709
2710 return false;
2711}
2712
2713bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2714 if (!ST.isWave64())
2715 return false;
2716 if (!ST.hasVALUMaskWriteHazard())
2717 return false;
2718 if (!SIInstrInfo::isSALU(*MI))
2719 return false;
2720
2721 // The hazard sequence is three instructions:
2722 // 1. VALU reads SGPR as mask
2723 // 2. SALU writes SGPR
2724 // 3. SALU reads SGPR
2725 // The hazard can expire if the distance between 2 and 3 is sufficient.
2726 // In practice this happens <10% of the time, hence this always assumes
2727 // the hazard exists if 1 and 2 are present to avoid searching.
2728
2729 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2730 if (!SDSTOp || !SDSTOp->isReg())
2731 return false;
2732
2733 const Register HazardReg = SDSTOp->getReg();
2734 if (HazardReg == AMDGPU::EXEC ||
2735 HazardReg == AMDGPU::EXEC_LO ||
2736 HazardReg == AMDGPU::EXEC_HI ||
2737 HazardReg == AMDGPU::M0)
2738 return false;
2739
2740 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2741 switch (I.getOpcode()) {
2742 case AMDGPU::V_ADDC_U32_e32:
2743 case AMDGPU::V_ADDC_U32_dpp:
2744 case AMDGPU::V_CNDMASK_B16_e32:
2745 case AMDGPU::V_CNDMASK_B16_dpp:
2746 case AMDGPU::V_CNDMASK_B32_e32:
2747 case AMDGPU::V_CNDMASK_B32_dpp:
2748 case AMDGPU::V_DIV_FMAS_F32_e64:
2749 case AMDGPU::V_DIV_FMAS_F64_e64:
2750 case AMDGPU::V_SUBB_U32_e32:
2751 case AMDGPU::V_SUBB_U32_dpp:
2752 case AMDGPU::V_SUBBREV_U32_e32:
2753 case AMDGPU::V_SUBBREV_U32_dpp:
2754 // These implicitly read VCC as mask source.
2755 return HazardReg == AMDGPU::VCC ||
2756 HazardReg == AMDGPU::VCC_LO ||
2757 HazardReg == AMDGPU::VCC_HI;
2758 case AMDGPU::V_ADDC_U32_e64:
2759 case AMDGPU::V_ADDC_U32_e64_dpp:
2760 case AMDGPU::V_CNDMASK_B16_e64:
2761 case AMDGPU::V_CNDMASK_B16_e64_dpp:
2762 case AMDGPU::V_CNDMASK_B32_e64:
2763 case AMDGPU::V_CNDMASK_B32_e64_dpp:
2764 case AMDGPU::V_SUBB_U32_e64:
2765 case AMDGPU::V_SUBB_U32_e64_dpp:
2766 case AMDGPU::V_SUBBREV_U32_e64:
2767 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2768 // Only check mask register overlaps.
2769 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2770 assert(SSRCOp);
2771 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2772 }
2773 default:
2774 return false;
2775 }
2776 };
2777
2778 const MachineRegisterInfo &MRI = MF.getRegInfo();
2779 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2780 // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2781 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2782 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2783 return true;
2784
2785 // VALU access to any SGPR or literal constant other than HazardReg
2786 // mitigates hazard. No need to check HazardReg here as this will
2787 // only be called when !IsHazardFn.
2788 if (!SIInstrInfo::isVALU(I))
2789 return false;
2790 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2791 const MachineOperand &Op = I.getOperand(OpNo);
2792 if (Op.isReg()) {
2793 Register OpReg = Op.getReg();
2794 // Only consider uses
2795 if (!Op.isUse())
2796 continue;
2797 // Ignore EXEC
2798 if (OpReg == AMDGPU::EXEC ||
2799 OpReg == AMDGPU::EXEC_LO ||
2800 OpReg == AMDGPU::EXEC_HI)
2801 continue;
2802 // Ignore all implicit uses except VCC
2803 if (Op.isImplicit()) {
2804 if (OpReg == AMDGPU::VCC ||
2805 OpReg == AMDGPU::VCC_LO ||
2806 OpReg == AMDGPU::VCC_HI)
2807 return true;
2808 continue;
2809 }
2810 if (TRI.isSGPRReg(MRI, OpReg))
2811 return true;
2812 } else {
2813 const MCInstrDesc &InstDesc = I.getDesc();
2814 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2815 if (!TII.isInlineConstant(Op, OpInfo))
2816 return true;
2817 }
2818 }
2819 return false;
2820 };
2821
2822 // Check for hazard
2823 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2824 std::numeric_limits<int>::max())
2825 return false;
2826
2827 auto NextMI = std::next(MI->getIterator());
2828
2829 // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
2830 BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
2831 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
2833
2834 // SALU write may be s_getpc in a bundle.
2835 if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2836 // Update offsets of any references in the bundle.
2837 while (NextMI != MI->getParent()->end() &&
2838 NextMI->isBundledWithPred()) {
2839 for (auto &Operand : NextMI->operands()) {
2840 if (Operand.isGlobal())
2841 Operand.setOffset(Operand.getOffset() + 4);
2842 }
2843 NextMI++;
2844 }
2845 }
2846
2847 return true;
2848}
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
bool End
Definition: ELF_riscv.cpp:469
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI)
static bool isLdsDma(const MachineInstr &MI)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isSSetReg(unsigned Opcode)
static bool hasHazard(StateT State, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, DenseSet< const MachineBasicBlock * > &Visited)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
@ HazardExpired
@ NoHazardFound
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
static bool isDGEMM(unsigned Opcode)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
static bool isPermlane(const MachineInstr &MI)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
if(VerifyEach)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static const uint32_t IV[8]
Definition: blake3_impl.h:78
bool anyCommon(const BitVector &RHS) const
Test if any common bits are set.
Definition: BitVector.h:489
BitVector & set()
Definition: BitVector.h:351
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:188
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be schedule.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
unsigned PreEmitNoopsCommon(MachineInstr *)
bool ShouldPreferAnother(SUnit *SU) override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
GCNHazardRecognizer(const MachineFunction &MF)
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
bool hasShift64HighRegBug() const
bool hasFPAtomicToDenormModeHazard() const
bool hasLdsBranchVmemWARHazard() const
bool hasGFX90AInsts() const
bool hasDstSelForwardingHazard() const
bool hasMAIInsts() const
Definition: GCNSubtarget.h:754
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:232
bool hasVALUMaskWriteHazard() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasVcmpxExecWARHazard() const
bool hasReadM0MovRelInterpHazard() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:244
bool hasVcmpxPermlaneHazard() const
bool has12DWordStoreHazard() const
bool hasVALUPartialForwardingHazard() const
bool hasNoDataDepHazard() const
Definition: GCNSubtarget.h:816
unsigned getSetRegWaitStates() const
Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
Definition: GCNSubtarget.h:469
bool hasTransForwardingHazard() const
bool hasGFX940Insts() const
bool hasReadM0LdsDmaHazard() const
bool hasSMEMtoVectorWriteHazard() const
bool hasVMEMtoScalarWriteHazard() const
bool hasNSAtoVMEMBug() const
bool hasVDecCoExecHazard() const
bool hasReadM0SendMsgHazard() const
bool hasReadM0LdsDirectHazard() const
bool isXNACKEnabled() const
Definition: GCNSubtarget.h:565
bool hasSMRDReadVALUDefHazard() const
A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR was written by a VALU inst...
Definition: GCNSubtarget.h:454
bool hasRFEHazards() const
Definition: GCNSubtarget.h:464
bool hasVMEMReadSGPRVALUDefHazard() const
A read of an SGPR by a VMEM instruction requires 5 wait states when the SGPR was written by a VALU In...
Definition: GCNSubtarget.h:460
bool isWave64() const
bool hasVALUTransUseHazard() const
bool hasLdsDirect() const
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
reverse_instr_iterator instr_rend()
instr_iterator instr_end()
iterator_range< pred_iterator > predecessors()
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:68
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:543
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:326
bool isBundle() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:710
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:526
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:404
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:516
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:508
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:597
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:388
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:484
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isVINTRP(const MachineInstr &MI)
Definition: SIInstrInfo.h:702
static bool isDOT(const MachineInstr &MI)
Definition: SIInstrInfo.h:723
static bool isLDSDIR(const MachineInstr &MI)
Definition: SIInstrInfo.h:743
bool isBufferSMRD(const MachineInstr &MI) const
static bool isTRANS(const MachineInstr &MI)
Definition: SIInstrInfo.h:686
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:500
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:566
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:678
static bool isMFMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:718
static bool isFPAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:825
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:727
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:560
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:396
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:362
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:373
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:474
ProcResIter getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCSchedClassDesc * resolveSchedClass(const MachineInstr *MI) const
Return the MCSchedClassDesc for this instruction.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
ProcResIter getWriteProcResBegin(const MCSchedClassDesc *SC) const
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:31
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
An efficient, type-erasing, non-owning reference to a callable.
Iterator for intrusive lists based on ilist_node.
self_iterator getIterator()
Definition: ilist_node.h:82
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
IsaVersion getIsaVersion(StringRef GPU)
bool getMAIIsGFX940XDL(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool getMAIIsDGEMM(unsigned Opc)
Returns true if MAI operation is a double precision GEMM.
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
constexpr double e
Definition: MathExtras.h:31
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:330
@ Offset
Definition: DWP.cpp:440
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Instruction set architecture version.
Definition: TargetParser.h:114
Represents the counter values to wait for in an s_waitcnt instruction.
Description of the encoding of one expression Op.
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:118
uint16_t ReleaseAtCycle
Cycle at which the resource will be released by an instruction, relatively to the cycle in which the ...
Definition: MCSchedule.h:68
Definition: regcomp.c:192