1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
15#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16#include "SIMachineFunctionInfo.h"
17#include "llvm/CodeGen/MachineFunction.h"
18#include "llvm/CodeGen/ScheduleDAG.h"
19#include "llvm/TargetParser/TargetParser.h"
20
21using namespace llvm;
22
23namespace {
24
25struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
26 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
27
28 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
29 if (Arg.getAsInteger(0, Value))
30 return O.error("'" + Arg + "' value invalid for uint argument!");
31
32 if (Value > 100)
33 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
34
35 return false;
36 }
37};
38
39} // end anonymous namespace
40
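// Used by checkMFMAPadding(): a ratio of N requests that roughly N% of the
// producing MFMA's pipeline latency is covered by s_nop padding before the
// next MFMA is issued. The default of 0 disables the padding.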
41static cl::opt<unsigned, false, MFMAPaddingRatioParser>
42 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
43 cl::desc("Fill a percentage of the latency between "
44 "neighboring MFMA with s_nops."));
45
46//===----------------------------------------------------------------------===//
47// Hazard Recognizer Implementation
48//===----------------------------------------------------------------------===//
49
50static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
51 const GCNSubtarget &ST);
52
53GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
54 IsHazardRecognizerMode(false),
55 CurrCycleInstr(nullptr),
56 MF(MF),
57 ST(MF.getSubtarget<GCNSubtarget>()),
58 TII(*ST.getInstrInfo()),
59 TRI(TII.getRegisterInfo()),
60 ClauseUses(TRI.getNumRegUnits()),
61 ClauseDefs(TRI.getNumRegUnits()) {
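 // Functions that use AGPRs (i.e. that contain MFMA code) need a much deeper
 // look-ahead window, since MFMA-related hazards can require far more wait
 // states than the ordinary VALU/SMEM hazards tracked here.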
62 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
63 TSchedModel.init(&ST);
64 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
65}
66
67void GCNHazardRecognizer::Reset() {
68 EmittedInstrs.clear();
69}
70
71void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
72 EmitInstruction(SU->getInstr());
73}
74
75void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
76 CurrCycleInstr = MI;
77}
78
79static bool isDivFMas(unsigned Opcode) {
80 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
81}
82
83static bool isSGetReg(unsigned Opcode) {
84 return Opcode == AMDGPU::S_GETREG_B32;
85}
86
87static bool isSSetReg(unsigned Opcode) {
88 switch (Opcode) {
89 case AMDGPU::S_SETREG_B32:
90 case AMDGPU::S_SETREG_B32_mode:
91 case AMDGPU::S_SETREG_IMM32_B32:
92 case AMDGPU::S_SETREG_IMM32_B32_mode:
93 return true;
94 }
95 return false;
96}
97
98static bool isRWLane(unsigned Opcode) {
99 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
100}
101
102static bool isRFE(unsigned Opcode) {
103 return Opcode == AMDGPU::S_RFE_B64;
104}
105
106static bool isSMovRel(unsigned Opcode) {
107 switch (Opcode) {
108 case AMDGPU::S_MOVRELS_B32:
109 case AMDGPU::S_MOVRELS_B64:
110 case AMDGPU::S_MOVRELD_B32:
111 case AMDGPU::S_MOVRELD_B64:
112 return true;
113 default:
114 return false;
115 }
116}
117
118static bool isDGEMM(unsigned Opcode) {
119 return AMDGPU::getMAIIsDGEMM(Opcode);
120}
121
122static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
123 unsigned Opcode = MI.getOpcode();
124
125 if (!SIInstrInfo::isMAI(MI) ||
126 isDGEMM(Opcode) ||
127 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
128 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
129 return false;
130
131 if (!ST.hasGFX940Insts())
132 return true;
133
134 return AMDGPU::getMAIIsGFX940XDL(Opcode);
135}
136
137static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
138 const MachineInstr &MI) {
139 if (TII.isAlwaysGDS(MI.getOpcode()))
140 return true;
141
142 switch (MI.getOpcode()) {
143 case AMDGPU::S_SENDMSG:
144 case AMDGPU::S_SENDMSGHALT:
145 case AMDGPU::S_TTRACEDATA:
146 return true;
147 // These DS opcodes don't support GDS.
148 case AMDGPU::DS_NOP:
149 case AMDGPU::DS_PERMUTE_B32:
150 case AMDGPU::DS_BPERMUTE_B32:
151 return false;
152 default:
153 if (TII.isDS(MI.getOpcode())) {
154 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
155 AMDGPU::OpName::gds);
156 if (MI.getOperand(GDS).getImm())
157 return true;
158 }
159 return false;
160 }
161}
162
163static bool isPermlane(const MachineInstr &MI) {
164 unsigned Opcode = MI.getOpcode();
165 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANE64_B32 ||
167 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
170}
171
172static bool isLdsDma(const MachineInstr &MI) {
173 return SIInstrInfo::isVALU(MI) &&
174 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
175}
176
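// Decode the hardware register id from the simm16 operand of an
// s_setreg/s_getreg instruction, so hazards on the same hwreg can be matched.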
177static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
178 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
179 AMDGPU::OpName::simm16);
180 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
181}
182
183ScheduleHazardRecognizer::HazardType
184GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
185 MachineInstr *MI = SU->getInstr();
186 // If we are not in "HazardRecognizerMode" and therefore not being run from
187 // the scheduler, track possible stalls from hazards but don't insert noops.
188 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
189
190 if (MI->isBundle())
191 return NoHazard;
192
193 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
194 return HazardType;
195
196 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
197 return HazardType;
198
199 if (checkFPAtomicToDenormModeHazard(MI) > 0)
200 return HazardType;
201
202 if (ST.hasNoDataDepHazard())
203 return NoHazard;
204
205 // FIXME: Should flat be considered vmem?
206 if ((SIInstrInfo::isVMEM(*MI) ||
207 SIInstrInfo::isFLAT(*MI))
208 && checkVMEMHazards(MI) > 0)
209 return HazardType;
210
211 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
212 return HazardType;
213
214 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
215 return HazardType;
216
217 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
218 return HazardType;
219
220 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
221 return HazardType;
222
223 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
224 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
225 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
226 return HazardType;
227
228 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
229 return HazardType;
230
231 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
232 return HazardType;
233
234 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
235 return HazardType;
236
237 if (((ST.hasReadM0MovRelInterpHazard() &&
238 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
239 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
240 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
241 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
242 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
243 (ST.hasReadM0LdsDirectHazard() &&
244 MI->readsRegister(AMDGPU::LDS_DIRECT))) &&
245 checkReadM0Hazards(MI) > 0)
246 return HazardType;
247
248 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
249 return HazardType;
250
251 if ((SIInstrInfo::isVMEM(*MI) ||
252 SIInstrInfo::isFLAT(*MI) ||
253 SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
254 return HazardType;
255
256 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
257 return HazardType;
258
259 return NoHazard;
260}
261
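// Emit enough s_nop instructions before MI to cover Quantity wait states;
// a single s_nop encodes at most 8 wait states (immediates 0..7).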
262static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
263 unsigned Quantity) {
264 while (Quantity > 0) {
265 unsigned Arg = std::min(Quantity, 8u);
266 Quantity -= Arg;
267 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
268 .addImm(Arg - 1);
269 }
270}
271
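// Number of wait states an MFMA occupies its execution pipeline, taken from
// the scheduling model (ReleaseAtCycle of the instruction's first write
// resource).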
272unsigned
273GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
274 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
275 assert(TSchedModel.getWriteProcResBegin(SC) !=
276 TSchedModel.getWriteProcResEnd(SC));
277 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
278}
279
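// Run the hazard checks (and, in hazard recognizer mode, the fixups and noop
// insertion) on every MachineInstr inside the bundle headed by CurrCycleInstr.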
280void GCNHazardRecognizer::processBundle() {
281 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
282 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
283 // Check bundled MachineInstr's for hazards.
284 for (; MI != E && MI->isInsideBundle(); ++MI) {
285 CurrCycleInstr = &*MI;
286 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
287
288 if (IsHazardRecognizerMode) {
289 fixHazards(CurrCycleInstr);
290
291 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
292 }
293
294 // It's unnecessary to track more than MaxLookAhead instructions. Since we
295 // include the bundled MI directly after, only add a maximum of
296 // (MaxLookAhead - 1) noops to EmittedInstrs.
297 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
298 EmittedInstrs.push_front(nullptr);
299
300 EmittedInstrs.push_front(CurrCycleInstr);
301 EmittedInstrs.resize(MaxLookAhead);
302 }
303 CurrCycleInstr = nullptr;
304}
305
306void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
307 assert(IsHazardRecognizerMode);
308
309 unsigned NumPreNoops = PreEmitNoops(MI);
310 EmitNoops(NumPreNoops);
311 if (MI->isInsideBundle())
312 insertNoopsInBundle(MI, TII, NumPreNoops);
313 else
314 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
315 NumPreNoops);
316 EmitInstruction(MI);
317 AdvanceCycle();
318}
319
320unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
321 IsHazardRecognizerMode = true;
322 CurrCycleInstr = MI;
323 unsigned W = PreEmitNoopsCommon(MI);
324 fixHazards(MI);
325 CurrCycleInstr = nullptr;
326 return W;
327}
328
329unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
330 if (MI->isBundle())
331 return 0;
332
333 int WaitStates = 0;
334
335 if (SIInstrInfo::isSMRD(*MI))
336 return std::max(WaitStates, checkSMRDHazards(MI));
337
338 if (ST.hasNSAtoVMEMBug())
339 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
340
341 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
342
343 if (ST.hasNoDataDepHazard())
344 return WaitStates;
345
346 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
347 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
348
349 if (SIInstrInfo::isVALU(*MI))
350 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
351
352 if (SIInstrInfo::isDPP(*MI))
353 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
354
355 if (isDivFMas(MI->getOpcode()))
356 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
357
358 if (isRWLane(MI->getOpcode()))
359 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
360
361 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
362 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
363 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
364 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
365
366 if (MI->isInlineAsm())
367 return std::max(WaitStates, checkInlineAsmHazards(MI));
368
369 if (isSGetReg(MI->getOpcode()))
370 return std::max(WaitStates, checkGetRegHazards(MI));
371
372 if (isSSetReg(MI->getOpcode()))
373 return std::max(WaitStates, checkSetRegHazards(MI));
374
375 if (isRFE(MI->getOpcode()))
376 return std::max(WaitStates, checkRFEHazards(MI));
377
378 if ((ST.hasReadM0MovRelInterpHazard() &&
379 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
380 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
381 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
382 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
383 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
384 (ST.hasReadM0LdsDirectHazard() && MI->readsRegister(AMDGPU::LDS_DIRECT)))
385 return std::max(WaitStates, checkReadM0Hazards(MI));
386
387 if (SIInstrInfo::isMAI(*MI))
388 return std::max(WaitStates, checkMAIHazards(MI));
389
390 if (SIInstrInfo::isVMEM(*MI) ||
391 SIInstrInfo::isFLAT(*MI) ||
392 SIInstrInfo::isDS(*MI))
393 return std::max(WaitStates, checkMAILdStHazards(MI));
394
395 return WaitStates;
396}
397
398void GCNHazardRecognizer::EmitNoop() {
399 EmittedInstrs.push_front(nullptr);
400}
401
402void GCNHazardRecognizer::AdvanceCycle() {
403 // When the scheduler detects a stall, it will call AdvanceCycle() without
404 // emitting any instructions.
405 if (!CurrCycleInstr) {
406 EmittedInstrs.push_front(nullptr);
407 return;
408 }
409
410 if (CurrCycleInstr->isBundle()) {
411 processBundle();
412 return;
413 }
414
415 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
416 if (!NumWaitStates) {
417 CurrCycleInstr = nullptr;
418 return;
419 }
420
421 // Keep track of emitted instructions
422 EmittedInstrs.push_front(CurrCycleInstr);
423
424 // Add a nullptr for each additional wait state after the first. Make sure
425 // not to add more than getMaxLookAhead() items to the list, since we
426 // truncate the list to that size right after this loop.
427 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
428 i < e; ++i) {
429 EmittedInstrs.push_front(nullptr);
430 }
431
432 // getMaxLookahead() is the largest number of wait states we will ever need
433 // to insert, so there is no point in keeping track of more than that many
434 // wait states.
435 EmittedInstrs.resize(getMaxLookAhead());
436
437 CurrCycleInstr = nullptr;
438}
439
440void GCNHazardRecognizer::RecedeCycle() {
441 llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
442}
443
444//===----------------------------------------------------------------------===//
445// Helper Functions
446//===----------------------------------------------------------------------===//
447
448typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult;
449
450typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
451typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
452
453// Search for a hazard in a block and its predecessors.
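// IsHazard both detects the hazard and signals when the search has expired
// (HazardFound / HazardExpired / NoHazardFound), while UpdateState folds each
// scanned instruction into the caller-provided StateT.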
454template <typename StateT>
455static bool
456hasHazard(StateT State,
457 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
458 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
459 const MachineBasicBlock *MBB,
460 MachineBasicBlock::const_reverse_instr_iterator I,
461 DenseSet<const MachineBasicBlock *> &Visited) {
462 for (auto E = MBB->instr_rend(); I != E; ++I) {
463 // No need to look at parent BUNDLE instructions.
464 if (I->isBundle())
465 continue;
466
467 switch (IsHazard(State, *I)) {
468 case HazardFound:
469 return true;
470 case HazardExpired:
471 return false;
472 default:
473 // Continue search
474 break;
475 }
476
477 if (I->isInlineAsm() || I->isMetaInstruction())
478 continue;
479
480 UpdateState(State, *I);
481 }
482
483 for (MachineBasicBlock *Pred : MBB->predecessors()) {
484 if (!Visited.insert(Pred).second)
485 continue;
486
487 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
488 Visited))
489 return true;
490 }
491
492 return false;
493}
494
495// Returns the minimum number of wait states since \p I, walking all
496// predecessors. Only scans until \p IsExpired returns true.
497// Can only be run in a hazard recognizer mode.
498static int getWaitStatesSince(
499 GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
500 MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
501 IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
502 GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
503 for (auto E = MBB->instr_rend(); I != E; ++I) {
504 // Don't add WaitStates for parent BUNDLE instructions.
505 if (I->isBundle())
506 continue;
507
508 if (IsHazard(*I))
509 return WaitStates;
510
511 if (I->isInlineAsm())
512 continue;
513
514 WaitStates += GetNumWaitStates(*I);
515
516 if (IsExpired(*I, WaitStates))
517 return std::numeric_limits<int>::max();
518 }
519
520 int MinWaitStates = std::numeric_limits<int>::max();
521 for (MachineBasicBlock *Pred : MBB->predecessors()) {
522 if (!Visited.insert(Pred).second)
523 continue;
524
525 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
526 IsExpired, Visited, GetNumWaitStates);
527
528 MinWaitStates = std::min(MinWaitStates, W);
529 }
530
531 return MinWaitStates;
532}
533
534static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
535 const MachineInstr *MI, IsExpiredFn IsExpired) {
536 DenseSet<const MachineBasicBlock *> Visited;
537 return getWaitStatesSince(IsHazard, MI->getParent(),
538 std::next(MI->getReverseIterator()),
539 0, IsExpired, Visited);
540}
541
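// Scan backwards for a hazard, either by walking the CFG (hazard recognizer
// mode) or by consulting the EmittedInstrs history filled in by AdvanceCycle().
// Returns the number of wait states seen before the hazard, or INT_MAX when no
// hazard is found within Limit wait states.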
542int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
543 if (IsHazardRecognizerMode) {
544 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
545 return WaitStates >= Limit;
546 };
547 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
548 }
549
550 int WaitStates = 0;
551 for (MachineInstr *MI : EmittedInstrs) {
552 if (MI) {
553 if (IsHazard(*MI))
554 return WaitStates;
555
556 if (MI->isInlineAsm())
557 continue;
558 }
559 ++WaitStates;
560
561 if (WaitStates >= Limit)
562 break;
563 }
564 return std::numeric_limits<int>::max();
565}
566
567int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
568 IsHazardFn IsHazardDef,
569 int Limit) {
570 const SIRegisterInfo *TRI = ST.getRegisterInfo();
571
572 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
573 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
574 };
575
576 return getWaitStatesSince(IsHazardFn, Limit);
577}
578
579int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
580 int Limit) {
581 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
582 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
583 };
584
585 return getWaitStatesSince(IsHazardFn, Limit);
586}
587
588//===----------------------------------------------------------------------===//
589// No-op Hazard Detection
590//===----------------------------------------------------------------------===//
591
592static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
593 MCRegister Reg) {
594 for (MCRegUnit Unit : TRI.regunits(Reg))
595 BV.set(Unit);
596}
597
598static void addRegsToSet(const SIRegisterInfo &TRI,
599 iterator_range<MachineInstr::const_mop_iterator> Ops,
600 BitVector &DefSet, BitVector &UseSet) {
601 for (const MachineOperand &Op : Ops) {
602 if (Op.isReg())
603 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
604 }
605}
606
607void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
608 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
609}
610
611static bool breaksSMEMSoftClause(MachineInstr *MI) {
612 return !SIInstrInfo::isSMRD(*MI);
613}
614
615static bool breaksVMEMSoftClause(MachineInstr *MI) {
616 return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
617}
618
619int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
620 // SMEM soft clauses are only present on VI+, and only matter if xnack is
621 // enabled.
622 if (!ST.isXNACKEnabled())
623 return 0;
624
625 bool IsSMRD = TII.isSMRD(*MEM);
626
627 resetClause();
628
629 // A soft-clause is any group of consecutive SMEM instructions. The
630 // instructions in this group may return out of order and/or may be
631 // replayed (i.e. the same instruction issued more than once).
632 //
633 // In order to handle these situations correctly we need to make sure that
634 // when a clause has more than one instruction, no instruction in the clause
635 // writes to a register that is read by another instruction in the clause
636 // (including itself). If we encounter this situation, we need to break the
637 // clause by inserting a non SMEM instruction.
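 //
 // For example, an s_load that writes the SGPR pair used as the address base
 // of the next s_load in the clause creates exactly this kind of intra-clause
 // dependence and must be split into a separate clause.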
638
639 for (MachineInstr *MI : EmittedInstrs) {
640 // When we hit a non-SMEM instruction then we have passed the start of the
641 // clause and we can stop.
642 if (!MI)
643 break;
644
645 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
646 break;
647
648 addClauseInst(*MI);
649 }
650
651 if (ClauseDefs.none())
652 return 0;
653
654 // We need to make sure not to put loads and stores in the same clause if they
655 // use the same address. For now, just start a new clause whenever we see a
656 // store.
657 if (MEM->mayStore())
658 return 1;
659
660 addClauseInst(*MEM);
661
662 // If the set of defs and uses intersect then we cannot add this instruction
663 // to the clause, so we have a hazard.
664 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
665}
666
667int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
668 int WaitStatesNeeded = 0;
669
670 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
671
672 // This SMRD hazard only affects SI.
673 if (!ST.hasSMRDReadVALUDefHazard())
674 return WaitStatesNeeded;
675
676 // A read of an SGPR by SMRD instruction requires 4 wait states when the
677 // SGPR was written by a VALU instruction.
678 int SmrdSgprWaitStates = 4;
679 auto IsHazardDefFn = [this](const MachineInstr &MI) {
680 return TII.isVALU(MI);
681 };
682 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
683 return TII.isSALU(MI);
684 };
685
686 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
687
688 for (const MachineOperand &Use : SMRD->uses()) {
689 if (!Use.isReg())
690 continue;
691 int WaitStatesNeededForUse =
692 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
693 SmrdSgprWaitStates);
694 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
695
696 // This fixes what appears to be undocumented hardware behavior in SI where
697 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
698 // needs some number of nops in between. We don't know how many we need, but
699 // let's use 4. This wasn't discovered before probably because the only
700 // case when this happens is when we expand a 64-bit pointer into a full
701 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
702 // probably never encountered in the closed-source land.
703 if (IsBufferSMRD) {
704 int WaitStatesNeededForUse =
705 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
706 IsBufferHazardDefFn,
707 SmrdSgprWaitStates);
708 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
709 }
710 }
711
712 return WaitStatesNeeded;
713}
714
715int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
716 if (!ST.hasVMEMReadSGPRVALUDefHazard())
717 return 0;
718
719 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
720
721 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
722 // SGPR was written by a VALU Instruction.
723 const int VmemSgprWaitStates = 5;
724 auto IsHazardDefFn = [this](const MachineInstr &MI) {
725 return TII.isVALU(MI);
726 };
727 for (const MachineOperand &Use : VMEM->uses()) {
728 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
729 continue;
730
731 int WaitStatesNeededForUse =
732 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
733 VmemSgprWaitStates);
734 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
735 }
736 return WaitStatesNeeded;
737}
738
739int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
740 const SIRegisterInfo *TRI = ST.getRegisterInfo();
741 const SIInstrInfo *TII = ST.getInstrInfo();
742
743 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
744 int DppVgprWaitStates = 2;
745 int DppExecWaitStates = 5;
746 int WaitStatesNeeded = 0;
747 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
748 return TII->isVALU(MI);
749 };
750
751 for (const MachineOperand &Use : DPP->uses()) {
752 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
753 continue;
754 int WaitStatesNeededForUse =
755 DppVgprWaitStates - getWaitStatesSinceDef(
756 Use.getReg(),
757 [](const MachineInstr &) { return true; },
758 DppVgprWaitStates);
759 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
760 }
761
762 WaitStatesNeeded = std::max(
763 WaitStatesNeeded,
764 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
765 DppExecWaitStates));
766
767 return WaitStatesNeeded;
768}
769
770int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
771 const SIInstrInfo *TII = ST.getInstrInfo();
772
773 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
774 // instruction.
775 const int DivFMasWaitStates = 4;
776 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
777 return TII->isVALU(MI);
778 };
779 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
780 DivFMasWaitStates);
781
782 return DivFMasWaitStates - WaitStatesNeeded;
783}
784
785int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
786 const SIInstrInfo *TII = ST.getInstrInfo();
787 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
788
789 const int GetRegWaitStates = 2;
790 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
791 return GetRegHWReg == getHWReg(TII, MI);
792 };
793 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
794
795 return GetRegWaitStates - WaitStatesNeeded;
796}
797
798int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
799 const SIInstrInfo *TII = ST.getInstrInfo();
800 unsigned HWReg = getHWReg(TII, *SetRegInstr);
801
802 const int SetRegWaitStates = ST.getSetRegWaitStates();
803 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
804 return HWReg == getHWReg(TII, MI);
805 };
806 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
807 return SetRegWaitStates - WaitStatesNeeded;
808}
809
810int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
811 if (!MI.mayStore())
812 return -1;
813
814 const SIInstrInfo *TII = ST.getInstrInfo();
815 unsigned Opcode = MI.getOpcode();
816 const MCInstrDesc &Desc = MI.getDesc();
817
818 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
819 int VDataRCID = -1;
820 if (VDataIdx != -1)
821 VDataRCID = Desc.operands()[VDataIdx].RegClass;
822
823 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
824 // There is no hazard if the instruction does not use vector regs
825 // (like wbinvl1)
826 if (VDataIdx == -1)
827 return -1;
828 // For MUBUF/MTBUF instructions this hazard only exists if the
829 // instruction is not using a register in the soffset field.
830 const MachineOperand *SOffset =
831 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
832 // If we have no soffset operand, then assume this field has been
833 // hardcoded to zero.
834 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
835 (!SOffset || !SOffset->isReg()))
836 return VDataIdx;
837 }
838
839 // MIMG instructions create a hazard if they don't use a 256-bit T# and
840 // the store size is greater than 8 bytes and they have more than two bits
841 // of their dmask set.
842 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
843 if (TII->isMIMG(MI)) {
844 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
845 assert(SRsrcIdx != -1 &&
846 AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
847 (void)SRsrcIdx;
848 }
849
850 if (TII->isFLAT(MI)) {
851 int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
852 if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
853 return DataIdx;
854 }
855
856 return -1;
857}
858
859int
860GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
861 const MachineRegisterInfo &MRI) {
862 // Helper to check for the hazard where VMEM instructions that store more than
863 // 8 bytes can have their store data overwritten by the next instruction.
864 const SIRegisterInfo *TRI = ST.getRegisterInfo();
865
866 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
867 int WaitStatesNeeded = 0;
868
869 if (!TRI->isVectorRegister(MRI, Def.getReg()))
870 return WaitStatesNeeded;
871 Register Reg = Def.getReg();
872 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
873 int DataIdx = createsVALUHazard(MI);
874 return DataIdx >= 0 &&
875 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
876 };
877 int WaitStatesNeededForDef =
878 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
879 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
880
881 return WaitStatesNeeded;
882}
883
884int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
885 int WaitStatesNeeded = 0;
886
887 if (ST.hasTransForwardingHazard()) {
888 const int TransDefWaitstates = 1;
889
890 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
891 if (!SIInstrInfo::isTRANS(MI))
892 return false;
893 const SIRegisterInfo *TRI = ST.getRegisterInfo();
894 const SIInstrInfo *TII = ST.getInstrInfo();
895 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
896
897 for (const MachineOperand &Use : VALU->explicit_uses()) {
898 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
899 return true;
900 }
901
902 return false;
903 };
904
905 int WaitStatesNeededForDef =
906 TransDefWaitstates -
907 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
908 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
909 }
910
911 if (ST.hasDstSelForwardingHazard()) {
912 const int Shift16DefWaitstates = 1;
913
914 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
915 if (!SIInstrInfo::isVALU(MI))
916 return false;
917 const SIInstrInfo *TII = ST.getInstrInfo();
918 if (SIInstrInfo::isSDWA(MI)) {
919 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
920 if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
921 return false;
922 } else {
923 if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
924 !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
925 ->getImm() &
926 SISrcMods::DST_OP_SEL))
927 return false;
928 }
929 const SIRegisterInfo *TRI = ST.getRegisterInfo();
930 if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
931 Register Def = Dst->getReg();
932
933 for (const MachineOperand &Use : VALU->explicit_uses()) {
934 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
935 return true;
936 }
937 }
938
939 return false;
940 };
941
942 int WaitStatesNeededForDef =
943 Shift16DefWaitstates -
944 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
945 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
946 }
947
948 if (ST.hasVDecCoExecHazard()) {
949 const int VALUWriteSGPRVALUReadWaitstates = 2;
950 const int VALUWriteEXECRWLane = 4;
951 const int VALUWriteVGPRReadlaneRead = 1;
952
953 const SIRegisterInfo *TRI = ST.getRegisterInfo();
954 const MachineRegisterInfo &MRI = MF.getRegInfo();
955 Register UseReg;
956 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
957 if (!SIInstrInfo::isVALU(MI))
958 return false;
959 return MI.modifiesRegister(UseReg, TRI);
960 };
961
962 for (const MachineOperand &Use : VALU->explicit_uses()) {
963 if (!Use.isReg())
964 continue;
965
966 UseReg = Use.getReg();
967 if (TRI->isSGPRReg(MRI, UseReg)) {
968 int WaitStatesNeededForDef =
969 VALUWriteSGPRVALUReadWaitstates -
970 getWaitStatesSince(IsVALUDefSGPRFn,
971 VALUWriteSGPRVALUReadWaitstates);
972 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
973 }
974 }
975
976 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
977 UseReg = AMDGPU::VCC;
978 int WaitStatesNeededForDef =
979 VALUWriteSGPRVALUReadWaitstates -
980 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
981 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
982 }
983
984 switch (VALU->getOpcode()) {
985 case AMDGPU::V_READLANE_B32:
986 case AMDGPU::V_READFIRSTLANE_B32: {
987 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
988 UseReg = Src->getReg();
989 int WaitStatesNeededForDef =
990 VALUWriteVGPRReadlaneRead -
991 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
992 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
993 }
994 [[fallthrough]];
995 case AMDGPU::V_WRITELANE_B32: {
996 UseReg = AMDGPU::EXEC;
997 int WaitStatesNeededForDef =
998 VALUWriteEXECRWLane -
999 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1000 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1001 break;
1002 }
1003 default:
1004 break;
1005 }
1006 }
1007
1008 // This checks for the hazard where VMEM instructions that store more than
1009 // 8 bytes can have their store data overwritten by the next instruction.
1010 if (!ST.has12DWordStoreHazard())
1011 return WaitStatesNeeded;
1012
1013 const MachineRegisterInfo &MRI = MF.getRegInfo();
1014
1015 for (const MachineOperand &Def : VALU->defs()) {
1016 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1017 }
1018
1019 return WaitStatesNeeded;
1020}
1021
1022int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1023 // This checks for hazards associated with inline asm statements.
1024 // Since inline asms can contain just about anything, we use this
1025 // to call/leverage other check*Hazard routines. Note that
1026 // this function doesn't attempt to address all possible inline asm
1027 // hazards (good luck), but is a collection of what has been
1028 // problematic thus far.
1029
1030 // see checkVALUHazards()
1031 if (!ST.has12DWordStoreHazard())
1032 return 0;
1033
1034 const MachineRegisterInfo &MRI = MF.getRegInfo();
1035 int WaitStatesNeeded = 0;
1036
1037 for (const MachineOperand &Op :
1038 llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
1039 if (Op.isReg() && Op.isDef()) {
1040 WaitStatesNeeded =
1041 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1042 }
1043 }
1044
1045 return WaitStatesNeeded;
1046}
1047
1048int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1049 const SIInstrInfo *TII = ST.getInstrInfo();
1050 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1051 const MachineRegisterInfo &MRI = MF.getRegInfo();
1052
1053 const MachineOperand *LaneSelectOp =
1054 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1055
1056 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1057 return 0;
1058
1059 Register LaneSelectReg = LaneSelectOp->getReg();
1060 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1061
1062 const int RWLaneWaitStates = 4;
1063 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1064 RWLaneWaitStates);
1065 return RWLaneWaitStates - WaitStatesSince;
1066}
1067
1068int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1069 if (!ST.hasRFEHazards())
1070 return 0;
1071
1072 const SIInstrInfo *TII = ST.getInstrInfo();
1073
1074 const int RFEWaitStates = 1;
1075
1076 auto IsHazardFn = [TII](const MachineInstr &MI) {
1077 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1078 };
1079 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1080 return RFEWaitStates - WaitStatesNeeded;
1081}
1082
1083int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1084 const SIInstrInfo *TII = ST.getInstrInfo();
1085 const int ReadM0WaitStates = 1;
1086 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1087 return ReadM0WaitStates -
1088 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1089}
1090
1091void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1092 fixVMEMtoScalarWriteHazards(MI);
1093 fixVcmpxPermlaneHazards(MI);
1094 fixSMEMtoVectorWriteHazards(MI);
1095 fixVcmpxExecWARHazard(MI);
1096 fixLdsBranchVmemWARHazard(MI);
1097 if (ST.hasLdsDirect()) {
1098 fixLdsDirectVALUHazard(MI);
1099 fixLdsDirectVMEMHazard(MI);
1100 }
1101 fixVALUPartialForwardingHazard(MI);
1102 fixVALUTransUseHazard(MI);
1103 fixWMMAHazards(MI);
1104 fixShift64HighRegBug(MI);
1105 fixVALUMaskWriteHazard(MI);
1106}
1107
1108bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1109 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1110 return false;
1111
1112 const SIInstrInfo *TII = ST.getInstrInfo();
1113 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1114 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1115 return (TII->isVOPC(MI) ||
1116 ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
1117 MI.modifiesRegister(AMDGPU::EXEC, TRI);
1118 };
1119
1120 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1121 unsigned Opc = MI.getOpcode();
1122 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1123 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1124 };
1125
1126 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1127 std::numeric_limits<int>::max())
1128 return false;
1129
1130 // V_NOP will be discarded by SQ.
1131 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1132 // which is always a VGPR and available.
1133 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1134 Register Reg = Src0->getReg();
1135 bool IsUndef = Src0->isUndef();
1136 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1137 TII->get(AMDGPU::V_MOV_B32_e32))
1138 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1139 .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1140
1141 return true;
1142}
1143
1144bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1145 if (!ST.hasVMEMtoScalarWriteHazard())
1146 return false;
1148
1149 if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1150 return false;
1151
1152 if (MI->getNumDefs() == 0)
1153 return false;
1154
1155 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1156
1157 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1158 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1159 !SIInstrInfo::isFLAT(I))
1160 return false;
1161
1162 for (const MachineOperand &Def : MI->defs()) {
1163 const MachineOperand *Op =
1164 I.findRegisterUseOperand(Def.getReg(), false, TRI);
1165 if (!Op)
1166 continue;
1167 return true;
1168 }
1169 return false;
1170 };
1171
1172 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1173 return SIInstrInfo::isVALU(MI) ||
1174 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1175 !MI.getOperand(0).getImm()) ||
1176 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1177 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1178 };
1179
1180 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1181 std::numeric_limits<int>::max())
1182 return false;
1183
1184 const SIInstrInfo *TII = ST.getInstrInfo();
1185 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1186 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1187 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1188 return true;
1189}
1190
1191bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1192 if (!ST.hasSMEMtoVectorWriteHazard())
1193 return false;
1195
1196 if (!SIInstrInfo::isVALU(*MI))
1197 return false;
1198
1199 unsigned SDSTName;
1200 switch (MI->getOpcode()) {
1201 case AMDGPU::V_READLANE_B32:
1202 case AMDGPU::V_READFIRSTLANE_B32:
1203 SDSTName = AMDGPU::OpName::vdst;
1204 break;
1205 default:
1206 SDSTName = AMDGPU::OpName::sdst;
1207 break;
1208 }
1209
1210 const SIInstrInfo *TII = ST.getInstrInfo();
1211 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1212 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1213 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1214 if (!SDST) {
1215 for (const auto &MO : MI->implicit_operands()) {
1216 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1217 SDST = &MO;
1218 break;
1219 }
1220 }
1221 }
1222
1223 if (!SDST)
1224 return false;
1225
1226 const Register SDSTReg = SDST->getReg();
1227 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1228 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1229 };
1230
1231 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1232 if (TII->isSALU(MI)) {
1233 switch (MI.getOpcode()) {
1234 case AMDGPU::S_SETVSKIP:
1235 case AMDGPU::S_VERSION:
1236 case AMDGPU::S_WAITCNT_VSCNT:
1237 case AMDGPU::S_WAITCNT_VMCNT:
1238 case AMDGPU::S_WAITCNT_EXPCNT:
1239 // These instructions cannot mitigate the hazard.
1240 return false;
1241 case AMDGPU::S_WAITCNT_LGKMCNT:
1242 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1243 return (MI.getOperand(1).getImm() == 0) &&
1244 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1245 case AMDGPU::S_WAITCNT: {
1246 const int64_t Imm = MI.getOperand(0).getImm();
1247 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1248 // DsCnt corresponds to LGKMCnt here.
1249 return (Decoded.DsCnt == 0);
1250 }
1251 default:
1252 // SOPP instructions cannot mitigate the hazard.
1253 if (TII->isSOPP(MI))
1254 return false;
1255 // At this point the SALU can be assumed to mitigate the hazard
1256 // because either:
1257 // (a) it is independent of the at risk SMEM (breaking chain),
1258 // or
1259 // (b) it is dependent on the SMEM, in which case an appropriate
1260 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1261 // SMEM instruction.
1262 return true;
1263 }
1264 }
1265 return false;
1266 };
1267
1268 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1269 std::numeric_limits<int>::max())
1270 return false;
1271
1272 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1273 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1274 .addImm(0);
1275 return true;
1276}
1277
1278bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1279 if (!ST.hasVcmpxExecWARHazard())
1280 return false;
1282
1283 if (!SIInstrInfo::isVALU(*MI))
1284 return false;
1285
1286 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1287 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1288 return false;
1289
1290 auto IsHazardFn = [TRI](const MachineInstr &I) {
1291 if (SIInstrInfo::isVALU(I))
1292 return false;
1293 return I.readsRegister(AMDGPU::EXEC, TRI);
1294 };
1295
1296 const SIInstrInfo *TII = ST.getInstrInfo();
1297 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1298 if (SIInstrInfo::isVALU(MI)) {
1299 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1300 return true;
1301 for (auto MO : MI.implicit_operands())
1302 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1303 return true;
1304 }
1305 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1306 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1307 return true;
1308 return false;
1309 };
1310
1311 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1312 std::numeric_limits<int>::max())
1313 return false;
1314
1315 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1316 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1317 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1318 return true;
1319}
1320
1321static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1322 const GCNSubtarget &ST) {
1323 if (!ST.hasLdsBranchVmemWARHazard())
1324 return false;
1325
1326 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1327 // instructions need to appear in the same function.
1328 bool HasLds = false;
1329 bool HasVmem = false;
1330 for (auto &MBB : MF) {
1331 for (auto &MI : MBB) {
1332 HasLds |= SIInstrInfo::isDS(MI);
1333 HasVmem |=
1334 SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1335 if (HasLds && HasVmem)
1336 return true;
1337 }
1338 }
1339 return false;
1340}
1341
1342static bool isStoreCountWaitZero(const MachineInstr &I) {
1343 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1344 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1345 !I.getOperand(1).getImm();
1346}
1347
1348bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1349 if (!RunLdsBranchVmemWARHazardFixup)
1350 return false;
1351
1354
1355 auto IsHazardInst = [](const MachineInstr &MI) {
1356 if (SIInstrInfo::isDS(MI))
1357 return 1;
1358 if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1359 return 2;
1360 return 0;
1361 };
1362
1363 auto InstType = IsHazardInst(*MI);
1364 if (!InstType)
1365 return false;
1366
1367 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1368 return IsHazardInst(I) || isStoreCountWaitZero(I);
1369 };
1370
1371 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1372 if (!I.isBranch())
1373 return false;
1374
1375 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1376 auto InstType2 = IsHazardInst(I);
1377 return InstType2 && InstType != InstType2;
1378 };
1379
1380 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1381 auto InstType2 = IsHazardInst(I);
1382 if (InstType == InstType2)
1383 return true;
1384
1385 return isStoreCountWaitZero(I);
1386 };
1387
1388 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1389 std::numeric_limits<int>::max();
1390 };
1391
1392 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1393 std::numeric_limits<int>::max())
1394 return false;
1395
1396 const SIInstrInfo *TII = ST.getInstrInfo();
1397 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1398 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1399 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1400 .addImm(0);
1401
1402 return true;
1403}
1404
1405bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1406 if (!SIInstrInfo::isLDSDIR(*MI))
1407 return false;
1408
1409 const int NoHazardWaitStates = 15;
1410 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1411 const Register VDSTReg = VDST->getReg();
1412
1413 bool VisitedTrans = false;
1414 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1415 if (!SIInstrInfo::isVALU(I))
1416 return false;
1417 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1418 // Cover both WAR and WAW
1419 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1420 };
1421 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1422 if (WaitStates >= NoHazardWaitStates)
1423 return true;
1424 // Instructions which cause va_vdst==0 expire hazard
1425 return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1426 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
1427 };
1428 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1429 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1430 };
1431
1432 DenseSet<const MachineBasicBlock *> Visited;
1433 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1434 std::next(MI->getReverseIterator()), 0,
1435 IsExpiredFn, Visited, GetWaitStatesFn);
1436
1437 // Transcendentals can execute in parallel to other VALUs.
1438 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1439 if (VisitedTrans)
1440 Count = 0;
1441
1442 MachineOperand *WaitVdstOp =
1443 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1444 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1445
1446 return true;
1447}
1448
1449bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1450 if (!SIInstrInfo::isLDSDIR(*MI))
1451 return false;
1452
1453 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1454 const Register VDSTReg = VDST->getReg();
1455
1456 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1457 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
1458 !SIInstrInfo::isDS(I))
1459 return false;
1460 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1461 };
1462 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1463 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1464 // according to the type of VMEM instruction.
1465 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1466 return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1467 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1468 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1469 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1470 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1471 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1472 };
1473
1474 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1475 std::numeric_limits<int>::max())
1476 return false;
1477
1478 if (LdsdirCanWait) {
1479 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1480 } else {
1481 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1482 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1483 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1484 }
1485
1486 return true;
1487}
1488
1489bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1490 if (!ST.hasVALUPartialForwardingHazard())
1491 return false;
1493
1494 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1495 return false;
1496
1497 SmallSetVector<Register, 4> SrcVGPRs;
1498
1499 for (const MachineOperand &Use : MI->explicit_uses()) {
1500 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1501 SrcVGPRs.insert(Use.getReg());
1502 }
1503
1504 // Only applies with >= 2 unique VGPR sources
1505 if (SrcVGPRs.size() <= 1)
1506 return false;
1507
1508 // Look for the following pattern:
1509 // Va <- VALU [PreExecPos]
1510 // intv1
1511 // Exec <- SALU [ExecPos]
1512 // intv2
1513 // Vb <- VALU [PostExecPos]
1514 // intv3
1515 // MI Va, Vb (WaitState = 0)
1516 //
1517 // Where:
1518 // intv1 + intv2 <= 2 VALUs
1519 // intv3 <= 4 VALUs
1520 //
1521 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1522
1523 const int Intv1plus2MaxVALUs = 2;
1524 const int Intv3MaxVALUs = 4;
1525 const int IntvMaxVALUs = 6;
1526 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1527
1528 struct StateType {
1529 SmallDenseMap<Register, int, 4> DefPos;
1530 int ExecPos = std::numeric_limits<int>::max();
1531 int VALUs = 0;
1532 };
1533
1534 StateType State;
1535
1536 // This overloads expiry testing with all the hazard detection
1537 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1538 // Too many VALU states have passed
1539 if (State.VALUs > NoHazardVALUWaitStates)
1540 return HazardExpired;
1541
1542 // Instructions which cause va_vdst==0 expire hazard
1543 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1544 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1545 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1546 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1547 return HazardExpired;
1548
1549 // Track registers writes
1550 bool Changed = false;
1551 if (SIInstrInfo::isVALU(I)) {
1552 for (Register Src : SrcVGPRs) {
1553 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1554 State.DefPos[Src] = State.VALUs;
1555 Changed = true;
1556 }
1557 }
1558 } else if (SIInstrInfo::isSALU(I)) {
1559 if (State.ExecPos == std::numeric_limits<int>::max()) {
1560 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1561 State.ExecPos = State.VALUs;
1562 Changed = true;
1563 }
1564 }
1565 }
1566
1567 // Early expiration: too many VALUs in intv3
1568 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1569 return HazardExpired;
1570
1571 // Only evaluate state if something changed
1572 if (!Changed)
1573 return NoHazardFound;
1574
1575 // Determine positions of VALUs pre/post exec change
1576 if (State.ExecPos == std::numeric_limits<int>::max())
1577 return NoHazardFound;
1578
1579 int PreExecPos = std::numeric_limits<int>::max();
1580 int PostExecPos = std::numeric_limits<int>::max();
1581
1582 for (auto Entry : State.DefPos) {
1583 int DefVALUs = Entry.second;
1584 if (DefVALUs != std::numeric_limits<int>::max()) {
1585 if (DefVALUs >= State.ExecPos)
1586 PreExecPos = std::min(PreExecPos, DefVALUs);
1587 else
1588 PostExecPos = std::min(PostExecPos, DefVALUs);
1589 }
1590 }
1591
1592 // Need a VALUs post exec change
1593 if (PostExecPos == std::numeric_limits<int>::max())
1594 return NoHazardFound;
1595
1596 // Too many VALUs in intv3?
1597 int Intv3VALUs = PostExecPos;
1598 if (Intv3VALUs > Intv3MaxVALUs)
1599 return HazardExpired;
1600
1601 // Too many VALUs in intv2?
1602 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1603 if (Intv2VALUs > Intv1plus2MaxVALUs)
1604 return HazardExpired;
1605
1606 // Need a VALUs pre exec change
1607 if (PreExecPos == std::numeric_limits<int>::max())
1608 return NoHazardFound;
1609
1610 // Too many VALUs in intv1?
1611 int Intv1VALUs = PreExecPos - State.ExecPos;
1612 if (Intv1VALUs > Intv1plus2MaxVALUs)
1613 return HazardExpired;
1614
1615 // Too many VALUs in intv1 + intv2
1616 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1617 return HazardExpired;
1618
1619 return HazardFound;
1620 };
1621 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1622 if (SIInstrInfo::isVALU(MI))
1623 State.VALUs += 1;
1624 };
1625
1626 DenseSet<const MachineBasicBlock *> Visited;
1627 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1628 std::next(MI->getReverseIterator()), Visited))
1629 return false;
1630
1631 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1632 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1633 .addImm(0x0fff);
1634
1635 return true;
1636}
1637
1638bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1639 if (!ST.hasVALUTransUseHazard())
1640 return false;
1642
1643 if (!SIInstrInfo::isVALU(*MI))
1644 return false;
1645
1646 SmallSet<Register, 4> SrcVGPRs;
1647
1648 for (const MachineOperand &Use : MI->explicit_uses()) {
1649 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1650 SrcVGPRs.insert(Use.getReg());
1651 }
1652
1653 // Look for the following pattern:
1654 // Va <- TRANS VALU
1655 // intv
1656 // MI Va (WaitState = 0)
1657 //
1658 // Where:
1659 // intv <= 5 VALUs / 1 TRANS
1660 //
1661 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1662
1663 const int IntvMaxVALUs = 5;
1664 const int IntvMaxTRANS = 1;
1665
1666 struct StateType {
1667 int VALUs = 0;
1668 int TRANS = 0;
1669 };
1670
1671 StateType State;
1672
1673 // This overloads expiry testing with all the hazard detection
1674 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1675 // Too many VALU states have passed
1676 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1677 return HazardExpired;
1678
1679 // Instructions which cause va_vdst==0 expire hazard
1680 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1681 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1682 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1683 I.getOperand(0).getImm() == 0x0fff))
1684 return HazardExpired;
1685
1686 // Track registers writes
1687 if (SIInstrInfo::isTRANS(I)) {
1688 for (Register Src : SrcVGPRs) {
1689 if (I.modifiesRegister(Src, &TRI)) {
1690 return HazardFound;
1691 }
1692 }
1693 }
1694
1695 return NoHazardFound;
1696 };
1697 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1698 if (SIInstrInfo::isVALU(MI))
1699 State.VALUs += 1;
1700 if (SIInstrInfo::isTRANS(MI))
1701 State.TRANS += 1;
1702 };
1703
1704 DenseSet<const MachineBasicBlock *> Visited;
1705 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1706 std::next(MI->getReverseIterator()), Visited))
1707 return false;
1708
1709 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1710 // avoided.
1711 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1712 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1713 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1714
1715 return true;
1716}
1717
1718bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1719 if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1720 return false;
1721
1722 const SIInstrInfo *TII = ST.getInstrInfo();
1723 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1724
1725 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1726 if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1727 return false;
1728
1729 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
1730 // with the dest(matrix D) of the previous wmma.
1731 const Register CurSrc0Reg =
1732 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1733 const Register CurSrc1Reg =
1734 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1735
1736 const Register PrevDstReg =
1737 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1738
1739 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1740 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1741 return true;
1742 }
1743
1744 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1745 // but Index can't overlap with PrevDstReg.
1746 if (AMDGPU::isGFX12Plus(ST)) {
1747 if (SIInstrInfo::isSWMMAC(*MI)) {
1748 const Register CurIndex =
1749 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1750 if (TRI->regsOverlap(PrevDstReg, CurIndex))
1751 return true;
1752 }
1753 return false;
1754 }
1755
1756 return false;
1757 };
1758
1759 auto IsExpiredFn = [](const MachineInstr &I, int) {
1760 return SIInstrInfo::isVALU(I);
1761 };
1762
1763 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1764 std::numeric_limits<int>::max())
1765 return false;
1766
1767 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1768
1769 return true;
1770}
1771
1772bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1773 if (!ST.hasShift64HighRegBug())
1774 return false;
1776
1777 switch (MI->getOpcode()) {
1778 default:
1779 return false;
1780 case AMDGPU::V_LSHLREV_B64_e64:
1781 case AMDGPU::V_LSHRREV_B64_e64:
1782 case AMDGPU::V_ASHRREV_I64_e64:
1783 break;
1784 }
1785
1786 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1787 if (!Amt->isReg())
1788 return false;
1789
1790 Register AmtReg = Amt->getReg();
1791 const MachineRegisterInfo &MRI = MF.getRegInfo();
1792 // Check if this is a last VGPR in the allocation block.
1793 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1794 return false;
1795
1796 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1797 return false;
1798
1799 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1800 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1801 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1802 bool Overlapped = OverlappedSrc || OverlappedDst;
1803
1804 assert(!OverlappedDst || !OverlappedSrc ||
1805 Src1->getReg() == MI->getOperand(0).getReg());
1807 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1808
1809 Register NewReg;
1810 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1811 : AMDGPU::VGPR_32RegClass) {
1812 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1813 NewReg = Reg;
1814 break;
1815 }
1816 }
1817
1818 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1819 : NewReg;
1820 Register NewAmtLo;
1821
1822 if (Overlapped)
1823 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1824
1825 DebugLoc DL = MI->getDebugLoc();
1826 MachineBasicBlock *MBB = MI->getParent();
1827 // Insert a full wait count because found register might be pending a wait.
1828 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1829 .addImm(0);
1830
1831 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1832 if (Overlapped)
1833 runOnInstruction(
1834 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1835 .addDef(AmtReg - 1)
1836 .addReg(AmtReg - 1, RegState::Undef)
1837 .addReg(NewAmtLo, RegState::Undef));
1838 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1839 .addDef(AmtReg)
1840 .addReg(AmtReg, RegState::Undef)
1841 .addReg(NewAmt, RegState::Undef));
1842
1843 // Instructions emitted after the current instruction will be processed by the
1844 // parent loop of the hazard recognizer in a natural way.
1845 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1846 AmtReg)
1847 .addDef(NewAmt)
1848 .addReg(NewAmt)
1849 .addReg(AmtReg);
1850 if (Overlapped)
1851 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1852 AmtReg - 1)
1853 .addDef(NewAmtLo)
1854 .addReg(NewAmtLo)
1855 .addReg(AmtReg - 1);
1856
1857 // Re-running the hazard recognizer on the modified instruction is not needed:
1858 // the inserted V_SWAP_B32s have already both read and written the new
1859 // registers, so hazards related to these registers have already been handled.
1860 Amt->setReg(NewAmt);
1861 Amt->setIsKill(false);
1862 // We do not update liveness, so verifier may see it as undef.
1863 Amt->setIsUndef();
1864 if (OverlappedDst)
1865 MI->getOperand(0).setReg(NewReg);
1866 if (OverlappedSrc) {
1867 Src1->setReg(NewReg);
1868 Src1->setIsKill(false);
1869 Src1->setIsUndef();
1870 }
1871
1872 return true;
1873}
1874
1875int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1876 int NSAtoVMEMWaitStates = 1;
1877
1878 if (!ST.hasNSAtoVMEMBug())
1879 return 0;
1880
1882 return 0;
1883
1884 const SIInstrInfo *TII = ST.getInstrInfo();
1885 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1886 if (!Offset || (Offset->getImm() & 6) == 0)
1887 return 0;
1888
1889 auto IsHazardFn = [TII](const MachineInstr &I) {
1890 if (!SIInstrInfo::isMIMG(I))
1891 return false;
1892 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1893 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1894 TII->getInstSizeInBytes(I) >= 16;
1895 };
1896
1897 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1898}
1899
1900int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1901 int FPAtomicToDenormModeWaitStates = 3;
1902
1903 if (!ST.hasFPAtomicToDenormModeHazard())
1904 return 0;
1906
1907 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1908 return 0;
1909
1910 auto IsHazardFn = [](const MachineInstr &I) {
1911 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
1912 return false;
1913 return SIInstrInfo::isFPAtomic(I);
1914 };
1915
1916 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1917 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1918 return true;
1919
1920 switch (MI.getOpcode()) {
1921 case AMDGPU::S_WAITCNT:
1922 case AMDGPU::S_WAITCNT_VSCNT:
1923 case AMDGPU::S_WAITCNT_VMCNT:
1924 case AMDGPU::S_WAITCNT_EXPCNT:
1925 case AMDGPU::S_WAITCNT_LGKMCNT:
1926 case AMDGPU::S_WAIT_IDLE:
1927 return true;
1928 default:
1929 break;
1930 }
1931
1932 return false;
1933 };
1934
1935 return FPAtomicToDenormModeWaitStates -
1936 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1937}
1938
1939int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1940 assert(SIInstrInfo::isMFMA(*MI));
1941
1942 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1943}
1944
1945int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
1946 // Early exit if no padding is requested.
1947 if (MFMAPaddingRatio == 0)
1948 return 0;
1949
1950 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1951 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
1952 return 0;
1953
1954 int NeighborMFMALatency = 0;
1955 auto IsNeighboringMFMA = [&NeighborMFMALatency,
1956 this](const MachineInstr &MI) {
1957 if (!SIInstrInfo::isMFMA(MI))
1958 return false;
1959
1960 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
1961 return true;
1962 };
1963
1964 const int MaxMFMAPipelineWaitStates = 16;
1965 int WaitStatesSinceNeighborMFMA =
1966 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
1967
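// Pad up to MFMAPaddingRatio percent of the neighboring MFMA's latency, minus
// the wait states that have already elapsed since it was issued.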
1968 int NeighborMFMAPaddingNeeded =
1969 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
1970 WaitStatesSinceNeighborMFMA;
1971
1972 return std::max(0, NeighborMFMAPaddingNeeded);
1973}
1974
1975int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1976 int WaitStatesNeeded = 0;
1977 unsigned Opc = MI->getOpcode();
1978
1979 auto IsVALUFn = [](const MachineInstr &MI) {
1980 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
1981 };
1982
1983 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1984 const int LegacyVALUWritesVGPRWaitStates = 2;
1985 const int VALUWritesExecWaitStates = 4;
1986 const int MaxWaitStates = 4;
1987
1988 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1989 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1990 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1991
1992 if (WaitStatesNeeded < MaxWaitStates) {
1993 for (const MachineOperand &Use : MI->explicit_uses()) {
1994 const int MaxWaitStates = 2;
1995
1996 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1997 continue;
1998
1999 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2000 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2001 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2002
2003 if (WaitStatesNeeded == MaxWaitStates)
2004 break;
2005 }
2006 }
2007 }
2008
2009 for (const MachineOperand &Op : MI->explicit_operands()) {
2010 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2011 continue;
2012
2013 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2014 continue;
2015
2016 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2017 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2018 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2019 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2020 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2021 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2022 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2023 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2024 const int MaxWaitStates = 18;
2025 Register Reg = Op.getReg();
2026 unsigned HazardDefLatency = 0;
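// The producer's latency identifies the MFMA shape in the switches below:
// 2 cycles for 4x4, 8 for 16x16 and 16 for 32x32 variants.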
2027
2028 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2029 this](const MachineInstr &MI) {
2030 if (!SIInstrInfo::isMFMA(MI))
2031 return false;
2032 Register DstReg = MI.getOperand(0).getReg();
2033 if (DstReg == Reg)
2034 return false;
2035 HazardDefLatency =
2036 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2037 return TRI.regsOverlap(DstReg, Reg);
2038 };
2039
2040 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2041 MaxWaitStates);
2042 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2043 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2044 int OpNo = Op.getOperandNo();
2045 if (OpNo == SrcCIdx) {
2046 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2047 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2048 switch (HazardDefLatency) {
2049 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2050 break;
2051 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2052 break;
2053 case 16: [[fallthrough]];
2054 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2055 break;
2056 }
2057 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2058 switch (HazardDefLatency) {
2059 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2060 break;
2061 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2062 break;
2063 case 16: [[fallthrough]];
2064 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2065 break;
2066 }
2067 }
2068
2069 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2070 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2071
2072 if (WaitStatesNeeded == MaxWaitStates)
2073 return WaitStatesNeeded; // Early exit.
2074
2075 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2076 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2077 return false;
2078 Register DstReg = MI.getOperand(0).getReg();
2079 return TRI.regsOverlap(Reg, DstReg);
2080 };
2081
2082 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2083 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2084 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2085 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2086 if (OpNo == SrcCIdx)
2087 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2088 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2089 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2090
2091 WaitStatesNeededForUse = NeedWaitStates -
2092 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2093 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2094
2095 if (WaitStatesNeeded == MaxWaitStates)
2096 return WaitStatesNeeded; // Early exit.
2097 }
2098
2099 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2100 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2101 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2102 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2103 const int MaxWaitStates = 13;
2104 Register DstReg = MI->getOperand(0).getReg();
2105 unsigned HazardDefLatency = 0;
2106
2107 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2108 this](const MachineInstr &MI) {
2109 if (!SIInstrInfo::isMFMA(MI))
2110 return false;
2111 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2112 HazardDefLatency =
2113 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2114 return TRI.regsOverlap(Reg, DstReg);
2115 };
2116
2117 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2118 int NeedWaitStates;
2119 switch (HazardDefLatency) {
2120 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2121 break;
2122 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2123 break;
2124 case 16: [[fallthrough]];
2125 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2126 break;
2127 }
2128
2129 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2130 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2131 }
2132
2133 // Pad neighboring MFMA with noops for better inter-wave performance.
2134 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2135
2136 return WaitStatesNeeded;
2137}
2138
2139static int
2140GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2141 // 2 pass -> 3
2142 // 4 pass -> 5
2143 // 8 pass -> 9
2144 // 16 pass -> 17
2145 return NumPasses + 1;
2146}
2147
2148static int
2149GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2150 // 2 pass -> 2
2151 // 4 pass -> 4
2152 // 8 pass -> 8
2153 // 16 pass -> 16
2154 return NumPasses;
2155}
2156
2157static int
2158GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2159 // 2 pass -> 4
2160 // 4 pass -> 6
2161 // 8 pass -> 10
2162 // 16 pass -> 18
2163 return NumPasses + 2;
2164}
2165
2166static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2167 // 2 pass -> 5
2168 // 4 pass -> 7
2169 // 8 pass -> 11
2170 // 16 pass -> 19
2171 return NumPasses + 3;
2172}
2173
2174int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2175 int WaitStatesNeeded = 0;
2176 unsigned Opc = MI->getOpcode();
2177
2178 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2179 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2180 };
2181
2182 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2183 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2184 !SIInstrInfo::isDOT(MI);
2185 };
2186
2187 if (!SIInstrInfo::isMFMA(*MI))
2188 return WaitStatesNeeded;
2189
2190 const int VALUWritesExecWaitStates = 4;
2191 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2192 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2193 VALUWritesExecWaitStates);
2194 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2195
2196 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2197
2198 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2199 for (const MachineOperand &Use : MI->explicit_uses()) {
2200 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2201 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2202 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2203 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2204 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2205 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2206 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2207 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2208 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2209 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2210 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2211 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2212 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2213 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2214 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2215 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2216 const int MaxWaitStates = 19;
2217
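// The required distance depends on the kind of MFMA that last wrote this
// register and on whether the use is src2 (the accumulator) or src0/src1.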
2218 if (!Use.isReg())
2219 continue;
2220 Register Reg = Use.getReg();
2221 bool FullReg;
2222 const MachineInstr *MI1;
2223
2224 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2225 this](const MachineInstr &MI) {
2226 if (!SIInstrInfo::isMFMA(MI))
2227 return false;
2228 Register DstReg = MI.getOperand(0).getReg();
2229 FullReg = (DstReg == Reg);
2230 MI1 = &MI;
2231 return TRI.regsOverlap(DstReg, Reg);
2232 };
2233
2234 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2235 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2236 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2237
2238 int NumWaitStates =
2239 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2240 if (NumWaitStates == std::numeric_limits<int>::max())
2241 continue;
2242
2243 int OpNo = Use.getOperandNo();
2244 unsigned Opc1 = MI1->getOpcode();
2245 int NeedWaitStates = 0;
2246 if (OpNo == SrcCIdx) {
2247 if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2248 NeedWaitStates = 0;
2249 } else if (FullReg) {
2250 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2251 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2252 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2253 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2254 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2255 else if (ST.hasGFX940Insts() &&
2256 TSchedModel.computeInstrLatency(MI1) == 2)
2257 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2258 } else {
2259 switch (Opc1) {
2260 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2261 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2262 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2263 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2264 if (!isXDL(ST, *MI))
2265 NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2266 break;
2267 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2268 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2269 if (!isXDL(ST, *MI))
2270 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2271 break;
2272 default:
2273 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2274 if (ST.hasGFX940Insts()) {
2275 if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
2276 break;
2277
2278 NeedWaitStates =
2279 isXDL(ST, *MI1)
2280 ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2281 NumPasses)
2282 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2283 NumPasses);
2284 break;
2285 }
2286
2287 switch (NumPasses) {
2288 case 2:
2289 NeedWaitStates =
2290 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2291 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2292 break;
2293 case 8:
2294 NeedWaitStates =
2295 isDGEMM(Opc)
2296 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2297 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2298 break;
2299 case 16:
2300 NeedWaitStates =
2301 isDGEMM(Opc)
2302 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2303 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2304 break;
2305 default:
2306 llvm_unreachable("unexpected number of passes");
2307 }
2308 }
2309 }
2310 } else {
2311 switch (Opc1) {
2312 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2313 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2314 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2315 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2316 NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2317 break;
2318 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2319 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2320 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2321 break;
2322 default:
2323 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2324
2325 if (ST.hasGFX940Insts()) {
2326 NeedWaitStates =
2327 isXDL(ST, *MI1)
2328 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2329 NumPasses)
2330 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2331 NumPasses);
2332 break;
2333 }
2334
2335 switch (NumPasses) {
2336 case 2:
2337 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2338 break;
2339 case 4:
2340 llvm_unreachable("unexpected number of passes for mfma");
2341 case 8:
2342 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2343 break;
2344 case 16: [[fallthrough]];
2345 default:
2346 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2347 }
2348 }
2349 }
2350 if (WaitStatesNeeded >= NeedWaitStates)
2351 continue;
2352
2353 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2354 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2355
2356 if (WaitStatesNeeded == MaxWaitStates)
2357 break;
2358 }
2359
2360 return WaitStatesNeeded;
2361}
2362
2363int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2364 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2365 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2366 return 0;
2367
2368 int WaitStatesNeeded = 0;
2369
2370 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2371 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2372 };
2373
2374 for (const MachineOperand &Op : MI->explicit_uses()) {
2375 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2376 continue;
2377
2378 Register Reg = Op.getReg();
2379
2380 const int AccVgprReadLdStWaitStates = 2;
2381 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2382 const int MaxWaitStates = 2;
2383
2384 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2385 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2386 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2387
2388 if (WaitStatesNeeded == MaxWaitStates)
2389 return WaitStatesNeeded; // Early exit.
2390
2391 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2392 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2393 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2394 return false;
2395 auto IsVALUFn = [](const MachineInstr &MI) {
2396 return SIInstrInfo::isVALU(MI);
2397 };
2398 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2399 std::numeric_limits<int>::max();
2400 };
2401
2402 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2403 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2404 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2405 }
2406
2407 return WaitStatesNeeded;
2408}
2409
2411 // 2 pass -> 4
2412 // 4 pass -> 6
2413 // 8 pass -> 10
2414 // 16 pass -> 18
2415 return NumPasses + 2;
2416}
2417
2419 // 2 pass -> 5
2420 // 4 pass -> 7
2421 // 8 pass -> 11
2422 // 16 pass -> 19
2423 return NumPasses + 3;
2424}
2425
2427 // 2 pass -> 5
2428 // 4 pass -> 7
2429 // 8 pass -> 11
2430 // 16 pass -> 19
2431 return NumPasses + 3;
2432}
2433
2435 // 2 pass -> 4
2436 // 4 pass -> 6
2437 // 8 pass -> 10
2438 // 16 pass -> 18
2439 return NumPasses + 2;
2440}
2441
2442int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2443 if (!ST.hasGFX90AInsts())
2444 return 0;
2445
2446 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2447 return isDGEMM(MI.getOpcode());
2448 };
2449
2450 // This is checked in checkMAIHazards90A()
2451 if (SIInstrInfo::isMFMA(*MI))
2452 return 0;
2453
2454 const MachineRegisterInfo &MRI = MF.getRegInfo();
2455
2456 int WaitStatesNeeded = 0;
2457
2458 bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2459 SIInstrInfo::isFLAT(*MI) ||
2460 SIInstrInfo::isDS(*MI);
2461 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2462 bool IsVALU = SIInstrInfo::isVALU(*MI);
2463
2464 const MachineInstr *MFMA = nullptr;
2465 unsigned Reg;
2466 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2467 if (!SIInstrInfo::isMFMA(MI) ||
2468 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2469 return false;
2470 MFMA = &MI;
2471 return true;
2472 };
2473
2474 const MachineInstr *DOT = nullptr;
2475 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2476 if (!SIInstrInfo::isDOT(MI) ||
2477 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2478 return false;
2479 DOT = &MI;
2480 return true;
2481 };
2482
2483 bool DGEMMAfterVALUWrite = false;
2484 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2485 // Found DGEMM on reverse traversal to def.
2486 if (isDGEMM(MI.getOpcode()))
2487 DGEMMAfterVALUWrite = true;
2488
2489 // Only hazard if the register is defined by a VALU and a DGEMM is found
2490 // after the def.
2491 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2492 return false;
2493
2494 return true;
2495 };
2496
2497 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2498 AMDGPU::OpName::src2);
2499
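// First handle read-after-write hazards on this instruction's uses; hazards
// on its definitions (WAW/WAR against MFMA) are handled in the loop below.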
2500 if (IsMemOrExport || IsVALU) {
2501 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2502 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2503 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2504 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2505 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2506 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2507 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2508 const int DotWriteSameDotReadSrcAB = 3;
2509 const int DotWriteDifferentVALURead = 3;
2510 const int DMFMABetweenVALUWriteVMEMRead = 2;
2511 const int MaxWaitStates = 19;
2512
2513 for (const MachineOperand &Use : MI->explicit_uses()) {
2514 if (!Use.isReg())
2515 continue;
2516 Reg = Use.getReg();
2517
2518 DOT = nullptr;
2519 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2520 MaxWaitStates);
2521 if (DOT) {
2522 int NeedWaitStates = 0;
2523 if (DOT->getOpcode() == MI->getOpcode()) {
2524 if (&Use - &MI->getOperand(0) != SrcCIdx)
2525 NeedWaitStates = DotWriteSameDotReadSrcAB;
2526 } else {
2527 NeedWaitStates = DotWriteDifferentVALURead;
2528 }
2529
2530 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2531 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2532 }
2533
2534 // Workaround for a HW data hazard bug observed only on GFX90A. When there
2535 // is a DGEMM instruction in-between a VALU and a VMEM instruction, the SQ
2536 // incorrectly omits the two wait states between the two instructions that
2537 // are needed to avoid the data hazard.
2538 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2539 DGEMMAfterVALUWrite = false;
2540 if (TRI.isVectorRegister(MRI, Reg)) {
2541 int WaitStatesNeededForUse =
2542 DMFMABetweenVALUWriteVMEMRead -
2543 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2544 DMFMABetweenVALUWriteVMEMRead);
2545
2546 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2547 }
2548 }
2549
2550 MFMA = nullptr;
2551 WaitStatesSinceDef =
2552 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2553 if (!MFMA)
2554 continue;
2555
2556 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2557 int NumPasses = HazardDefLatency;
2558 int NeedWaitStates = MaxWaitStates;
2559
2560 if (isDGEMM(MFMA->getOpcode())) {
2561 switch (HazardDefLatency) {
2562 case 4:
2563 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2564 : DMFMA4x4WriteVgprVALUReadWaitStates;
2565 break;
2566 case 8:
2567 case 16:
2568 NeedWaitStates = IsMemOrExport
2569 ? DMFMA16x16WriteVgprMemExpReadWaitStates
2570 : DMFMA16x16WriteVgprVALUReadWaitStates;
2571 break;
2572 default:
2573 llvm_unreachable("unexpected dgemm");
2574 }
2575 } else if (ST.hasGFX940Insts()) {
2576 NeedWaitStates =
2577 isXDL(ST, *MFMA)
2578 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2579 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2580 NumPasses);
2581 } else {
2582 switch (HazardDefLatency) {
2583 case 2:
2584 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2585 break;
2586 case 8:
2587 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2588 break;
2589 case 16:
2590 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2591 break;
2592 default:
2593 llvm_unreachable("unexpected number of passes for mfma");
2594 }
2595 }
2596
2597 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2598 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2599
2600 if (WaitStatesNeeded == MaxWaitStates)
2601 break;
2602 }
2603 }
2604
2605 unsigned Opc = MI->getOpcode();
2606 const int DMFMAToFMA64WaitStates = 2;
2607 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2608 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2609 Opc == AMDGPU::V_FMAC_F64_dpp) &&
2610 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2611 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2612 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2613 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2614 }
2615
2616 if (!IsVALU && !IsMemOrExport)
2617 return WaitStatesNeeded;
2618
2619 for (const MachineOperand &Def : MI->defs()) {
2620 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2621 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2622 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2623 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2624 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2625 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2626 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2627 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2628 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2629 const int DotWriteDifferentVALUWrite = 3;
2630 const int MaxWaitStates = 19;
2631 const int MaxWarWaitStates = 15;
2632
2633 Reg = Def.getReg();
2634
2635 DOT = nullptr;
2636 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2637 MaxWaitStates);
2638 if (DOT && DOT->getOpcode() != MI->getOpcode())
2639 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2640 WaitStatesSinceDef);
2641
2642 MFMA = nullptr;
2643 WaitStatesSinceDef =
2644 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2645 if (MFMA) {
2646 int NeedWaitStates = MaxWaitStates;
2647 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2648
2649 if (isDGEMM(MFMA->getOpcode())) {
2650 switch (NumPasses) {
2651 case 4:
2652 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2653 break;
2654 case 8:
2655 case 16:
2656 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2657 break;
2658 default:
2659 llvm_unreachable("unexpected number of cycles for dgemm");
2660 }
2661 } else if (ST.hasGFX940Insts()) {
2662 NeedWaitStates =
2663 isXDL(ST, *MFMA)
2664 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2665 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2666 } else {
2667 switch (NumPasses) {
2668 case 2:
2669 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2670 break;
2671 case 8:
2672 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2673 break;
2674 case 16:
2675 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2676 break;
2677 default:
2678 llvm_unreachable("Unexpected number of passes for mfma");
2679 }
2680 }
2681
2682 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2683 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2684
2685 if (WaitStatesNeeded == MaxWaitStates)
2686 break;
2687 }
2688
2689 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2690 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2691 !MI.readsRegister(Reg, &TRI))
2692 return false;
2693
2694 if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2695 return false;
2696
2697 const MachineOperand *SrcC =
2698 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2699 assert(SrcC);
2700 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2701 return false;
2702
2703 MFMA = &MI;
2704 return true;
2705 };
2706
2707 MFMA = nullptr;
2708 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2709 MaxWarWaitStates);
2710 if (!MFMA)
2711 continue;
2712
2713 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2714 int NeedWaitStates = MaxWaitStates;
2715 switch (HazardDefLatency) {
2716 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2717 break;
2718 case 4: assert(ST.hasGFX940Insts());
2719 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2720 break;
2721 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2722 break;
2723 case 16: [[fallthrough]];
2724 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2725 break;
2726 }
2727
2728 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2729 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2730 }
2731
2732 return WaitStatesNeeded;
2733}
2734
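// If this unit is an MFMA, prefer scheduling something else while a previous
// MFMA is still in the pipeline (fewer wait states elapsed than its latency).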
2735bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2736 if (!SU->isInstr())
2737 return false;
2738
2739 const MachineInstr *MAI = nullptr;
2740
2741 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2742 MAI = nullptr;
2743 if (SIInstrInfo::isMFMA(MI))
2744 MAI = &MI;
2745 return MAI != nullptr;
2746 };
2747
2748 MachineInstr *MI = SU->getInstr();
2749 if (IsMFMAFn(*MI)) {
2750 int W = getWaitStatesSince(IsMFMAFn, 16);
2751 if (MAI)
2752 return W < (int)TSchedModel.computeInstrLatency(MAI);
2753 }
2754
2755 return false;
2756}
2757
2758bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2759 if (!ST.hasVALUMaskWriteHazard())
2760 return false;
2762
2763 if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2764 return false;
2765
2766 // The hazard sequence is three instructions:
2767 // 1. VALU reads SGPR as mask
2768 // 2. SALU writes SGPR
2769 // 3. SALU reads SGPR
2770 // The hazard can expire if the distance between 2 and 3 is sufficient.
2771 // In practice this happens in less than 10% of cases, so to avoid searching
2772 // we always assume the hazard exists whenever 1 and 2 are present.
2773
2774 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2775 if (!SDSTOp || !SDSTOp->isReg())
2776 return false;
2777
2778 const Register HazardReg = SDSTOp->getReg();
2779 if (HazardReg == AMDGPU::EXEC ||
2780 HazardReg == AMDGPU::EXEC_LO ||
2781 HazardReg == AMDGPU::EXEC_HI ||
2782 HazardReg == AMDGPU::M0)
2783 return false;
2784
2785 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2786 switch (I.getOpcode()) {
2787 case AMDGPU::V_ADDC_U32_e32:
2788 case AMDGPU::V_ADDC_U32_dpp:
2789 case AMDGPU::V_CNDMASK_B16_e32:
2790 case AMDGPU::V_CNDMASK_B16_dpp:
2791 case AMDGPU::V_CNDMASK_B32_e32:
2792 case AMDGPU::V_CNDMASK_B32_dpp:
2793 case AMDGPU::V_DIV_FMAS_F32_e64:
2794 case AMDGPU::V_DIV_FMAS_F64_e64:
2795 case AMDGPU::V_SUBB_U32_e32:
2796 case AMDGPU::V_SUBB_U32_dpp:
2797 case AMDGPU::V_SUBBREV_U32_e32:
2798 case AMDGPU::V_SUBBREV_U32_dpp:
2799 // These implicitly read VCC as mask source.
2800 return HazardReg == AMDGPU::VCC ||
2801 HazardReg == AMDGPU::VCC_LO ||
2802 HazardReg == AMDGPU::VCC_HI;
2803 case AMDGPU::V_ADDC_U32_e64:
2804 case AMDGPU::V_ADDC_U32_e64_dpp:
2805 case AMDGPU::V_CNDMASK_B16_e64:
2806 case AMDGPU::V_CNDMASK_B16_e64_dpp:
2807 case AMDGPU::V_CNDMASK_B32_e64:
2808 case AMDGPU::V_CNDMASK_B32_e64_dpp:
2809 case AMDGPU::V_SUBB_U32_e64:
2810 case AMDGPU::V_SUBB_U32_e64_dpp:
2811 case AMDGPU::V_SUBBREV_U32_e64:
2812 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2813 // Only check mask register overlaps.
2814 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2815 assert(SSRCOp);
2816 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2817 }
2818 default:
2819 return false;
2820 }
2821 };
2822
2823 const MachineRegisterInfo &MRI = MF.getRegInfo();
2824 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2825 // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2826 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2827 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2828 return true;
2829
2830 // VALU access to any SGPR or literal constant other than HazardReg
2831 // mitigates hazard. No need to check HazardReg here as this will
2832 // only be called when !IsHazardFn.
2833 if (!SIInstrInfo::isVALU(I))
2834 return false;
2835 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2836 const MachineOperand &Op = I.getOperand(OpNo);
2837 if (Op.isReg()) {
2838 Register OpReg = Op.getReg();
2839 // Only consider uses
2840 if (!Op.isUse())
2841 continue;
2842 // Ignore EXEC
2843 if (OpReg == AMDGPU::EXEC ||
2844 OpReg == AMDGPU::EXEC_LO ||
2845 OpReg == AMDGPU::EXEC_HI)
2846 continue;
2847 // Ignore all implicit uses except VCC
2848 if (Op.isImplicit()) {
2849 if (OpReg == AMDGPU::VCC ||
2850 OpReg == AMDGPU::VCC_LO ||
2851 OpReg == AMDGPU::VCC_HI)
2852 return true;
2853 continue;
2854 }
2855 if (TRI.isSGPRReg(MRI, OpReg))
2856 return true;
2857 } else {
2858 const MCInstrDesc &InstDesc = I.getDesc();
2859 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2860 if (!TII.isInlineConstant(Op, OpInfo))
2861 return true;
2862 }
2863 }
2864 return false;
2865 };
2866
2867 // Check for hazard
2868 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2869 std::numeric_limits<int>::max())
2870 return false;
2871
2872 auto NextMI = std::next(MI->getIterator());
2873
2874 // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
2875 BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
2876 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
2877 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2878
2879 // SALU write may be s_getpc in a bundle.
2880 if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2881 // Update offsets of any references in the bundle.
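// The inserted s_waitcnt_depctr is 4 bytes, so PC-relative offsets computed
// from the s_getpc result inside the bundle must be advanced by 4.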
2882 while (NextMI != MI->getParent()->end() &&
2883 NextMI->isBundledWithPred()) {
2884 for (auto &Operand : NextMI->operands()) {
2885 if (Operand.isGlobal())
2886 Operand.setOffset(Operand.getOffset() + 4);
2887 }
2888 NextMI++;
2889 }
2890 }
2891
2892 return true;
2893}