1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
21
22using namespace llvm;
23
24namespace {
25
26struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
27 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
28
29 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
30 if (Arg.getAsInteger(0, Value))
31 return O.error("'" + Arg + "' value invalid for uint argument!");
32
33 if (Value > 100)
34 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
35
36 return false;
37 }
38};
39
40} // end anonymous namespace
41
42static cl::opt<unsigned, false, MFMAPaddingRatioParser>
43 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
44 cl::desc("Fill a percentage of the latency between "
45 "neighboring MFMA with s_nops."));
46
47// This is intended for debugging purposes only.
48static cl::opt<unsigned>
49 NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
50 cl::desc("Insert a s_nop x before every instruction"));
51
52//===----------------------------------------------------------------------===//
53// Hazard Recognizer Implementation
54//===----------------------------------------------------------------------===//
55
56static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
57 const GCNSubtarget &ST);
58
59GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
60 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
61 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
62 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
63 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
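  // Functions that use AGPRs may contain MFMA instructions, whose hazards
  // require a much deeper lookahead window than the default case.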
64 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
65 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
66}
67
68void GCNHazardRecognizer::Reset() {
69 EmittedInstrs.clear();
70}
71
72void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
73 EmitInstruction(SU->getInstr());
74}
75
76void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
77 CurrCycleInstr = MI;
78}
79
80static bool isDivFMas(unsigned Opcode) {
81 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
82}
83
84static bool isSGetReg(unsigned Opcode) {
85 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
86}
87
88static bool isSSetReg(unsigned Opcode) {
89 switch (Opcode) {
90 case AMDGPU::S_SETREG_B32:
91 case AMDGPU::S_SETREG_B32_mode:
92 case AMDGPU::S_SETREG_IMM32_B32:
93 case AMDGPU::S_SETREG_IMM32_B32_mode:
94 return true;
95 }
96 return false;
97}
98
99static bool isRWLane(unsigned Opcode) {
100 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
101}
102
103static bool isRFE(unsigned Opcode) {
104 return Opcode == AMDGPU::S_RFE_B64;
105}
106
107static bool isSMovRel(unsigned Opcode) {
108 switch (Opcode) {
109 case AMDGPU::S_MOVRELS_B32:
110 case AMDGPU::S_MOVRELS_B64:
111 case AMDGPU::S_MOVRELD_B32:
112 case AMDGPU::S_MOVRELD_B64:
113 return true;
114 default:
115 return false;
116 }
117}
118
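// Returns true for S_SENDMSG/S_TTRACEDATA and GDS operations, all of which
// read M0.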
119static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
120 const MachineInstr &MI) {
121 if (TII.isAlwaysGDS(MI.getOpcode()))
122 return true;
123
124 switch (MI.getOpcode()) {
125 case AMDGPU::S_SENDMSG:
126 case AMDGPU::S_SENDMSGHALT:
127 case AMDGPU::S_TTRACEDATA:
128 return true;
129 // These DS opcodes don't support GDS.
130 case AMDGPU::DS_NOP:
131 case AMDGPU::DS_PERMUTE_B32:
132 case AMDGPU::DS_BPERMUTE_B32:
133 return false;
134 default:
135 if (TII.isDS(MI.getOpcode())) {
136 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
137 AMDGPU::OpName::gds);
138 if (MI.getOperand(GDS).getImm())
139 return true;
140 }
141 return false;
142 }
143}
144
145static bool isPermlane(const MachineInstr &MI) {
146 unsigned Opcode = MI.getOpcode();
147 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
148 Opcode == AMDGPU::V_PERMLANE64_B32 ||
149 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
150 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
151 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
152 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
153 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
154 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
155 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
156 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
157 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
158 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
159 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
160 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
161}
162
163static bool isLdsDma(const MachineInstr &MI) {
164 return SIInstrInfo::isVALU(MI) &&
165 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isDS(MI));
166}
167
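// Decode the hardware register ID addressed by the simm16 operand of an
// S_SETREG/S_GETREG instruction.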
168static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
169 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
170 AMDGPU::OpName::simm16);
171 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
172}
173
174ScheduleHazardRecognizer::HazardType
175GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
176 MachineInstr *MI = SU->getInstr();
177 // If we are not in "HazardRecognizerMode" and therefore not being run from
178 // the scheduler, track possible stalls from hazards but don't insert noops.
179 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
180
181 if (MI->isBundle())
182 return NoHazard;
183
184 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
185 return HazardType;
186
187 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
188 return HazardType;
189
190 if (checkFPAtomicToDenormModeHazard(MI) > 0)
191 return HazardType;
192
193 // Hazards which cannot be mitigated with S_NOPs.
194 if (!IsHazardRecognizerMode) {
195 if (checkWMMACoexecutionHazards(MI) > 0)
196 return Hazard;
197 }
198
199 if (ST.hasNoDataDepHazard())
200 return NoHazard;
201
202 if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
203 return HazardType;
204
205 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
206 return HazardType;
207
208 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
209 return HazardType;
210
211 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
212 return HazardType;
213
214 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
215 return HazardType;
216
219 checkMAIVALUHazards(MI) > 0)
220 return HazardType;
221
222 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
223 return HazardType;
224
225 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
226 return HazardType;
227
228 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
229 return HazardType;
230
231 if (((ST.hasReadM0MovRelInterpHazard() &&
232 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
233 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
234 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
235 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
236 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
237 (ST.hasReadM0LdsDirectHazard() &&
238 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
239 checkReadM0Hazards(MI) > 0)
240 return HazardType;
241
242 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
243 return HazardType;
244
246 checkMAILdStHazards(MI) > 0)
247 return HazardType;
248
249 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
250 return HazardType;
251
252 return NoHazard;
253}
254
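// A single S_NOP encodes at most 8 wait states (imm 0..7), so larger requests
// are split across several S_NOPs.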
255static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
256 unsigned Quantity) {
257 while (Quantity > 0) {
258 unsigned Arg = std::min(Quantity, 8u);
259 Quantity -= Arg;
260 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
261 .addImm(Arg - 1);
262 }
263}
264
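// Number of wait states the MFMA pipeline is occupied by \p MI, taken from the
// scheduling model (ReleaseAtCycle of its first write resource).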
265unsigned
266GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
267 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
268 assert(TSchedModel.getWriteProcResBegin(SC) !=
269 TSchedModel.getWriteProcResEnd(SC));
270 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
271}
272
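// Scan the instructions inside the current BUNDLE one by one; in hazard
// recognizer mode any required fixes and noops are inserted inside the bundle
// itself.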
273void GCNHazardRecognizer::processBundle() {
274 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
275 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
276 // Check bundled MachineInstr's for hazards.
277 for (; MI != E && MI->isInsideBundle(); ++MI) {
278 CurrCycleInstr = &*MI;
279 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
280
281 if (IsHazardRecognizerMode) {
282 fixHazards(CurrCycleInstr);
283
284 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
285 }
286
287 // It's unnecessary to track more than MaxLookAhead instructions. Since we
288 // include the bundled MI directly after, only add a maximum of
289 // (MaxLookAhead - 1) noops to EmittedInstrs.
290 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
291 EmittedInstrs.push_front(nullptr);
292
293 EmittedInstrs.push_front(CurrCycleInstr);
294 EmittedInstrs.resize(MaxLookAhead);
295 }
296 CurrCycleInstr = nullptr;
297}
298
299void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
300 assert(IsHazardRecognizerMode);
301
302 unsigned NumPreNoops = PreEmitNoops(MI);
303 EmitNoops(NumPreNoops);
304 if (MI->isInsideBundle())
305 insertNoopsInBundle(MI, TII, NumPreNoops);
306 else
307 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
308 NumPreNoops);
310 AdvanceCycle();
311}
312
313unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
314 IsHazardRecognizerMode = true;
315 CurrCycleInstr = MI;
316 unsigned W = PreEmitNoopsCommon(MI);
317 fixHazards(MI);
318 CurrCycleInstr = nullptr;
319 return std::max(W, NopPadding.getValue());
320}
321
322unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
323 if (MI->isBundle())
324 return 0;
325
326 int WaitStates = 0;
327
328 if (SIInstrInfo::isSMRD(*MI))
329 return std::max(WaitStates, checkSMRDHazards(MI));
330
331 if (ST.hasNSAtoVMEMBug())
332 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
333
334 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
335
336 if (ST.hasNoDataDepHazard())
337 return WaitStates;
338
339 if (SIInstrInfo::isVMEM(*MI))
340 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
341
342 if (SIInstrInfo::isVALU(*MI))
343 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
344
345 if (SIInstrInfo::isDPP(*MI))
346 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
347
348 if (isDivFMas(MI->getOpcode()))
349 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
350
351 if (isRWLane(MI->getOpcode()))
352 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
353
356 checkMAIVALUHazards(MI) > 0)
357 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
358
359 if (MI->isInlineAsm())
360 return std::max(WaitStates, checkInlineAsmHazards(MI));
361
362 if (isSGetReg(MI->getOpcode()))
363 return std::max(WaitStates, checkGetRegHazards(MI));
364
365 if (isSSetReg(MI->getOpcode()))
366 return std::max(WaitStates, checkSetRegHazards(MI));
367
368 if (isRFE(MI->getOpcode()))
369 return std::max(WaitStates, checkRFEHazards(MI));
370
371 if ((ST.hasReadM0MovRelInterpHazard() &&
372 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
373 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
374 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
375 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
376 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
377 (ST.hasReadM0LdsDirectHazard() &&
378 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
379 return std::max(WaitStates, checkReadM0Hazards(MI));
380
381 if (SIInstrInfo::isMAI(*MI))
382 return std::max(WaitStates, checkMAIHazards(MI));
383
385 return std::max(WaitStates, checkMAILdStHazards(MI));
386
387 if (ST.hasGFX950Insts() && isPermlane(*MI))
388 return std::max(WaitStates, checkPermlaneHazards(MI));
389
390 return WaitStates;
391}
392
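// A noop emitted by the scheduler occupies one slot of the lookahead window.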
393void GCNHazardRecognizer::EmitNoop() {
394 EmittedInstrs.push_front(nullptr);
395}
396
397void GCNHazardRecognizer::AdvanceCycle() {
398 // When the scheduler detects a stall, it will call AdvanceCycle() without
399 // emitting any instructions.
400 if (!CurrCycleInstr) {
401 EmittedInstrs.push_front(nullptr);
402 return;
403 }
404
405 if (CurrCycleInstr->isBundle()) {
406 processBundle();
407 return;
408 }
409
410 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
411 if (!NumWaitStates) {
412 CurrCycleInstr = nullptr;
413 return;
414 }
415
416 // Keep track of emitted instructions
417 EmittedInstrs.push_front(CurrCycleInstr);
418
419 // Add a nullptr for each additional wait state after the first. Make sure
420 // not to add more than getMaxLookAhead() items to the list, since we
421 // truncate the list to that size right after this loop.
422 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
423 i < e; ++i) {
424 EmittedInstrs.push_front(nullptr);
425 }
426
427 // getMaxLookAhead() is the largest number of wait states we will ever need
428 // to insert, so there is no point in keeping track of more than that many
429 // wait states.
430 EmittedInstrs.resize(getMaxLookAhead());
431
432 CurrCycleInstr = nullptr;
433}
434
435void GCNHazardRecognizer::RecedeCycle() {
436 assert(!IsHazardRecognizerMode &&
437 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
438}
439
440//===----------------------------------------------------------------------===//
441// Helper Functions
442//===----------------------------------------------------------------------===//
443
444using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
445
446// Search for a hazard in a block and its predecessors.
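// This is a backwards breadth-first walk over the CFG: each (block, incoming
// state) pair is visited at most once, with states deduplicated through a
// DenseMap, so loops and diamonds do not cause exponential re-scanning.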
447template <typename StateT>
448static bool
449hasHazard(StateT InitialState,
450 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
451 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
452 const MachineBasicBlock *InitialMBB,
454 struct StateMapKey {
456 unsigned Idx;
457 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
458 return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
459 }
460 };
461 struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
462 static inline StateMapKey getEmptyKey() {
463 return {static_cast<SmallVectorImpl<StateT> *>(
466 }
467 static inline StateMapKey getTombstoneKey() {
468 return {static_cast<SmallVectorImpl<StateT> *>(
471 }
472 static unsigned getHashValue(const StateMapKey &Key) {
473 return StateT::getHashValue((*Key.States)[Key.Idx]);
474 }
475 static unsigned getHashValue(const StateT &State) {
476 return StateT::getHashValue(State);
477 }
478 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
479 const auto EKey = getEmptyKey();
480 const auto TKey = getTombstoneKey();
481 if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) ||
482 StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey))
483 return StateMapKey::isEqual(LHS, RHS);
484 return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
485 }
486 static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
487 if (StateMapKey::isEqual(RHS, getEmptyKey()) ||
488 StateMapKey::isEqual(RHS, getTombstoneKey()))
489 return false;
490 return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
491 }
492 };
493
496
498 const MachineBasicBlock *MBB = InitialMBB;
499 StateT State = InitialState;
500
502 unsigned WorkIdx = 0;
503 for (;;) {
504 bool Expired = false;
505 for (auto E = MBB->instr_rend(); I != E; ++I) {
506 // No need to look at parent BUNDLE instructions.
507 if (I->isBundle())
508 continue;
509
510 auto Result = IsHazard(State, *I);
511 if (Result == HazardFound)
512 return true;
513 if (Result == HazardExpired) {
514 Expired = true;
515 break;
516 }
517
518 if (I->isInlineAsm() || I->isMetaInstruction())
519 continue;
520
521 UpdateState(State, *I);
522 }
523
524 if (!Expired) {
525 unsigned StateIdx = States.size();
526 StateMapKey Key = {&States, StateIdx};
527 auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
528 if (Insertion.second) {
529 States.emplace_back(State);
530 } else {
531 StateIdx = Insertion.first->second;
532 }
533 for (MachineBasicBlock *Pred : MBB->predecessors())
534 Worklist.insert(std::pair(Pred, StateIdx));
535 }
536
537 if (WorkIdx == Worklist.size())
538 break;
539
540 unsigned StateIdx;
541 std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
542 State = States[StateIdx];
543 I = MBB->instr_rbegin();
544 }
545
546 return false;
547}
548
549// Returns the minimum number of wait states since \p I, walking all
550// predecessors. Scanning along each path stops once \p IsExpired returns true.
551// Can only be run in hazard recognizer mode.
552static int
554 const MachineBasicBlock *MBB,
556 int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
560 for (auto E = MBB->instr_rend(); I != E; ++I) {
561 // Don't add WaitStates for parent BUNDLE instructions.
562 if (I->isBundle())
563 continue;
564
565 if (IsHazard(*I))
566 return WaitStates;
567
568 if (I->isInlineAsm())
569 continue;
570
571 WaitStates += GetNumWaitStates(*I);
572
573 if (IsExpired(*I, WaitStates))
574 return std::numeric_limits<int>::max();
575 }
576
577 int MinWaitStates = std::numeric_limits<int>::max();
578 for (MachineBasicBlock *Pred : MBB->predecessors()) {
579 if (!Visited.insert(Pred).second)
580 continue;
581
582 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
583 IsExpired, Visited, GetNumWaitStates);
584
585 MinWaitStates = std::min(MinWaitStates, W);
586 }
587
588 return MinWaitStates;
589}
590
591static int
593 const MachineInstr *MI,
598 return getWaitStatesSince(IsHazard, MI->getParent(),
599 std::next(MI->getReverseIterator()), 0, IsExpired,
600 Visited, GetNumWaitStates);
601}
602
603int GCNHazardRecognizer::getWaitStatesSince(
604 IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) {
605 if (IsHazardRecognizerMode) {
606 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
607 return WaitStates >= Limit;
608 };
609 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
610 GetNumWaitStates);
611 }
612
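  // Outside hazard recognizer mode only the bounded EmittedInstrs window is
  // available; if the hazard is not found within Limit, report "no hazard
  // within range" by returning INT_MAX.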
613 int WaitStates = 0;
614 for (MachineInstr *MI : EmittedInstrs) {
615 if (MI) {
616 if (IsHazard(*MI))
617 return WaitStates;
618
619 if (MI->isInlineAsm())
620 continue;
621 }
622 WaitStates += MI ? GetNumWaitStates(*MI) : 1;
623
624 if (WaitStates >= Limit)
625 break;
626 }
627 return std::numeric_limits<int>::max();
628}
629
630int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
631 return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates);
632}
633
634int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
635 IsHazardFn IsHazardDef,
636 int Limit) {
637 const SIRegisterInfo *TRI = ST.getRegisterInfo();
638
639 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
640 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
641 };
642
643 return getWaitStatesSince(IsHazardFn, Limit);
644}
645
646int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
647 int Limit) {
648 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
649 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
650 };
651
652 return getWaitStatesSince(IsHazardFn, Limit);
653}
654
655//===----------------------------------------------------------------------===//
656// No-op Hazard Detection
657//===----------------------------------------------------------------------===//
658
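// Clause tracking is done on register units so that overlapping registers
// (sub- and super-registers) are recognized as conflicts.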
659static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
660 MCRegister Reg) {
661 for (MCRegUnit Unit : TRI.regunits(Reg))
662 BV.set(static_cast<unsigned>(Unit));
663}
664
665static void addRegsToSet(const SIRegisterInfo &TRI,
667 BitVector &DefSet, BitVector &UseSet) {
668 for (const MachineOperand &Op : Ops) {
669 if (Op.isReg())
670 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
671 }
672}
673
674void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
675 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
676}
677
678static bool breaksSMEMSoftClause(MachineInstr *MI) {
679 return !SIInstrInfo::isSMRD(*MI);
680}
681
682static bool breaksVMEMSoftClause(MachineInstr *MI) {
683 return !SIInstrInfo::isVMEM(*MI);
684}
685
686int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
687 // SMEM soft clauses are only present on VI+, and only matter if xnack is
688 // enabled.
689 if (!ST.isXNACKEnabled())
690 return 0;
691
692 bool IsSMRD = TII.isSMRD(*MEM);
693
694 resetClause();
695
696 // A soft-clause is any group of consecutive SMEM instructions. The
697 // instructions in this group may return out of order and/or may be
698 // replayed (i.e. the same instruction issued more than once).
699 //
700 // In order to handle these situations correctly we need to make sure that
701 // when a clause has more than one instruction, no instruction in the clause
702 // writes to a register that is read by another instruction in the clause
703 // (including itself). If we encounter this situation, we need to break the
704 // clause by inserting a non-SMEM instruction.
705
706 for (MachineInstr *MI : EmittedInstrs) {
707 // When we hit a non-SMEM instruction then we have passed the start of the
708 // clause and we can stop.
709 if (!MI)
710 break;
711
712 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
713 break;
714
715 addClauseInst(*MI);
716 }
717
718 if (ClauseDefs.none())
719 return 0;
720
721 // We need to make sure not to put loads and stores in the same clause if they
722 // use the same address. For now, just start a new clause whenever we see a
723 // store.
724 if (MEM->mayStore())
725 return 1;
726
727 addClauseInst(*MEM);
728
729 // If the set of defs and uses intersect then we cannot add this instruction
730 // to the clause, so we have a hazard.
731 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
732}
733
734int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
735 int WaitStatesNeeded = 0;
736
737 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
738
739 // This SMRD hazard only affects SI.
740 if (!ST.hasSMRDReadVALUDefHazard())
741 return WaitStatesNeeded;
742
743 // A read of an SGPR by SMRD instruction requires 4 wait states when the
744 // SGPR was written by a VALU instruction.
745 int SmrdSgprWaitStates = 4;
746 auto IsHazardDefFn = [this](const MachineInstr &MI) {
747 return TII.isVALU(MI);
748 };
749 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
750 return TII.isSALU(MI);
751 };
752
753 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
754
755 for (const MachineOperand &Use : SMRD->uses()) {
756 if (!Use.isReg())
757 continue;
758 int WaitStatesNeededForUse =
759 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
760 SmrdSgprWaitStates);
761 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
762
763 // This fixes what appears to be undocumented hardware behavior in SI where
764 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
765 // needs some number of nops in between. We don't know how many we need, but
766 // let's use 4. This wasn't discovered before probably because the only
767 // case when this happens is when we expand a 64-bit pointer into a full
768 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
769 // probably never encountered in the closed-source land.
770 if (IsBufferSMRD) {
771 int WaitStatesNeededForUse =
772 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
773 IsBufferHazardDefFn,
774 SmrdSgprWaitStates);
775 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
776 }
777 }
778
779 return WaitStatesNeeded;
780}
781
782int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
783 if (!ST.hasVMEMReadSGPRVALUDefHazard())
784 return 0;
785
786 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
787
788 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
789 // SGPR was written by a VALU Instruction.
790 const int VmemSgprWaitStates = 5;
791 auto IsHazardDefFn = [this](const MachineInstr &MI) {
792 return TII.isVALU(MI);
793 };
794 for (const MachineOperand &Use : VMEM->uses()) {
795 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
796 continue;
797
798 int WaitStatesNeededForUse =
799 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
800 VmemSgprWaitStates);
801 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
802 }
803 return WaitStatesNeeded;
804}
805
806int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
807 const SIRegisterInfo *TRI = ST.getRegisterInfo();
808 const SIInstrInfo *TII = ST.getInstrInfo();
809
810 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
811 int DppVgprWaitStates = 2;
812 int DppExecWaitStates = 5;
813 int WaitStatesNeeded = 0;
814 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
815 return TII->isVALU(MI);
816 };
817
818 for (const MachineOperand &Use : DPP->uses()) {
819 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
820 continue;
821 int WaitStatesNeededForUse =
822 DppVgprWaitStates - getWaitStatesSinceDef(
823 Use.getReg(),
824 [](const MachineInstr &) { return true; },
825 DppVgprWaitStates);
826 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
827 }
828
829 WaitStatesNeeded = std::max(
830 WaitStatesNeeded,
831 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
832 DppExecWaitStates));
833
834 return WaitStatesNeeded;
835}
836
837int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
838 const SIInstrInfo *TII = ST.getInstrInfo();
839
840 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
841 // instruction.
842 const int DivFMasWaitStates = 4;
843 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
844 return TII->isVALU(MI);
845 };
846 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
847 DivFMasWaitStates);
848
849 return DivFMasWaitStates - WaitStatesNeeded;
850}
851
852int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
853 const SIInstrInfo *TII = ST.getInstrInfo();
854 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
855
856 const int GetRegWaitStates = 2;
857 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
858 return GetRegHWReg == getHWReg(TII, MI);
859 };
860 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
861
862 return GetRegWaitStates - WaitStatesNeeded;
863}
864
865int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
866 const SIInstrInfo *TII = ST.getInstrInfo();
867 unsigned HWReg = getHWReg(TII, *SetRegInstr);
868
869 const int SetRegWaitStates = ST.getSetRegWaitStates();
870 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
871 return HWReg == getHWReg(TII, MI);
872 };
873 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
874 return SetRegWaitStates - WaitStatesNeeded;
875}
876
877int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
878 if (!MI.mayStore())
879 return -1;
880
881 const SIInstrInfo *TII = ST.getInstrInfo();
882 unsigned Opcode = MI.getOpcode();
883 const MCInstrDesc &Desc = MI.getDesc();
884
885 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
886 int VDataRCID = -1;
887 if (VDataIdx != -1)
888 VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);
889
890 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
891 // There is no hazard if the instruction does not use vector regs
892 // (like wbinvl1)
893 if (VDataIdx == -1)
894 return -1;
895 // For MUBUF/MTBUF instructions this hazard only exists if the
896 // instruction is not using a register in the soffset field.
897 const MachineOperand *SOffset =
898 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
899 // If we have no soffset operand, then assume this field has been
900 // hardcoded to zero.
901 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
902 (!SOffset || !SOffset->isReg()))
903 return VDataIdx;
904 }
905
906 // MIMG instructions create a hazard if they don't use a 256-bit T# and
907 // the store size is greater than 8 bytes and they have more than two bits
908 // of their dmask set.
909 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
910 if (TII->isMIMG(MI)) {
911 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
912 assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
913 Desc.operands()[SRsrcIdx])) == 256);
914 (void)SRsrcIdx;
915 }
916
917 if (TII->isFLAT(MI)) {
918 // There is no hazard if the instruction does not use vector regs
919 if (VDataIdx == -1)
920 return -1;
921
922 if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
923 return VDataIdx;
924 }
925
926 return -1;
927}
928
929int
930GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
931 const MachineRegisterInfo &MRI) {
932 // Helper to check for the hazard where VMEM instructions that store more than
933 // 8 bytes can have their store data overwritten by the next instruction.
934 const SIRegisterInfo *TRI = ST.getRegisterInfo();
935
936 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
937 int WaitStatesNeeded = 0;
938
939 if (!TRI->isVectorRegister(MRI, Def.getReg()))
940 return WaitStatesNeeded;
941 Register Reg = Def.getReg();
942 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
943 int DataIdx = createsVALUHazard(MI);
944 return DataIdx >= 0 &&
945 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
946 };
947
948 int WaitStatesNeededForDef =
949 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
950 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
951
952 return WaitStatesNeeded;
953}
954
955/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
956/// pack the computed value into correct bit position of the dest register. This
957/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
958 // dst_sel that is not aligned to the register. This function analyzes the \p
959/// MI and \returns an operand with dst forwarding issue, or nullptr if
960/// none exists.
961static const MachineOperand *
964 return nullptr;
965
966 const SIInstrInfo *TII = ST.getInstrInfo();
967
968 unsigned Opcode = MI.getOpcode();
969
970 // There are three different types of instructions
971 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
972 // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
973 // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
974 // op_sel[3:2] != 0.
976 if (SIInstrInfo::isSDWA(MI)) {
977 // Type 1: SDWA with dst_sel != DWORD
978 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
979 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
980 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
981 }
982
983 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
984 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
985 // Type 2: VOP3 which write the hi bits
986 if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
988 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
989
990 // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
991 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
992 (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
994 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
995 }
996
997 // Special case: nop is required for all the opsel values for fp4 sr variant
998 // cvt scale instructions
999 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
1000 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1001
1002 return nullptr;
1003}
1004
1005/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
1006 /// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
1007/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
1009 const MachineOperand *Dst,
1010 const SIRegisterInfo *TRI) {
1011 // We must consider implicit reads of the VALU. SDWA with dst_sel and
1012 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
1013 // and we must account for that hazard.
1014 // We also must account for WAW hazards. In particular, WAW with dest
1015 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
1016 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
1017 // check for ECC. Without accounting for this hazard, the ECC will be
1018 // wrong.
1019 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
1020 // complete zeroesHigh16BitsOfDest)
1021 for (auto &Operand : VALU->operands()) {
1022 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1023 return true;
1024 }
1025 }
1026 return false;
1027}
1028
1029int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
1030 int WaitStatesNeeded = 0;
1031
1032 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
1033 const int TransDefWaitstates = 1;
1034
1035 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
1037 return false;
1038 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1039 const SIInstrInfo *TII = ST.getInstrInfo();
1040 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
1041
1042 for (const MachineOperand &Use : VALU->explicit_uses()) {
1043 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
1044 return true;
1045 }
1046
1047 return false;
1048 };
1049
1050 int WaitStatesNeededForDef =
1051 TransDefWaitstates -
1052 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1053 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1054 }
1055
1056 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1057 const int Shift16DefWaitstates = 1;
1058
1059 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1060 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1061 const MachineOperand *ForwardedDst =
1062 getDstSelForwardingOperand(ProducerMI, ST);
1063 if (ForwardedDst) {
1064 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
1065 }
1066
1067 if (ProducerMI.isInlineAsm()) {
1068 // Assume inline asm has dst forwarding hazard
1069 for (auto &Def : ProducerMI.all_defs()) {
1070 if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1071 return true;
1072 }
1073 }
1074
1075 return false;
1076 };
1077
1078 int WaitStatesNeededForDef =
1079 Shift16DefWaitstates -
1080 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1081 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1082 }
1083
1084 if (ST.hasVDecCoExecHazard()) {
1085 const int VALUWriteSGPRVALUReadWaitstates = 2;
1086 const int VALUWriteEXECRWLane = 4;
1087 const int VALUWriteVGPRReadlaneRead = 1;
1088
1089 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1090 const MachineRegisterInfo &MRI = MF.getRegInfo();
1091 Register UseReg;
1092 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1093 if (!SIInstrInfo::isVALU(MI))
1094 return false;
1095 return MI.modifiesRegister(UseReg, TRI);
1096 };
1097
1098 for (const MachineOperand &Use : VALU->explicit_uses()) {
1099 if (!Use.isReg())
1100 continue;
1101
1102 UseReg = Use.getReg();
1103 if (TRI->isSGPRReg(MRI, UseReg)) {
1104 int WaitStatesNeededForDef =
1105 VALUWriteSGPRVALUReadWaitstates -
1106 getWaitStatesSince(IsVALUDefSGPRFn,
1107 VALUWriteSGPRVALUReadWaitstates);
1108 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1109 }
1110 }
1111
1112 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1113 UseReg = AMDGPU::VCC;
1114 int WaitStatesNeededForDef =
1115 VALUWriteSGPRVALUReadWaitstates -
1116 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1117 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1118 }
1119
1120 switch (VALU->getOpcode()) {
1121 case AMDGPU::V_READLANE_B32:
1122 case AMDGPU::V_READFIRSTLANE_B32: {
1123 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1124 UseReg = Src->getReg();
1125 int WaitStatesNeededForDef =
1126 VALUWriteVGPRReadlaneRead -
1127 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1128 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1129 }
1130 [[fallthrough]];
1131 case AMDGPU::V_WRITELANE_B32: {
1132 UseReg = AMDGPU::EXEC;
1133 int WaitStatesNeededForDef =
1134 VALUWriteEXECRWLane -
1135 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1136 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1137 break;
1138 }
1139 default:
1140 break;
1141 }
1142 }
1143
1144 // This checks for the hazard where VMEM instructions that store more than
1145 // 8 bytes can have their store data overwritten by the next instruction.
1146 if (!ST.has12DWordStoreHazard())
1147 return WaitStatesNeeded;
1148
1149 const MachineRegisterInfo &MRI = MF.getRegInfo();
1150
1151 for (const MachineOperand &Def : VALU->defs()) {
1152 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1153 }
1154
1155 return WaitStatesNeeded;
1156}
1157
1158int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1159 // This checks for hazards associated with inline asm statements.
1160 // Since inline asms can contain just about anything, we use this
1161 // to call/leverage other check*Hazard routines. Note that
1162 // this function doesn't attempt to address all possible inline asm
1163 // hazards (good luck), but is a collection of what has been
1164 // problematic thus far.
1165
1166 // see checkVALUHazards()
1167 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1168 !ST.hasCvtScaleForwardingHazard())
1169 return 0;
1170
1171 const MachineRegisterInfo &MRI = MF.getRegInfo();
1172 int WaitStatesNeeded = 0;
1173
1174 for (const MachineOperand &Op :
1176 if (Op.isReg() && Op.isDef()) {
1177 if (!TRI.isVectorRegister(MRI, Op.getReg()))
1178 continue;
1179
1180 if (ST.has12DWordStoreHazard()) {
1181 WaitStatesNeeded =
1182 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1183 }
1184 }
1185 }
1186
1187 if (ST.hasDstSelForwardingHazard()) {
1188 const int Shift16DefWaitstates = 1;
1189
1190 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1191 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1192 // Assume inline asm reads the dst
1193 if (Dst)
1194 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1195 IA->readsRegister(Dst->getReg(), &TRI);
1196
1197 if (ProducerMI.isInlineAsm()) {
1198 // If MI is inline asm, assume it has dst forwarding hazard
1199 for (auto &Def : ProducerMI.all_defs()) {
1200 if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1201 IA->readsRegister(Def.getReg(), &TRI)) {
1202 return true;
1203 }
1204 }
1205 }
1206
1207 return false;
1208 };
1209
1210 int WaitStatesNeededForDef =
1211 Shift16DefWaitstates -
1212 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1213 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1214 }
1215
1216 return WaitStatesNeeded;
1217}
1218
1219int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1220 const SIInstrInfo *TII = ST.getInstrInfo();
1221 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1222 const MachineRegisterInfo &MRI = MF.getRegInfo();
1223
1224 const MachineOperand *LaneSelectOp =
1225 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1226
1227 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1228 return 0;
1229
1230 Register LaneSelectReg = LaneSelectOp->getReg();
1231 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1232
1233 const int RWLaneWaitStates = 4;
1234 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1235 RWLaneWaitStates);
1236 return RWLaneWaitStates - WaitStatesSince;
1237}
1238
1239int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1240 if (!ST.hasRFEHazards())
1241 return 0;
1242
1243 const SIInstrInfo *TII = ST.getInstrInfo();
1244
1245 const int RFEWaitStates = 1;
1246
1247 auto IsHazardFn = [TII](const MachineInstr &MI) {
1248 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1249 };
1250 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1251 return RFEWaitStates - WaitStatesNeeded;
1252}
1253
1254int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1255 const SIInstrInfo *TII = ST.getInstrInfo();
1256 const int ReadM0WaitStates = 1;
1257 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1258 return ReadM0WaitStates -
1259 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1260}
1261
1262// Emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we need
1263// to insert; a value of zero or less means none are needed.
1264bool GCNHazardRecognizer::emitVNops(MachineInstr *MI, int WaitStatesNeeded) {
1265 if (WaitStatesNeeded <= 0)
1266 return false;
1267
1268 const SIInstrInfo *TII = ST.getInstrInfo();
1269 for (int I = 0; I < WaitStatesNeeded; ++I)
1270 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1271 TII->get(AMDGPU::V_NOP_e32));
1272
1273 return true;
1274}
1275
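// Run all hazard fixups for \p MI. Each fix* routine detects its hazard and,
// when needed, rewrites the code around \p MI (inserting waits or nops, or
// patching operands).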
1276void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1277 fixVMEMtoScalarWriteHazards(MI);
1278 fixVcmpxPermlaneHazards(MI);
1279 fixSMEMtoVectorWriteHazards(MI);
1280 fixVcmpxExecWARHazard(MI);
1281 fixLdsBranchVmemWARHazard(MI);
1282 if (ST.hasLdsDirect()) {
1283 fixLdsDirectVALUHazard(MI);
1284 fixLdsDirectVMEMHazard(MI);
1285 }
1286 fixVALUPartialForwardingHazard(MI);
1287 fixVALUTransUseHazard(MI);
1288 fixVALUTransCoexecutionHazards(MI);
1289 fixWMMAHazards(MI); // fall-through if co-execution is enabled.
1290 emitVNops(MI, checkWMMACoexecutionHazards(MI));
1291 fixShift64HighRegBug(MI);
1292 fixVALUMaskWriteHazard(MI);
1293 fixRequiredExportPriority(MI);
1294 if (ST.requiresWaitIdleBeforeGetReg())
1295 fixGetRegWaitIdle(MI);
1296 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1297 fixDsAtomicAsyncBarrierArriveB64(MI);
1298 if (ST.hasScratchBaseForwardingHazard())
1299 fixScratchBaseForwardingHazard(MI);
1300 if (ST.setRegModeNeedsVNOPs())
1301 fixSetRegMode(MI);
1302}
1303
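// Matches V_CMPX-style compares, i.e. compares that write EXEC directly.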
1304static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1305 const MachineInstr &MI) {
1306 return (TII.isVOPC(MI) ||
1307 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1308 MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1309}
1310
1311bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1312 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1313 return false;
1314
1315 const SIInstrInfo *TII = ST.getInstrInfo();
1316 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1317 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1318 return isVCmpXWritesExec(*TII, *TRI, MI);
1319 };
1320
1321 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1322 unsigned Opc = MI.getOpcode();
1323 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1324 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1325 };
1326
1327 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1328 std::numeric_limits<int>::max())
1329 return false;
1330
1331 // V_NOP will be discarded by SQ.
1332 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1333 // which is always a VGPR and available.
1334 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1335 Register Reg = Src0->getReg();
1336 bool IsUndef = Src0->isUndef();
1337 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1338 TII->get(AMDGPU::V_MOV_B32_e32))
1339 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1340 .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1341
1342 return true;
1343}
1344
1345bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1346 if (!ST.hasVMEMtoScalarWriteHazard())
1347 return false;
1348 assert(!ST.hasExtendedWaitCounts());
1349
1351 return false;
1352
1353 if (MI->getNumDefs() == 0)
1354 return false;
1355
1356 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1357
1358 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1360 return false;
1361
1362 for (const MachineOperand &Def : MI->defs()) {
1363 const MachineOperand *Op =
1364 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1365 if (!Op)
1366 continue;
1367 return true;
1368 }
1369 return false;
1370 };
1371
1372 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1373 return SIInstrInfo::isVALU(MI) ||
1374 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1375 !MI.getOperand(0).getImm()) ||
1376 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1377 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1378 };
1379
1380 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1381 std::numeric_limits<int>::max())
1382 return false;
1383
1384 const SIInstrInfo *TII = ST.getInstrInfo();
1385 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1386 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1387 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1388 return true;
1389}
1390
1391bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1392 if (!ST.hasSMEMtoVectorWriteHazard())
1393 return false;
1394 assert(!ST.hasExtendedWaitCounts());
1395
1396 if (!SIInstrInfo::isVALU(*MI))
1397 return false;
1398
1399 AMDGPU::OpName SDSTName;
1400 switch (MI->getOpcode()) {
1401 case AMDGPU::V_READLANE_B32:
1402 case AMDGPU::V_READFIRSTLANE_B32:
1403 SDSTName = AMDGPU::OpName::vdst;
1404 break;
1405 default:
1406 SDSTName = AMDGPU::OpName::sdst;
1407 break;
1408 }
1409
1410 const SIInstrInfo *TII = ST.getInstrInfo();
1411 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1412 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1413 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1414 if (!SDST) {
1415 for (const auto &MO : MI->implicit_operands()) {
1416 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1417 SDST = &MO;
1418 break;
1419 }
1420 }
1421 }
1422
1423 if (!SDST)
1424 return false;
1425
1426 const Register SDSTReg = SDST->getReg();
1427 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1428 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1429 };
1430
1431 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1432 if (TII->isSALU(MI)) {
1433 switch (MI.getOpcode()) {
1434 case AMDGPU::S_SETVSKIP:
1435 case AMDGPU::S_VERSION:
1436 case AMDGPU::S_WAITCNT_VSCNT:
1437 case AMDGPU::S_WAITCNT_VMCNT:
1438 case AMDGPU::S_WAITCNT_EXPCNT:
1439 // These instructions cannot mitigate the hazard.
1440 return false;
1441 case AMDGPU::S_WAITCNT_LGKMCNT:
1442 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1443 return (MI.getOperand(1).getImm() == 0) &&
1444 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1445 case AMDGPU::S_WAITCNT: {
1446 const int64_t Imm = MI.getOperand(0).getImm();
1447 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1448 // DsCnt corresponds to LGKMCnt here.
1449 return (Decoded.DsCnt == 0);
1450 }
1451 default:
1452 assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
1453 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1454 "unexpected wait count instruction");
1455 // SOPP instructions cannot mitigate the hazard.
1456 if (TII->isSOPP(MI))
1457 return false;
1458 // At this point the SALU can be assumed to mitigate the hazard
1459 // because either:
1460 // (a) it is independent of the at risk SMEM (breaking chain),
1461 // or
1462 // (b) it is dependent on the SMEM, in which case an appropriate
1463 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1464 // SMEM instruction.
1465 return true;
1466 }
1467 }
1468 return false;
1469 };
1470
1471 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1472 std::numeric_limits<int>::max())
1473 return false;
1474
1475 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1476 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1477 .addImm(0);
1478 return true;
1479}
1480
1481bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1482 if (!ST.hasVcmpxExecWARHazard())
1483 return false;
1484 assert(!ST.hasExtendedWaitCounts());
1485
1486 if (!SIInstrInfo::isVALU(*MI))
1487 return false;
1488
1489 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1490 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1491 return false;
1492
1493 auto IsHazardFn = [TRI](const MachineInstr &I) {
1495 return false;
1496 return I.readsRegister(AMDGPU::EXEC, TRI);
1497 };
1498
1499 const SIInstrInfo *TII = ST.getInstrInfo();
1500 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1501 if (SIInstrInfo::isVALU(MI)) {
1502 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1503 return true;
1504 for (auto MO : MI.implicit_operands())
1505 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1506 return true;
1507 }
1508 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1509 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1510 return true;
1511 return false;
1512 };
1513
1514 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1515 std::numeric_limits<int>::max())
1516 return false;
1517
1518 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1519 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1520 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1521 return true;
1522}
1523
1524static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1525 const GCNSubtarget &ST) {
1526 if (!ST.hasLdsBranchVmemWARHazard())
1527 return false;
1528
1529 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1530 // instructions need to appear in the same function.
1531 bool HasLds = false;
1532 bool HasVmem = false;
1533 for (auto &MBB : MF) {
1534 for (auto &MI : MBB) {
1535 HasLds |= SIInstrInfo::isDS(MI);
1536 HasVmem |= SIInstrInfo::isVMEM(MI);
1537 if (HasLds && HasVmem)
1538 return true;
1539 }
1540 }
1541 return false;
1542}
1543
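// Matches "s_waitcnt_vscnt null, 0", i.e. a wait that drains all outstanding
// VMEM stores.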
1544static bool isStoreCountWaitZero(const MachineInstr &I) {
1545 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1546 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1547 !I.getOperand(1).getImm();
1548}
1549
1550bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1551 if (!RunLdsBranchVmemWARHazardFixup)
1552 return false;
1553
1554 assert(ST.hasLdsBranchVmemWARHazard());
1555 assert(!ST.hasExtendedWaitCounts());
1556
1557 auto IsHazardInst = [](const MachineInstr &MI) {
1558 if (SIInstrInfo::isDS(MI))
1559 return 1;
1560 if (SIInstrInfo::isVMEM(MI))
1561 return 2;
1562 return 0;
1563 };
1564
1565 auto InstType = IsHazardInst(*MI);
1566 if (!InstType)
1567 return false;
1568
1569 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1570 return IsHazardInst(I) || isStoreCountWaitZero(I);
1571 };
1572
1573 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1574 if (!I.isBranch())
1575 return false;
1576
1577 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1578 auto InstType2 = IsHazardInst(I);
1579 return InstType2 && InstType != InstType2;
1580 };
1581
1582 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1583 auto InstType2 = IsHazardInst(I);
1584 if (InstType == InstType2)
1585 return true;
1586
1587 return isStoreCountWaitZero(I);
1588 };
1589
1590 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1591 std::numeric_limits<int>::max();
1592 };
1593
1594 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1595 std::numeric_limits<int>::max())
1596 return false;
1597
1598 const SIInstrInfo *TII = ST.getInstrInfo();
1599 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1600 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1601 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1602 .addImm(0);
1603
1604 return true;
1605}
1606
1607bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1608 if (!SIInstrInfo::isLDSDIR(*MI))
1609 return false;
1610
1611 const int NoHazardWaitStates = 15;
1612 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1613 const Register VDSTReg = VDST->getReg();
1614
1615 bool VisitedTrans = false;
1616 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1617 if (!SIInstrInfo::isVALU(I))
1618 return false;
1619 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1620 // Cover both WAR and WAW
1621 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1622 };
1623 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1624 if (WaitStates >= NoHazardWaitStates)
1625 return true;
1626 // Instructions which cause va_vdst==0 expire hazard
1629 };
1630 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1631 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1632 };
1633
1634 DenseSet<const MachineBasicBlock *> Visited;
1635 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1636 std::next(MI->getReverseIterator()), 0,
1637 IsExpiredFn, Visited, GetWaitStatesFn);
1638
1639 // Transcendentals can execute in parallel to other VALUs.
1640 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1641 if (VisitedTrans)
1642 Count = 0;
1643
1644 MachineOperand *WaitVdstOp =
1645 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1646 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1647
1648 return true;
1649}
1650
1651bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1652 if (!SIInstrInfo::isLDSDIR(*MI))
1653 return false;
1654
1655 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1656 const Register VDSTReg = VDST->getReg();
1657
1658 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1660 return false;
1661 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1662 };
1663 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1664 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1665 // according to the type of VMEM instruction.
1666 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1668 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1669 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1670 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1671 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1672 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1673 };
1674
1675 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1676 std::numeric_limits<int>::max())
1677 return false;
1678
1679 if (LdsdirCanWait) {
1680 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1681 } else {
1682 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1683 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1684 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1685 }
1686
1687 return true;
1688}
1689
1690bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1691 if (!ST.hasVALUPartialForwardingHazard())
1692 return false;
1693 assert(!ST.hasExtendedWaitCounts());
1694
1695 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1696 return false;
1697
1698 SmallSetVector<Register, 4> SrcVGPRs;
1699
1700 for (const MachineOperand &Use : MI->explicit_uses()) {
1701 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1702 SrcVGPRs.insert(Use.getReg());
1703 }
1704
1705 // Only applies with >= 2 unique VGPR sources
1706 if (SrcVGPRs.size() <= 1)
1707 return false;
1708
1709 // Look for the following pattern:
1710 // Va <- VALU [PreExecPos]
1711 // intv1
1712 // Exec <- SALU [ExecPos]
1713 // intv2
1714 // Vb <- VALU [PostExecPos]
1715 // intv3
1716 // MI Va, Vb (WaitState = 0)
1717 //
1718 // Where:
1719 // intv1 + intv2 <= 2 VALUs
1720 // intv3 <= 4 VALUs
1721 //
1722 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1723
1724 const int Intv1plus2MaxVALUs = 2;
1725 const int Intv3MaxVALUs = 4;
1726 const int IntvMaxVALUs = 6;
1727 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1728
1729 struct StateType {
1730 SmallDenseMap<Register, int, 4> DefPos;
1731 int ExecPos = std::numeric_limits<int>::max();
1732 int VALUs = 0;
1733
1734 static unsigned getHashValue(const StateType &State) {
1735 return hash_combine(State.ExecPos, State.VALUs,
1736 hash_combine_range(State.DefPos));
1737 }
1738 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1739 return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
1740 LHS.VALUs == RHS.VALUs;
1741 }
1742 };
1743
1744 StateType State;
1745
1746 // This overloads expiry testing with all the hazard detection
1747 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1748 // Too many VALU states have passed
1749 if (State.VALUs > NoHazardVALUWaitStates)
1750 return HazardExpired;
1751
1752 // Instructions which cause va_vdst==0 expire hazard
1755 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1756 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1757 return HazardExpired;
1758
1759 // Track registers writes
1760 bool Changed = false;
1761 if (SIInstrInfo::isVALU(I)) {
1762 for (Register Src : SrcVGPRs) {
1763 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1764 State.DefPos[Src] = State.VALUs;
1765 Changed = true;
1766 }
1767 }
1768 } else if (SIInstrInfo::isSALU(I)) {
1769 if (State.ExecPos == std::numeric_limits<int>::max()) {
1770 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1771 State.ExecPos = State.VALUs;
1772 Changed = true;
1773 }
1774 }
1775 }
1776
1777 // Early expiration: too many VALUs in intv3
1778 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1779 return HazardExpired;
1780
1781 // Only evaluate state if something changed
1782 if (!Changed)
1783 return NoHazardFound;
1784
1785 // Determine positions of VALUs pre/post exec change
1786 if (State.ExecPos == std::numeric_limits<int>::max())
1787 return NoHazardFound;
1788
1789 int PreExecPos = std::numeric_limits<int>::max();
1790 int PostExecPos = std::numeric_limits<int>::max();
1791
1792 for (auto Entry : State.DefPos) {
1793 int DefVALUs = Entry.second;
1794 if (DefVALUs != std::numeric_limits<int>::max()) {
1795 if (DefVALUs >= State.ExecPos)
1796 PreExecPos = std::min(PreExecPos, DefVALUs);
1797 else
1798 PostExecPos = std::min(PostExecPos, DefVALUs);
1799 }
1800 }
1801
1802 // Need a VALU def after the exec change
1803 if (PostExecPos == std::numeric_limits<int>::max())
1804 return NoHazardFound;
1805
1806 // Too many VALUs in intv3?
1807 int Intv3VALUs = PostExecPos;
1808 if (Intv3VALUs > Intv3MaxVALUs)
1809 return HazardExpired;
1810
1811 // Too many VALUs in intv2?
1812 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1813 if (Intv2VALUs > Intv1plus2MaxVALUs)
1814 return HazardExpired;
1815
1816 // Need a VALU def before the exec change
1817 if (PreExecPos == std::numeric_limits<int>::max())
1818 return NoHazardFound;
1819
1820 // Too many VALUs in intv1?
1821 int Intv1VALUs = PreExecPos - State.ExecPos;
1822 if (Intv1VALUs > Intv1plus2MaxVALUs)
1823 return HazardExpired;
1824
1825 // Too many VALUs in intv1 + intv2
1826 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1827 return HazardExpired;
1828
1829 return HazardFound;
1830 };
1831 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1832 if (SIInstrInfo::isVALU(MI))
1833 State.VALUs += 1;
1834 };
1835
1836 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1837 std::next(MI->getReverseIterator())))
1838 return false;
1839
1840 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1841 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1842 .addImm(0x0fff);
1843
1844 return true;
1845}
1846
1847bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1848 if (!ST.hasVALUTransUseHazard())
1849 return false;
1850 assert(!ST.hasExtendedWaitCounts());
1851
1852 if (!SIInstrInfo::isVALU(*MI))
1853 return false;
1854
1855 SmallSet<Register, 4> SrcVGPRs;
1856
1857 for (const MachineOperand &Use : MI->explicit_uses()) {
1858 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1859 SrcVGPRs.insert(Use.getReg());
1860 }
1861
1862 // Look for the following pattern:
1863 // Va <- TRANS VALU
1864 // intv
1865 // MI Va (WaitState = 0)
1866 //
1867 // Where:
1868 // intv <= 5 VALUs / 1 TRANS
1869 //
1870 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
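// For example (illustrative sequence only):
//   v_exp_f32 v0, v1        ; Va written by a TRANS VALU
//   v_add_f32 v2, v0, v3    ; MI consumes Va within the interval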
1871
1872 const int IntvMaxVALUs = 5;
1873 const int IntvMaxTRANS = 1;
1874
1875 struct StateType {
1876 int VALUs = 0;
1877 int TRANS = 0;
1878
1879 static unsigned getHashValue(const StateType &State) {
1880 return hash_combine(State.VALUs, State.TRANS);
1881 }
1882 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1883 return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
1884 }
1885 };
1886
1887 StateType State;
1888
1889 // This lambda handles both hazard expiry testing and hazard detection.
1890 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1891 // Too many VALU states have passed
1892 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1893 return HazardExpired;
1894
1895 // Instructions which cause va_vdst==0 expire hazard
1896 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
1897 SIInstrInfo::isEXP(I) ||
1898 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1899 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1900 return HazardExpired;
1901
1902 // Track register writes
1903 if (SIInstrInfo::isTRANS(I)) {
1904 for (Register Src : SrcVGPRs) {
1905 if (I.modifiesRegister(Src, &TRI)) {
1906 return HazardFound;
1907 }
1908 }
1909 }
1910
1911 return NoHazardFound;
1912 };
1913 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1914 if (SIInstrInfo::isVALU(MI))
1915 State.VALUs += 1;
1916 if (SIInstrInfo::isTRANS(MI))
1917 State.TRANS += 1;
1918 };
1919
1920 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1921 std::next(MI->getReverseIterator())))
1922 return false;
1923
1924 // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1925 // hazard is avoided.
1926 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1927 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1928 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1929
1930 return true;
1931}
1932
1933bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
1934 if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled.
1935 !SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI))
1936 return false;
1937
1938 const SIInstrInfo *TII = ST.getInstrInfo();
1939 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1940
1941 auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1942 if (!SIInstrInfo::isTRANS(I))
1943 return false;
1944
1945 // RAW: Trans(I) writes, VALU(MI) reads.
1946 Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1947 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
1948 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
1949 return true;
1950 }
1951
1952 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
1953 if (!ValuDst || !ValuDst->isReg())
1954 return false;
1955
1956 // WAR: Trans(I) reads, VALU(MI) writes.
1957 Register ValuDef = ValuDst->getReg();
1958 for (const MachineOperand &TransUse : I.explicit_uses()) {
1959 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
1960 return true;
1961 }
1962
1963 return false;
1964 };
1965
1966 auto IsExpiredFn = [](const MachineInstr &I, int) {
1967 return SIInstrInfo::isVALU(I);
1968 };
1969
1970 const int HasVALU = std::numeric_limits<int>::max();
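// Note: ::getWaitStatesSince returns INT_MAX when no hazardous TRANS is found
// before the search expires (e.g. at an intervening VALU), so there is nothing
// to fix in that case.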
1971 if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
1972 return false;
1973
1974 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1975 return true;
1976}
1977
1978bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1979 if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1980 return false;
1981
1982 const SIInstrInfo *TII = ST.getInstrInfo();
1983 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1984
1985 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1986 if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1987 return false;
1988
1989 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
1990 // with the dest(matrix D) of the previous wmma.
1991 const Register CurSrc0Reg =
1992 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1993 const Register CurSrc1Reg =
1994 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1995
1996 const Register PrevDstReg =
1997 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1998
1999 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2000 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2001 return true;
2002 }
2003
2004 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
2005 // but Index can't overlap with PrevDstReg.
2006 if (AMDGPU::isGFX12Plus(ST)) {
2007 if (SIInstrInfo::isSWMMAC(*MI)) {
2008 const Register CurIndex =
2009 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2010 if (TRI->regsOverlap(PrevDstReg, CurIndex))
2011 return true;
2012 }
2013 return false;
2014 }
2015
2016 return false;
2017 };
2018
2019 auto IsExpiredFn = [](const MachineInstr &I, int) {
2020 return SIInstrInfo::isVALU(I);
2021 };
2022
2023 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2024 std::numeric_limits<int>::max())
2025 return false;
2026
2027 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2028
2029 return true;
2030}
2031
2032static bool isCoexecutableVALUInst(const MachineInstr &MI) {
2033 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isTRANS(MI) &&
2034 !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else?
2035}
2036
2037static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
2038 const SIInstrInfo *TII, unsigned Latency,
2039 unsigned Category) {
2040 assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
2041 "Handle me if the xdl wmma instruction latency changes");
2042
2043 switch (Category) {
2044 case 0: // Dense WMMA Instructions:
2045 // WMMA_*F16, WMMA_*BF16
2046 // WMMA_*FP8FP8
2047 // WMMA_*FP8BF8
2048 // WMMA_*BF8FP8
2049 // WMMA_*BF8BF8
2050 // WMMA_*F8F6F4 if SRCA & SRCB != F8
2051 return Latency == 8 && SIInstrInfo::isWMMA(MI);
2052
2053 case 1: // Dense WMMA Instructions:
2054 // WMMA_IU8
2055 // WMMA_IU4
2056 // WMMA_*F8F6F4 if SRCA OR SRCB == F8
2057 return Latency == 16 && SIInstrInfo::isWMMA(MI);
2058
2059 case 2: // Dense SWMMAC Instructions
2060 // SWMMAC_*F16, SWMMAC_*BF16,
2061 // SWMMAC_*FP8FP8
2062 // SWMMAC_*BF8FP8
2063 // SWMMAC_*FP8BF8
2064 // SWMMAC_*BF8BF8
2065 return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
2066
2067 case 3: // Sparse WMMA Instructions:
2068 // SWMMAC_IU8
2069 // SWMMAC_IU4
2070 return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
2071 default:
2072 break;
2073 } // end switch.
2074
2075 return false;
2076}
2077
2078int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
2079 if (!AMDGPU::isGFX1250(ST))
2080 return 0;
2081
2082 const SIInstrInfo *TII = ST.getInstrInfo();
2083 if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
2084 return 0;
2085
2086 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2087
2088 // WaitStates here is the number of V_NOPs or unrelated VALU instructions that
2089 // must be placed between the first WMMA and the second instruction to cover
2090 // the hazard (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if
2091 // the second is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data
2092 // hazards" for the numbers, which depend on the category of the first WMMA.
2093 const int WMMAWaitStates[] = {5, 9, 3, 5};
2094 const int VALUWaitStates[] = {4, 8, 2, 4};
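// Both tables are indexed by the WMMA category (0-3) that
// IsWMMAHazardInstInCategory checks in the hazard callbacks below.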
2095 unsigned Category = 0;
2096
2097 auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
2098 if (!TII->isXDLWMMA(I))
2099 return false;
2100
2101 unsigned Latency = TSchedModel.computeInstrLatency(&I);
2102 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2103 return false;
2104
2105 Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2106 Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
2107 Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
2108
2109 // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2110 if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
2111 return true;
2112
2113 if (SIInstrInfo::isSWMMAC(*MI)) {
2114 Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2115 if (TRI->regsOverlap(D0, Idx1))
2116 return true;
2117 }
2118
2119 return false;
2120 };
2121
2122 auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
2123 if (!TII->isXDLWMMA(I))
2124 return false;
2125
2126 unsigned Latency = TSchedModel.computeInstrLatency(&I);
2127 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2128 return false;
2129
2130 // WMMA writes, VALU reads.
2131 Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2132 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
2133 if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
2134 return true;
2135 }
2136
2137 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
2138 if (!ValuDst || !ValuDst->isReg())
2139 return false;
2140 Register D1 = ValuDst->getReg();
2141
2142 // WMMA writes, VALU writes.
2143 if (TRI->regsOverlap(D0, D1))
2144 return true;
2145
2146 // WMMA reads, VALU writes.
2147 Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
2148 Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
2149 if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
2150 return true;
2151
2152 if (SIInstrInfo::isSWMMAC(I)) {
2153 Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
2154 if (TRI->regsOverlap(D1, Idx0))
2155 return true;
2156 }
2157
2158 return false;
2159 };
2160
2161 int Limit = 0;
2162
2163 auto GetWaitStatesFn = [](const MachineInstr &I) {
2164 return SIInstrInfo::isVALU(I) ? 1 : 0;
2165 };
2166
2167 int WaitStatesNeeded = -1;
2168 if (TII->isXDLWMMA(*MI)) {
2169 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2170 Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
2171 // 'getWaitStatesSince' returns the number of VALUs in between if hazard
2172 // exists, and INT_MAX if there is no hazard. As a result, a negative
2173 // WaitStatesNeeded here means no hazard, and we will continue to search
2174 // for other categories.
2175 WaitStatesNeeded =
2176 Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);
2177 }
2178 } else { // Must be a co-executable VALU.
2179 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2180 Limit = VALUWaitStates[Category]; // for IsExpiredFn.
2181 // 'getWaitStatesSince' returns the number of VALUs in between if hazard
2182 // exists, and INT_MAX if there is no hazard. As a result, a negative
2183 // WaitStatesNeeded here means no hazard, and we will continue to search
2184 // for other categories.
2185 WaitStatesNeeded =
2186 Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);
2187 }
2188 }
2189
2190 return WaitStatesNeeded;
2191}
2192
2193bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
2194 if (!ST.hasShift64HighRegBug())
2195 return false;
2196 assert(!ST.hasExtendedWaitCounts());
2197
2198 switch (MI->getOpcode()) {
2199 default:
2200 return false;
2201 case AMDGPU::V_LSHLREV_B64_e64:
2202 case AMDGPU::V_LSHRREV_B64_e64:
2203 case AMDGPU::V_ASHRREV_I64_e64:
2204 break;
2205 }
2206
2207 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
2208 if (!Amt->isReg())
2209 return false;
2210
2211 Register AmtReg = Amt->getReg();
2212 const MachineRegisterInfo &MRI = MF.getRegInfo();
2213 // Check if this is the last VGPR in the allocation block.
2214 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2215 return false;
2216
2217 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
2218 return false;
2219
2220 assert(ST.needsAlignedVGPRs());
2221 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2222
2223 const DebugLoc &DL = MI->getDebugLoc();
2224 MachineBasicBlock *MBB = MI->getParent();
2225 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
2226
2227 // In:
2228 //
2229 // Dst = shiftrev64 Amt, Src1
2230 //
2231 // if Dst!=Src1 then avoid the bug with:
2232 //
2233 // Dst.sub0 = Amt
2234 // Dst = shift64 Dst.sub0, Src1
2235
2236 Register DstReg = MI->getOperand(0).getReg();
2237 if (!Src1->isReg() || Src1->getReg() != DstReg) {
2238 Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
2239 runOnInstruction(
2240 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo).add(*Amt));
2241 Amt->setReg(DstLo);
2242 Amt->setIsKill(true);
2243 return true;
2244 }
2245
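// Here Dst aliases Src1, so the simple copy above cannot be used. Find a
// scratch VGPR (an aligned VGPR pair when Dst also overlaps Amt), V_SWAP the
// shift amount into it (after a full S_WAITCNT), rewrite MI to use it, and
// swap the registers back after MI.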
2246 bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);
2247 Register NewReg;
2248 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2249 : AMDGPU::VGPR_32RegClass) {
2250 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
2251 NewReg = Reg;
2252 break;
2253 }
2254 }
2255
2256 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2257 : NewReg;
2258 Register NewAmtLo;
2259
2260 if (Overlapped)
2261 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2262
2263 // Insert a full wait count because the found register might have a pending wait.
2264 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
2265 .addImm(0);
2266
2267 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
2268 if (Overlapped)
2269 runOnInstruction(
2270 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
2271 .addDef(AmtReg - 1)
2272 .addReg(AmtReg - 1, RegState::Undef)
2273 .addReg(NewAmtLo, RegState::Undef));
2274 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
2275 .addDef(AmtReg)
2276 .addReg(AmtReg, RegState::Undef)
2277 .addReg(NewAmt, RegState::Undef));
2278
2279 // Instructions emitted after the current instruction will be processed by the
2280 // parent loop of the hazard recognizer in a natural way.
2281 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2282 AmtReg)
2283 .addDef(NewAmt)
2284 .addReg(NewAmt)
2285 .addReg(AmtReg);
2286 if (Overlapped)
2287 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2288 AmtReg - 1)
2289 .addDef(NewAmtLo)
2290 .addReg(NewAmtLo)
2291 .addReg(AmtReg - 1);
2292
2293 // Re-running the hazard recognizer on the modified instruction is not needed:
2294 // the inserted V_SWAP_B32 instructions have already both read and written the
2295 // new registers, so hazards related to these registers have already been handled.
2296 Amt->setReg(NewAmt);
2297 Amt->setIsKill(false);
2298 // We do not update liveness, so verifier may see it as undef.
2299 Amt->setIsUndef();
2300 if (Overlapped) {
2301 MI->getOperand(0).setReg(NewReg);
2302 Src1->setReg(NewReg);
2303 Src1->setIsKill(false);
2304 Src1->setIsUndef();
2305 }
2306
2307 return true;
2308}
2309
2310int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
2311 int NSAtoVMEMWaitStates = 1;
2312
2313 if (!ST.hasNSAtoVMEMBug())
2314 return 0;
2315
2316 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
2317 return 0;
2318
2319 const SIInstrInfo *TII = ST.getInstrInfo();
2320 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2321 if (!Offset || (Offset->getImm() & 6) == 0)
2322 return 0;
2323
2324 auto IsHazardFn = [TII](const MachineInstr &I) {
2325 if (!SIInstrInfo::isMIMG(I))
2326 return false;
2327 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2328 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2329 TII->getInstSizeInBytes(I) >= 16;
2330 };
2331
2332 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2333}
2334
2335int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
2336 int FPAtomicToDenormModeWaitStates = 3;
2337
2338 if (!ST.hasFPAtomicToDenormModeHazard())
2339 return 0;
2340 assert(!ST.hasExtendedWaitCounts());
2341
2342 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2343 return 0;
2344
2345 auto IsHazardFn = [](const MachineInstr &I) {
2346 if (!SIInstrInfo::isVMEM(I))
2347 return false;
2348 return SIInstrInfo::isFPAtomic(I);
2349 };
2350
2351 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2352 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2353 return true;
2354
2355 return SIInstrInfo::isWaitcnt(MI.getOpcode());
2356 };
2357
2358 return FPAtomicToDenormModeWaitStates -
2359 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2360}
2361
2362int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2363 assert(SIInstrInfo::isMFMA(*MI));
2364
2365 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2366}
2367
2368int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2369 // Early exit if no padding is requested.
2370 if (MFMAPaddingRatio == 0)
2371 return 0;
2372
2373 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2374 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2375 return 0;
2376
2377 int NeighborMFMALatency = 0;
2378 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2379 this](const MachineInstr &MI) {
2380 if (!SIInstrInfo::isMFMA(MI))
2381 return false;
2382
2383 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2384 return true;
2385 };
2386
2387 const int MaxMFMAPipelineWaitStates = 16;
2388 int WaitStatesSinceNeighborMFMA =
2389 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2390
2391 int NeighborMFMAPaddingNeeded =
2392 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2393 WaitStatesSinceNeighborMFMA;
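// E.g. with amdgpu-mfma-padding-ratio=50 and a 16-cycle neighboring MFMA, up to
// 16 * 50 / 100 = 8 wait states of padding are requested, minus any states that
// have already elapsed since that MFMA.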
2394
2395 return std::max(0, NeighborMFMAPaddingNeeded);
2396}
2397
2398int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2399 int WaitStatesNeeded = 0;
2400 unsigned Opc = MI->getOpcode();
2401
2402 auto IsVALUFn = [](const MachineInstr &MI) {
2403 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2404 };
2405
2406 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2407 const int LegacyVALUWritesVGPRWaitStates = 2;
2408 const int VALUWritesExecWaitStates = 4;
2409 const int MaxWaitStates = 4;
2410
2411 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2412 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2413 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2414
2415 if (WaitStatesNeeded < MaxWaitStates) {
2416 for (const MachineOperand &Use : MI->explicit_uses()) {
2417 const int MaxWaitStates = 2;
2418
2419 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2420 continue;
2421
2422 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2423 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2424 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2425
2426 if (WaitStatesNeeded == MaxWaitStates)
2427 break;
2428 }
2429 }
2430 }
2431
2432 for (const MachineOperand &Op : MI->explicit_operands()) {
2433 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2434 continue;
2435
2436 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2437 continue;
2438
2439 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2440 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2441 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2442 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2443 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2444 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2445 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2446 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2447 const int MaxWaitStates = 18;
2448 Register Reg = Op.getReg();
2449 unsigned HazardDefLatency = 0;
2450
2451 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2452 this](const MachineInstr &MI) {
2453 if (!SIInstrInfo::isMFMA(MI))
2454 return false;
2455 Register DstReg = MI.getOperand(0).getReg();
2456 if (DstReg == Reg)
2457 return false;
2458 HazardDefLatency =
2459 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2460 return TRI.regsOverlap(DstReg, Reg);
2461 };
2462
2463 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2464 MaxWaitStates);
2465 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2466 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2467 int OpNo = Op.getOperandNo();
2468 if (OpNo == SrcCIdx) {
2469 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2470 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2471 switch (HazardDefLatency) {
2472 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2473 break;
2474 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2475 break;
2476 case 16: [[fallthrough]];
2477 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2478 break;
2479 }
2480 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2481 switch (HazardDefLatency) {
2482 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2483 break;
2484 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2485 break;
2486 case 16: [[fallthrough]];
2487 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2488 break;
2489 }
2490 }
2491
2492 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2493 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2494
2495 if (WaitStatesNeeded == MaxWaitStates)
2496 return WaitStatesNeeded; // Early exit.
2497
2498 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2499 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2500 return false;
2501 Register DstReg = MI.getOperand(0).getReg();
2502 return TRI.regsOverlap(Reg, DstReg);
2503 };
2504
2505 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2506 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2507 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2508 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2509 if (OpNo == SrcCIdx)
2510 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2511 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2512 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2513
2514 WaitStatesNeededForUse = NeedWaitStates -
2515 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2516 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2517
2518 if (WaitStatesNeeded == MaxWaitStates)
2519 return WaitStatesNeeded; // Early exit.
2520 }
2521
2522 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2523 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2524 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2525 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2526 const int MaxWaitStates = 13;
2527 Register DstReg = MI->getOperand(0).getReg();
2528 unsigned HazardDefLatency = 0;
2529
2530 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2531 this](const MachineInstr &MI) {
2532 if (!SIInstrInfo::isMFMA(MI))
2533 return false;
2534 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2535 HazardDefLatency =
2536 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2537 return TRI.regsOverlap(Reg, DstReg);
2538 };
2539
2540 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2541 int NeedWaitStates;
2542 switch (HazardDefLatency) {
2543 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2544 break;
2545 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2546 break;
2547 case 16: [[fallthrough]];
2548 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2549 break;
2550 }
2551
2552 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2553 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2554 }
2555
2556 // Pad neighboring MFMA with noops for better inter-wave performance.
2557 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2558
2559 return WaitStatesNeeded;
2560}
2561
2562static int
2564 bool IsGFX950) {
2565 // xdl def cycles | gfx940 | gfx950
2566 // 2 pass | 3 4
2567 // 4 pass | 5 6
2568 // 8 pass | 9 10
2569 // 16 pass | 17 18
2570 return NumPasses + 1 + IsGFX950;
2571}
2572
2573static int
2575 bool IsGFX950) {
2576 // xdl def cycles | gfx940 | gfx950
2577 // 2 pass | 3 3
2578 // 4 pass | 5 6
2579 // 8 pass | 9 10
2580 // 16 pass | 17 18
2581 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2582}
2583
2584static int
2586 // 2 pass -> 2
2587 // 4 pass -> 4
2588 // 8 pass -> 8
2589 // 16 pass -> 16
2590 return NumPasses;
2591}
2592
2593static int
2595 // 2 pass -> 4
2596 // 4 pass -> 6
2597 // 8 pass -> 10
2598 // 16 pass -> 18
2599 return NumPasses + 2;
2600}
2601
2603 bool IsGFX950) {
2604 // xdl def cycles | gfx942 | gfx950
2605 // 2 pass | 5 5
2606 // 4 pass | 7 8
2607 // 8 pass | 11 12
2608 // 16 pass | 19 20
2609 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2610}
2611
2612int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2613 int WaitStatesNeeded = 0;
2614 unsigned Opc = MI->getOpcode();
2615
2616 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2617 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2618 };
2619
2620 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2621 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2622 !SIInstrInfo::isDOT(MI);
2623 };
2624
2625 if (!SIInstrInfo::isMFMA(*MI))
2626 return WaitStatesNeeded;
2627
2628 const int VALUWritesExecWaitStates = 4;
2629 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2630 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2631 VALUWritesExecWaitStates);
2632 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2633
2634 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2635
2636 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2637 for (const MachineOperand &Use : MI->explicit_uses()) {
2638 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2639 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2640 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2641 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2642 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2643 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2644 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2645 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2646 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2647 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2648 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2649 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2650 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2651 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2652 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2653 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2654 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2655 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2656 const int MaxWaitStates = 19;
2657
2658 if (!Use.isReg())
2659 continue;
2660 Register Reg = Use.getReg();
2661 bool FullReg;
2662 const MachineInstr *MI1;
2663
2664 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2665 this](const MachineInstr &MI) {
2666 if (!SIInstrInfo::isMFMA(MI))
2667 return false;
2668 Register DstReg = MI.getOperand(0).getReg();
2669 FullReg = (DstReg == Reg);
2670 MI1 = &MI;
2671 return TRI.regsOverlap(DstReg, Reg);
2672 };
2673
2674 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2675 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2676 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2677
2678 int NumWaitStates =
2679 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2680 if (NumWaitStates == std::numeric_limits<int>::max())
2681 continue;
2682
2683 int OpNo = Use.getOperandNo();
2684 unsigned Opc1 = MI1->getOpcode();
2685 int NeedWaitStates = 0;
2686 if (OpNo == SrcCIdx) {
2687 if (!SIInstrInfo::isDGEMM(Opc) &&
2688 (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opc1))) {
2689 NeedWaitStates = 0;
2690 } else if (FullReg) {
2691 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2692 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2693 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2694 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2695 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2696 else if (ST.hasGFX940Insts() &&
2697 TSchedModel.computeInstrLatency(MI1) == 2)
2698 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2699 } else {
2700 switch (Opc1) {
2701 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2702 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2703 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2704 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2705 if (!TII.isXDL(*MI))
2706 NeedWaitStates =
2707 ST.hasGFX950Insts()
2708 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2709 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2710 break;
2711 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2712 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2713 if (!TII.isXDL(*MI))
2714 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2715 break;
2716 default:
2717 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2718 if (ST.hasGFX940Insts()) {
2719 if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
2720 break;
2721
2722 NeedWaitStates =
2723 TII.isXDL(*MI1)
2724 ? (TII.isXDL(*MI)
2726 NumPasses, ST.hasGFX950Insts())
2728 NumPasses, ST.hasGFX950Insts()))
2730 NumPasses);
2731 break;
2732 }
2733
2734 switch (NumPasses) {
2735 case 2:
2736 NeedWaitStates =
2737 SIInstrInfo::isDGEMM(Opc1)
2738 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2739 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2740 break;
2741 case 8:
2742 NeedWaitStates =
2743 SIInstrInfo::isDGEMM(Opc1)
2744 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2745 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2746 break;
2747 case 16:
2748 NeedWaitStates =
2749 SIInstrInfo::isDGEMM(Opc1)
2750 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2751 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2752 break;
2753 default:
2754 llvm_unreachable("unexpected number of passes");
2755 }
2756 }
2757 }
2758 } else {
2759 switch (Opc1) {
2760 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2761 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2762 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2763 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2764 NeedWaitStates =
2765 ST.hasGFX950Insts()
2766 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2767 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2768 break;
2769 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2770 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2771 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2772 break;
2773 default:
2774 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2775
2776 if (ST.hasGFX940Insts()) {
2777 NeedWaitStates =
2778 TII.isXDL(*MI1)
2780 NumPasses, ST.hasGFX950Insts())
2782 NumPasses);
2783 break;
2784 }
2785
2786 switch (NumPasses) {
2787 case 2:
2788 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2789 break;
2790 case 4:
2791 llvm_unreachable("unexpected number of passes for mfma");
2792 case 8:
2793 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2794 break;
2795 case 16:
2796 default:
2797 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2798 }
2799 }
2800 }
2801 if (WaitStatesNeeded >= NeedWaitStates)
2802 continue;
2803
2804 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2805 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2806
2807 if (WaitStatesNeeded == MaxWaitStates)
2808 break;
2809 }
2810
2811 // Pad neighboring MFMA with noops for better inter-wave performance.
2812 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2813
2814 return WaitStatesNeeded;
2815}
2816
2817int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2818 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2819 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2820 return 0;
2821
2822 int WaitStatesNeeded = 0;
2823
2824 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2825 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2826 };
2827
2828 for (const MachineOperand &Op : MI->explicit_uses()) {
2829 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2830 continue;
2831
2832 Register Reg = Op.getReg();
2833
2834 const int AccVgprReadLdStWaitStates = 2;
2835 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2836 const int MaxWaitStates = 2;
2837
2838 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2839 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2840 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2841
2842 if (WaitStatesNeeded == MaxWaitStates)
2843 return WaitStatesNeeded; // Early exit.
2844
2845 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2846 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2847 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2848 return false;
2849 auto IsVALUFn = [](const MachineInstr &MI) {
2850 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2851 };
2852 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2853 std::numeric_limits<int>::max();
2854 };
2855
2856 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2857 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2858 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2859 }
2860
2861 return WaitStatesNeeded;
2862}
2863
2864int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
2865 assert(!ST.hasVcmpxPermlaneHazard() &&
2866 "this is a different vcmpx+permlane hazard");
2867 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2868 const SIInstrInfo *TII = ST.getInstrInfo();
2869
2870 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2871 return isVCmpXWritesExec(*TII, *TRI, MI);
2872 };
2873
2874 auto IsVALUFn = [](const MachineInstr &MI) {
2875 return SIInstrInfo::isVALU(MI);
2876 };
2877
2878 const int VCmpXWritesExecWaitStates = 4;
2879 const int VALUWritesVDstWaitStates = 2;
2880 int WaitStatesNeeded = 0;
2881
2882 for (const MachineOperand &Op : MI->explicit_uses()) {
2883 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2884 continue;
2885 Register Reg = Op.getReg();
2886
2887 int WaitStatesSinceDef =
2888 VALUWritesVDstWaitStates -
2889 getWaitStatesSinceDef(Reg, IsVALUFn,
2890 /*MaxWaitStates=*/VALUWritesVDstWaitStates);
2891 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2892 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2893 break;
2894 }
2895
2896 int VCmpXHazardWaits =
2897 VCmpXWritesExecWaitStates -
2898 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2899
2900 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2901 return WaitStatesNeeded;
2902}
2903
2905 // 2 pass -> 4
2906 // 4 pass -> 6
2907 // 8 pass -> 10
2908 // 16 pass -> 18
2909 return NumPasses + 2;
2910}
2911
2913 bool IsGFX950) {
2914 // xdl def cycles | gfx942 | gfx950
2915 // 2 pass | 5 5
2916 // 4 pass | 7 8
2917 // 8 pass | 11 12
2918 // 16 pass | 19 20
2919 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2920}
2921
2923 bool IsGFX950) {
2924 // xdl def cycles | gfx942 | gfx950
2925 // 2 pass | 5 5
2926 // 4 pass | 7 8
2927 // 8 pass | 11 12
2928 // 16 pass | 19 20
2929 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2930}
2931
2933 // 2 pass -> 4
2934 // 4 pass -> 6
2935 // 8 pass -> 10
2936 // 16 pass -> 18
2937 return NumPasses + 2;
2938}
2939
2940int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2941 if (!ST.hasGFX90AInsts())
2942 return 0;
2943
2944 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2945 return SIInstrInfo::isDGEMM(MI.getOpcode());
2946 };
2947
2948 // This is checked in checkMAIHazards90A()
2949 if (SIInstrInfo::isMFMA(*MI))
2950 return 0;
2951
2952 const MachineRegisterInfo &MRI = MF.getRegInfo();
2953
2954 int WaitStatesNeeded = 0;
2955
2956 bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
2957 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2958 bool IsVALU = SIInstrInfo::isVALU(*MI);
2959
2960 const MachineInstr *MFMA = nullptr;
2961 unsigned Reg;
2962 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2963 if (!SIInstrInfo::isMFMA(MI) ||
2964 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2965 return false;
2966 MFMA = &MI;
2967 return true;
2968 };
2969
2970 const MachineInstr *DOT = nullptr;
2971 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2972 if (!SIInstrInfo::isDOT(MI) ||
2973 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2974 return false;
2975 DOT = &MI;
2976 return true;
2977 };
2978
2979 bool DGEMMAfterVALUWrite = false;
2980 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2981 // Found DGEMM on reverse traversal to def.
2982 if (SIInstrInfo::isDGEMM(MI.getOpcode()))
2983 DGEMMAfterVALUWrite = true;
2984
2985 // Only a hazard if the register is defined by a VALU and a DGEMM is found
2986 // after the def.
2987 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2988 return false;
2989
2990 return true;
2991 };
2992
2993 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2994 AMDGPU::OpName::src2);
2995
2996 if (IsMemOrExport || IsVALU) {
2997 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2998 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2999 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3000 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3001 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3002 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3003 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3004 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3005 const int DotWriteSameDotReadSrcAB = 3;
3006 const int DotWriteDifferentVALURead = 3;
3007 const int DMFMABetweenVALUWriteVMEMRead = 2;
3008 const int MaxWaitStates = 19;
3009
3010 for (const MachineOperand &Use : MI->explicit_uses()) {
3011 if (!Use.isReg())
3012 continue;
3013 Reg = Use.getReg();
3014
3015 DOT = nullptr;
3016 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3017 MaxWaitStates);
3018 if (DOT) {
3019 int NeedWaitStates = 0;
3020 if (DOT->getOpcode() == MI->getOpcode()) {
3021 if (&Use - &MI->getOperand(0) != SrcCIdx)
3022 NeedWaitStates = DotWriteSameDotReadSrcAB;
3023 } else {
3024 NeedWaitStates = DotWriteDifferentVALURead;
3025 }
3026
3027 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3028 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3029 }
3030
3031 // Workaround for HW data hazard bug observed only in GFX90A. When there
3032 // is a DGEMM instruction in-between a VALU and a VMEM instruction, it
3033 // causes the SQ to incorrectly omit the two wait states between the two
3034 // instructions that are needed to avoid the data hazard.
3035 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3036 DGEMMAfterVALUWrite = false;
3037 if (TRI.isVectorRegister(MRI, Reg)) {
3038 int WaitStatesNeededForUse =
3039 DMFMABetweenVALUWriteVMEMRead -
3040 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
3041 DMFMABetweenVALUWriteVMEMRead);
3042
3043 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3044 }
3045 }
3046
3047 MFMA = nullptr;
3048 WaitStatesSinceDef =
3049 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3050 if (!MFMA)
3051 continue;
3052
3053 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3054 int NumPasses = HazardDefLatency;
3055 int NeedWaitStates = MaxWaitStates;
3056
3057 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3058 switch (HazardDefLatency) {
3059 case 4:
3060 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3061 : DMFMA4x4WriteVgprVALUReadWaitStates;
3062 break;
3063 case 8:
3064 case 16:
3065 NeedWaitStates =
3066 IsMemOrExport
3067 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3068 : (ST.hasGFX950Insts()
3069 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3070 : DMFMA16x16WriteVgprVALUReadWaitStates);
3071 break;
3072 default:
3073 llvm_unreachable("unexpected dgemm");
3074 }
3075 } else if (ST.hasGFX940Insts()) {
3076 NeedWaitStates =
3077 TII.isXDL(*MFMA)
3079 NumPasses, ST.hasGFX950Insts())
3081 NumPasses);
3082 } else {
3083 switch (HazardDefLatency) {
3084 case 2:
3085 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3086 break;
3087 case 8:
3088 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3089 break;
3090 case 16:
3091 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3092 break;
3093 default:
3094 llvm_unreachable("unexpected number of passes for mfma");
3095 }
3096 }
3097
3098 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3099 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3100
3101 if (WaitStatesNeeded == MaxWaitStates)
3102 break;
3103 }
3104 }
3105
3106 unsigned Opc = MI->getOpcode();
3107 const int DMFMAToFMA64WaitStates = 2;
3108 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
3109 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
3110 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3111 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3112 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3113 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3114 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3115 }
3116
3117 if (!IsVALU && !IsMemOrExport)
3118 return WaitStatesNeeded;
3119
3120 for (const MachineOperand &Def : MI->defs()) {
3121 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3122 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3123 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3124 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3125 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3126 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3127 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3128 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3129 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3130 const int DotWriteDifferentVALUWrite = 3;
3131 const int MaxWaitStates = 19;
3132 const int MaxWarWaitStates = 15;
3133
3134 Reg = Def.getReg();
3135
3136 DOT = nullptr;
3137 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3138 MaxWaitStates);
3139 if (DOT && DOT->getOpcode() != MI->getOpcode())
3140 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3141 WaitStatesSinceDef);
3142
3143 MFMA = nullptr;
3144 WaitStatesSinceDef =
3145 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3146 if (MFMA) {
3147 int NeedWaitStates = MaxWaitStates;
3148 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
3149
3150 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3151 switch (NumPasses) {
3152 case 4:
3153 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3154 break;
3155 case 8:
3156 case 16:
3157 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3158 break;
3159 default:
3160 llvm_unreachable("unexpected number of cycles for dgemm");
3161 }
3162 } else if (ST.hasGFX940Insts()) {
3163 NeedWaitStates =
3164 TII.isXDL(*MFMA)
3166 NumPasses, ST.hasGFX950Insts())
3168 } else {
3169 switch (NumPasses) {
3170 case 2:
3171 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3172 break;
3173 case 8:
3174 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3175 break;
3176 case 16:
3177 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3178 break;
3179 default:
3180 llvm_unreachable("Unexpected number of passes for mfma");
3181 }
3182 }
3183
3184 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3185 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3186
3187 if (WaitStatesNeeded == MaxWaitStates)
3188 break;
3189 }
3190
3191 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3192 if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
3193 !MI.readsRegister(Reg, &TRI))
3194 return false;
3195
3196 if (ST.hasGFX940Insts() && !TII.isXDL(MI))
3197 return false;
3198
3199 const MachineOperand *SrcC =
3200 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
3201 assert(SrcC);
3202 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
3203 return false;
3204
3205 MFMA = &MI;
3206 return true;
3207 };
3208
3209 MFMA = nullptr;
3210 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3211 MaxWarWaitStates);
3212 if (!MFMA)
3213 continue;
3214
3215 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3216 int NeedWaitStates = MaxWaitStates;
3217 switch (HazardDefLatency) {
3218 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3219 break;
3220 case 4: assert(ST.hasGFX940Insts());
3221 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3222 break;
3223 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3224 break;
3225 case 16: [[fallthrough]];
3226 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3227 break;
3228 }
3229
3230 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3231 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3232 }
3233
3234 return WaitStatesNeeded;
3235}
3236
3237bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
3238 if (!SU->isInstr())
3239 return false;
3240
3241 const MachineInstr *MAI = nullptr;
3242
3243 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3244 MAI = nullptr;
3245 if (SIInstrInfo::isMFMA(MI))
3246 MAI = &MI;
3247 return MAI != nullptr;
3248 };
3249
3250 MachineInstr *MI = SU->getInstr();
3251 if (IsMFMAFn(*MI)) {
3252 int W = getWaitStatesSince(IsMFMAFn, 16);
3253 if (MAI)
3254 return W < (int)TSchedModel.computeInstrLatency(MAI);
3255 }
3256
3257 return false;
3258}
3259
3260// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3261// insertion of a new instruction.
3262static void updateGetPCBundle(MachineInstr *NewMI) {
3263 if (!NewMI->isBundled())
3264 return;
3265
3266 // Find start of bundle.
3267 auto I = NewMI->getIterator();
3268 while (I->isBundledWithPred())
3269 I--;
3270 if (I->isBundle())
3271 I++;
3272
3273 // Bail if this is not an S_GETPC bundle.
3274 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3275 return;
3276
3277 // Update offsets of any references in the bundle.
3278 const unsigned NewBytes = 4;
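// The inserted S_WAITCNT_DEPCTR encodes to a single 4-byte instruction, so any
// PC-relative offsets computed from the S_GETPC_B64 result in this bundle must
// be advanced by that amount.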
3279 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3280 "Unexpected instruction insertion in bundle");
3281 auto NextMI = std::next(NewMI->getIterator());
3282 auto End = NewMI->getParent()->end();
3283 while (NextMI != End && NextMI->isBundledWithPred()) {
3284 for (auto &Operand : NextMI->operands()) {
3285 if (Operand.isGlobal())
3286 Operand.setOffset(Operand.getOffset() + NewBytes);
3287 }
3288 NextMI++;
3289 }
3290}
3291
3292bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
3293 if (!ST.hasVALUMaskWriteHazard())
3294 return false;
3295 assert(!ST.hasExtendedWaitCounts());
3296
3297 if (!ST.isWave64())
3298 return false;
3299
3300 const bool IsSALU = SIInstrInfo::isSALU(*MI);
3301 const bool IsVALU = SIInstrInfo::isVALU(*MI);
3302 if (!IsSALU && !IsVALU)
3303 return false;
3304
3305 // The hazard sequence is three instructions:
3306 // 1. VALU reads SGPR as mask
3307 // 2. VALU/SALU writes SGPR
3308 // 3. VALU/SALU reads SGPR
3309 // The hazard can expire if the distance between 2 and 3 is sufficient,
3310 // or (2) is VALU and (3) is SALU.
3311 // In practice this happens <10% of the time, hence always assume the hazard
3312 // exists if (1) and (2) are present to avoid searching all SGPR reads.
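// For example (illustrative sequence only):
//   v_cndmask_b32 v0, v1, v2, s[4:5]   ; (1) VALU reads s[4:5] as a mask
//   s_mov_b64 s[4:5], s[6:7]           ; (2) SALU writes s[4:5] -> wait needed
//   s_cmp_eq_u64 s[4:5], 0             ; (3) subsequent read of s[4:5]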
3313
3314 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3315 const MachineRegisterInfo &MRI = MF.getRegInfo();
3316
3317 auto IgnoreableSGPR = [](const Register Reg) {
3318 switch (Reg) {
3319 case AMDGPU::EXEC:
3320 case AMDGPU::EXEC_LO:
3321 case AMDGPU::EXEC_HI:
3322 case AMDGPU::M0:
3323 case AMDGPU::SGPR_NULL:
3324 case AMDGPU::SGPR_NULL64:
3325 case AMDGPU::SCC:
3326 return true;
3327 default:
3328 return false;
3329 }
3330 };
3331 auto IsVCC = [](const Register Reg) {
3332 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
3333 };
3334
3335 struct StateType {
3336 SmallSet<Register, 2> HazardSGPRs;
3337
3338 static unsigned getHashValue(const StateType &State) {
3339 return hash_combine_range(State.HazardSGPRs);
3340 }
3341 static bool isEqual(const StateType &LHS, const StateType &RHS) {
3342 return LHS.HazardSGPRs == RHS.HazardSGPRs;
3343 }
3344 };
3345
3346 SmallVector<const MachineInstr *> WaitInstrs;
3347 bool HasSGPRRead = false;
3348 StateType InitialState;
3349
3350 // Look for SGPR write.
3351 MachineOperand *HazardDef = nullptr;
3352 for (MachineOperand &Op : MI->operands()) {
3353 if (!Op.isReg())
3354 continue;
3355 if (Op.isDef() && HazardDef)
3356 continue;
3357
3358 Register Reg = Op.getReg();
3359 if (IgnoreableSGPR(Reg))
3360 continue;
3361 if (!IsVCC(Reg)) {
3362 if (Op.isImplicit())
3363 continue;
3364 if (!TRI->isSGPRReg(MRI, Reg))
3365 continue;
3366 }
3367 // Also check for SGPR reads.
3368 if (Op.isUse()) {
3369 HasSGPRRead = true;
3370 continue;
3371 }
3372
3373 assert(!HazardDef);
3374 HazardDef = &Op;
3375 }
3376
3377 if (!HazardDef)
3378 return false;
3379
3380 // Setup to track writes to individual SGPRs
3381 const Register HazardReg = HazardDef->getReg();
3382 if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
3383 InitialState.HazardSGPRs.insert(HazardReg);
3384 } else {
3385 assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
3386 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
3387 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
3388 }
3389
3390 auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
3391 if (State.HazardSGPRs.empty())
3392 return HazardExpired;
3393
3394 switch (I.getOpcode()) {
3395 case AMDGPU::V_ADDC_U32_e32:
3396 case AMDGPU::V_ADDC_U32_dpp:
3397 case AMDGPU::V_CNDMASK_B16_t16_e32:
3398 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3399 case AMDGPU::V_CNDMASK_B16_t16_dpp:
3400 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3401 case AMDGPU::V_CNDMASK_B32_e32:
3402 case AMDGPU::V_CNDMASK_B32_dpp:
3403 case AMDGPU::V_DIV_FMAS_F32_e64:
3404 case AMDGPU::V_DIV_FMAS_F64_e64:
3405 case AMDGPU::V_SUBB_U32_e32:
3406 case AMDGPU::V_SUBB_U32_dpp:
3407 case AMDGPU::V_SUBBREV_U32_e32:
3408 case AMDGPU::V_SUBBREV_U32_dpp: {
3409 // These implicitly read VCC as mask source.
3410 return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
3411 }
3412 case AMDGPU::V_ADDC_U32_e64:
3413 case AMDGPU::V_ADDC_U32_e64_dpp:
3414 case AMDGPU::V_CNDMASK_B16_t16_e64:
3415 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3416 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3417 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3418 case AMDGPU::V_CNDMASK_B32_e64:
3419 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3420 case AMDGPU::V_SUBB_U32_e64:
3421 case AMDGPU::V_SUBB_U32_e64_dpp:
3422 case AMDGPU::V_SUBBREV_U32_e64:
3423 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3424 // Only check mask register overlaps.
3425 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
3426 assert(SSRCOp);
3427 bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
3428 return Result ? HazardFound : NoHazardFound;
3429 }
3430 default:
3431 return NoHazardFound;
3432 }
3433 };
3434
3435 const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
3436 AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0),
3437 0),
3438 0);
3439 auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
3440 switch (I.getOpcode()) {
3441 case AMDGPU::S_WAITCNT_DEPCTR:
3442 // Record mergeable waits within a region of instructions free of SGPR reads.
3443 if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
3444 (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
3445 WaitInstrs.push_back(&I);
3446 break;
3447 default:
3448 // Update tracking of SGPR reads and writes.
3449 for (auto &Op : I.operands()) {
3450 if (!Op.isReg())
3451 continue;
3452
3453 Register Reg = Op.getReg();
3454 if (IgnoreableSGPR(Reg))
3455 continue;
3456 if (!IsVCC(Reg)) {
3457 if (Op.isImplicit())
3458 continue;
3459 if (!TRI->isSGPRReg(MRI, Reg))
3460 continue;
3461 }
3462 if (Op.isUse()) {
3463 HasSGPRRead = true;
3464 continue;
3465 }
3466
3467 // Stop tracking any SGPRs with writes on the basis that they will
3468 // already have an appropriate wait inserted afterwards.
3469 SmallVector<Register, 2> Found;
3470 for (Register SGPR : State.HazardSGPRs) {
3471 if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
3472 Found.push_back(SGPR);
3473 }
3474 for (Register SGPR : Found)
3475 State.HazardSGPRs.erase(SGPR);
3476 }
3477 break;
3478 }
3479 };
3480
3481 // Check for hazard
3482 if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
3483 MI->getParent(),
3484 std::next(MI->getReverseIterator())))
3485 return false;
3486
3487 // Compute counter mask
3488 unsigned DepCtr =
3489 IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
3490 : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
3491 : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);
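// The field to wait on depends on the producer: va_vcc for a VALU write of VCC,
// va_sdst for a VALU write of another SGPR, and sa_sdst for a SALU write.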
3492
3493 // Try to merge previous waits into this one for regions with no SGPR reads.
3494 if (!WaitInstrs.empty()) {
3495 // Note: WaitInstrs contains const pointers, so walk backward from MI to
3496 // obtain a mutable pointer to each instruction to be merged.
3497 // This is expected to be a very short walk within the same block.
3498 SmallVector<MachineInstr *> ToErase;
3499 unsigned Found = 0;
3500 for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
3501 End = MI->getParent()->rend();
3502 Found < WaitInstrs.size() && It != End; ++It) {
3503 MachineInstr *WaitMI = &*It;
3504 // Find next wait instruction.
3505 if (std::as_const(WaitMI) != WaitInstrs[Found])
3506 continue;
3507 Found++;
3508 unsigned WaitMask = WaitMI->getOperand(0).getImm();
3509 assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
3510 DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
3511 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
3512 AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
3513 DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
3514 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
3515 AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
3516 DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
3517 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
3518 AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
3519 ToErase.push_back(WaitMI);
3520 }
3521 assert(Found == WaitInstrs.size());
3522 for (MachineInstr *WaitMI : ToErase)
3523 WaitMI->eraseFromParent();
3524 }
3525
3526 // Add s_waitcnt_depctr after SGPR write.
3527 auto NextMI = std::next(MI->getIterator());
3528 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3529 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3530 .addImm(DepCtr);
3531
3532 // SALU write may be s_getpc in a bundle.
3533 updateGetPCBundle(NewMI);
3534
3535 return true;
3536}
3537
3538static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3539 const SIInstrInfo &TII) {
3540 MachineBasicBlock &EntryMBB = MF->front();
3541 if (EntryMBB.begin() != EntryMBB.end()) {
3542 auto &EntryMI = *EntryMBB.begin();
3543 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3544 EntryMI.getOperand(0).getImm() >= Priority)
3545 return false;
3546 }
3547
3548 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3549 .addImm(Priority);
3550 return true;
3551}
3552
3553bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3554 if (!ST.hasRequiredExportPriority())
3555 return false;
3556
3557 // Assume the following shader types will never have exports,
3558 // and avoid adding or adjusting S_SETPRIO.
3559 MachineBasicBlock *MBB = MI->getParent();
3560 MachineFunction *MF = MBB->getParent();
3561 auto CC = MF->getFunction().getCallingConv();
3562 switch (CC) {
3563 case CallingConv::AMDGPU_CS:
3564 case CallingConv::AMDGPU_CS_Chain:
3565 case CallingConv::AMDGPU_CS_ChainPreserve:
3566 case CallingConv::AMDGPU_KERNEL:
3567 return false;
3568 default:
3569 break;
3570 }
3571
3572 const int MaxPriority = 3;
3573 const int NormalPriority = 2;
3574 const int PostExportPriority = 0;
3575
3576 auto It = MI->getIterator();
3577 switch (MI->getOpcode()) {
3578 case AMDGPU::S_ENDPGM:
3579 case AMDGPU::S_ENDPGM_SAVED:
3580 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3581 case AMDGPU::SI_RETURN_TO_EPILOG:
3582 // Ensure shader with calls raises priority at entry.
3583 // This ensures correct priority if exports exist in callee.
3584 if (MF->getFrameInfo().hasCalls())
3585 return ensureEntrySetPrio(MF, NormalPriority, TII);
3586 return false;
3587 case AMDGPU::S_SETPRIO: {
3588 // Raise minimum priority unless in workaround.
3589 auto &PrioOp = MI->getOperand(0);
3590 int Prio = PrioOp.getImm();
3591 bool InWA = (Prio == PostExportPriority) &&
3592 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3593 if (InWA || Prio >= NormalPriority)
3594 return false;
3595 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3596 return true;
3597 }
3598 default:
3599 if (!TII.isEXP(*MI))
3600 return false;
3601 break;
3602 }
3603
3604 // Check entry priority at each export (as there will only be a few).
3605 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3606 bool Changed = false;
3607 if (CC != CallingConv::AMDGPU_Gfx)
3608 Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3609
3610 auto NextMI = std::next(It);
3611 bool EndOfShader = false;
3612 if (NextMI != MBB->end()) {
3613 // Only need WA at end of sequence of exports.
3614 if (TII.isEXP(*NextMI))
3615 return Changed;
3616 // Assume appropriate S_SETPRIO after export means WA already applied.
3617 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3618 NextMI->getOperand(0).getImm() == PostExportPriority)
3619 return Changed;
3620 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3621 }
3622
3623 const DebugLoc &DL = MI->getDebugLoc();
3624
3625 // Lower priority.
3626 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3627 .addImm(PostExportPriority);
3628
3629 if (!EndOfShader) {
3630 // Wait for exports to complete.
3631 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3632 .addReg(AMDGPU::SGPR_NULL)
3633 .addImm(0);
3634 }
3635
3636 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3637 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3638
3639 if (!EndOfShader) {
3640 // Return to normal (higher) priority.
3641 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3642 .addImm(NormalPriority);
3643 }
3644
3645 return true;
3646}
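
For reference, the branches above emit the following post-export sequence: priority is dropped to PostExportPriority, then (unless the shader ends immediately) an S_WAITCNT_EXPCNT plus two S_NOPs cover outstanding exports before priority is restored to NormalPriority. The helper below is a hypothetical, standalone summary of that ordering and is not part of the pass:

#include <cassert>
#include <string>
#include <vector>

// Sketch: list the instructions fixRequiredExportPriority inserts after the
// final export, depending on whether the block ends with S_ENDPGM.
std::vector<std::string> postExportSequence(bool EndOfShader) {
  std::vector<std::string> Seq;
  Seq.push_back("S_SETPRIO 0");                // Lower priority.
  if (!EndOfShader)
    Seq.push_back("S_WAITCNT_EXPCNT null, 0"); // Wait for exports to complete.
  Seq.push_back("S_NOP 0");
  Seq.push_back("S_NOP 0");
  if (!EndOfShader)
    Seq.push_back("S_SETPRIO 2");              // Return to normal priority.
  return Seq;
}

int main() {
  assert(postExportSequence(/*EndOfShader=*/true).size() == 3);
  assert(postExportSequence(/*EndOfShader=*/false).size() == 5);
  return 0;
}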
3647
3648bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
3649 if (!isSGetReg(MI->getOpcode()))
3650 return false;
3651
3652 const SIInstrInfo *TII = ST.getInstrInfo();
3653 switch (getHWReg(TII, *MI)) {
3654 default:
3655 return false;
3660 break;
3661 }
3662
3663 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3664 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3665 .addImm(0);
3666 return true;
3667}
3668
3669bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
3670 if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3671 return false;
3672
3673 const SIInstrInfo *TII = ST.getInstrInfo();
3674 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3675 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3677 BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
3678 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3680
3681 return true;
3682}
3683
3684bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
3685 // No reason to check this in pre-RA scheduling: SGPRs have to be allocated
3686 // for the hazard to trigger.
3687 if (!IsHazardRecognizerMode)
3688 return false;
3689
3690 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3691 const SIInstrInfo *TII = ST.getInstrInfo();
3692 // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
3693 const int FlatScrBaseWaitStates = 10;
3694
3695 bool ReadsFlatScrLo =
3696 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3697 bool ReadsFlatScrHi =
3698 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3699 if (isSGetReg(MI->getOpcode())) {
3700 switch (getHWReg(TII, *MI)) {
3701 default:
3702 break;
3703 case AMDGPU::Hwreg::ID_FLAT_SCR_LO:
3704 ReadsFlatScrLo = true;
3705 break;
3706 case AMDGPU::Hwreg::ID_FLAT_SCR_HI:
3707 ReadsFlatScrHi = true;
3708 break;
3709 }
3710 }
3711
3712 const MachineRegisterInfo &MRI = MF.getRegInfo();
3713
3714 auto IsRegDefHazard = [&](Register Reg) -> bool {
3715 DenseSet<const MachineBasicBlock *> Visited;
3716 auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
3717 return MI.modifiesRegister(Reg, TRI);
3718 };
3719
3720 // This repurposes the wait-state count: instead of wait states it
3721 // returns 1 when an SGPR is written and 0 otherwise.
3722 auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
3723 if (!TII->isSALU(MI) && !TII->isVALU(MI))
3724 return 0;
3725 for (const MachineOperand &MO : MI.all_defs()) {
3726 if (TRI->isSGPRReg(MRI, MO.getReg()))
3727 return 1;
3728 }
3729 return 0;
3730 };
3731
3732 auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
3733 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3734 unsigned Wait = MI.getOperand(0).getImm();
3735 if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0 &&
3736 AMDGPU::DepCtr::decodeFieldVaSdst(Wait) == 0)
3737 return true;
3738 }
3739 return SgprWrites >= FlatScrBaseWaitStates;
3740 };
3741
3742 return ::getWaitStatesSince(
3743 IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
3744 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
3745 };
3746
3747 if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
3748 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3749 (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
3750 !IsRegDefHazard(AMDGPU::SGPR103)))
3751 return false;
3752
3753 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3754 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3755 .addImm(AMDGPU::DepCtr::encodeFieldVaSdst(
3756 AMDGPU::DepCtr::encodeFieldSaSdst(0, ST), 0));
3757 return true;
3758}
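
The scan above reuses the generic wait-state walker but makes the "wait state" callback return 1 only for SALU/VALU instructions that write an SGPR, so the accumulated count is really a count of intervening SGPR writes, and the hazard is treated as expired once FlatScrBaseWaitStates of them (or a suitable s_waitcnt_depctr) have been seen. A minimal standalone sketch of that counting-scan pattern, with made-up ToyInstr and scanForHazard names:

#include <cassert>
#include <functional>
#include <vector>

struct ToyInstr {
  bool WritesSGPR = false;       // analogue of an SALU/VALU SGPR def
  bool DefinesHazardReg = false; // analogue of a write to SGPR102/SGPR103
};

// Walk backward from the reader; report a hazard only if a def of the hazard
// register is reached before Limit counted events have been seen.
bool scanForHazard(const std::vector<ToyInstr> &Block, size_t ReaderIdx,
                   int Limit,
                   const std::function<int(const ToyInstr &)> &CountEvents) {
  int Events = 0;
  for (size_t I = ReaderIdx; I-- > 0;) {
    if (Block[I].DefinesHazardReg)
      return Events < Limit; // hazard only if it has not expired yet
    Events += CountEvents(Block[I]);
    if (Events >= Limit)
      return false;          // enough SGPR writes in between: expired
  }
  return false;
}

int main() {
  auto CountSGPRWrites = [](const ToyInstr &MI) { return MI.WritesSGPR ? 1 : 0; };
  std::vector<ToyInstr> Block(12);
  Block[0].DefinesHazardReg = true;
  for (size_t I = 1; I < 4; ++I)
    Block[I].WritesSGPR = true; // only 3 intervening SGPR writes
  assert(scanForHazard(Block, 11, /*Limit=*/10, CountSGPRWrites));
  for (size_t I = 1; I < 11; ++I)
    Block[I].WritesSGPR = true; // now 10 intervening SGPR writes
  assert(!scanForHazard(Block, 11, 10, CountSGPRWrites));
  return 0;
}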
3759
3760bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3761 if (!isSSetReg(MI->getOpcode()) ||
3762 MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
3763 return false;
3764
3765 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3766 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3767 return true;
3768}