LLVM 23.0.0git
GCNHazardRecognizer.cpp
Go to the documentation of this file.
1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "AMDGPUWaitcntUtils.h"
15#include "GCNSubtarget.h"
18#include "llvm/ADT/Statistic.h"
23#include "llvm/Support/Debug.h"
25
26using namespace llvm;
27
28#define DEBUG_TYPE "gcn-hazard-recognizer"
29
30STATISTIC(NumWMMANopsHoisted,
31 "Number of WMMA hazard V_NOPs hoisted from loops");
32STATISTIC(NumWMMAHoistingBailed,
33 "Number of WMMA hazards where V_NOP hoisting was not possible");
34
35namespace {
36
37struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
38 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
39
40 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
41 if (Arg.getAsInteger(0, Value))
42 return O.error("'" + Arg + "' value invalid for uint argument!");
43
44 if (Value > 100)
45 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
46
47 return false;
48 }
49};
50
51} // end anonymous namespace
52
54 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
55 cl::desc("Fill a percentage of the latency between "
56 "neighboring MFMA with s_nops."));
57
58// This is intended for debugging purposes only.
60 NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
61 cl::desc("Insert a s_nop x before every instruction"));
62
64 "amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden,
65 cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
66
67//===----------------------------------------------------------------------===//
68// Hazard Recognizer Implementation
69//===----------------------------------------------------------------------===//
70
72 const GCNSubtarget &ST);
73
75 MachineLoopInfo *MLI)
76 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
77 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
78 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
79 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
80 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
81 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
82}
83
85 EmittedInstrs.clear();
86}
87
91
93 CurrCycleInstr = MI;
94}
95
96static bool isDivFMas(unsigned Opcode) {
97 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
98}
99
100static bool isSGetReg(unsigned Opcode) {
101 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
102}
103
104static bool isSSetReg(unsigned Opcode) {
105 switch (Opcode) {
106 case AMDGPU::S_SETREG_B32:
107 case AMDGPU::S_SETREG_B32_mode:
108 case AMDGPU::S_SETREG_IMM32_B32:
109 case AMDGPU::S_SETREG_IMM32_B32_mode:
110 return true;
111 }
112 return false;
113}
114
115static bool isRWLane(unsigned Opcode) {
116 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
117}
118
119static bool isRFE(unsigned Opcode) {
120 return Opcode == AMDGPU::S_RFE_B64;
121}
122
123static bool isSMovRel(unsigned Opcode) {
124 switch (Opcode) {
125 case AMDGPU::S_MOVRELS_B32:
126 case AMDGPU::S_MOVRELS_B64:
127 case AMDGPU::S_MOVRELD_B32:
128 case AMDGPU::S_MOVRELD_B64:
129 return true;
130 default:
131 return false;
132 }
133}
134
136 const MachineInstr &MI) {
137 if (TII.isAlwaysGDS(MI.getOpcode()))
138 return true;
139
140 switch (MI.getOpcode()) {
141 case AMDGPU::S_SENDMSG:
142 case AMDGPU::S_SENDMSGHALT:
143 case AMDGPU::S_TTRACEDATA:
144 return true;
145 // These DS opcodes don't support GDS.
146 case AMDGPU::DS_NOP:
147 case AMDGPU::DS_PERMUTE_B32:
148 case AMDGPU::DS_BPERMUTE_B32:
149 return false;
150 default:
151 if (TII.isDS(MI.getOpcode())) {
152 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
153 AMDGPU::OpName::gds);
154 if (MI.getOperand(GDS).getImm())
155 return true;
156 }
157 return false;
158 }
159}
160
161static bool isPermlane(const MachineInstr &MI) {
162 unsigned Opcode = MI.getOpcode();
163 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
164 Opcode == AMDGPU::V_PERMLANE64_B32 ||
165 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
167 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
169 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
170 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
171 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
173 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
175 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
176 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
177}
178
179static bool isLdsDma(const MachineInstr &MI) {
180 return SIInstrInfo::isVALU(MI) &&
182}
183
184static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
185 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
186 AMDGPU::OpName::simm16);
187 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
188}
189
192 MachineInstr *MI = SU->getInstr();
193 // If we are not in "HazardRecognizerMode" and therefore not being run from
194 // the scheduler, track possible stalls from hazards but don't insert noops.
195 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
196
197 if (MI->isBundle())
198 return NoHazard;
199
200 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
201 return HazardType;
202
203 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
204 return HazardType;
205
206 if (checkFPAtomicToDenormModeHazard(MI) > 0)
207 return HazardType;
208
209 // Hazards which cannot be mitigated with S_NOPs.
210 if (!IsHazardRecognizerMode) {
211 if (checkWMMACoexecutionHazards(MI) > 0)
212 return Hazard;
213 }
214
215 if (ST.hasNoDataDepHazard())
216 return NoHazard;
217
218 if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
219 return HazardType;
220
221 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
222 return HazardType;
223
224 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
225 return HazardType;
226
227 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
228 return HazardType;
229
230 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
231 return HazardType;
232
235 checkMAIVALUHazards(MI) > 0)
236 return HazardType;
237
238 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
239 return HazardType;
240
241 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
242 return HazardType;
243
244 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
245 return HazardType;
246
247 if (((ST.hasReadM0MovRelInterpHazard() &&
248 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
249 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
250 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
251 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
252 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
253 (ST.hasReadM0LdsDirectHazard() &&
254 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
255 checkReadM0Hazards(MI) > 0)
256 return HazardType;
257
258 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
259 return HazardType;
260
262 checkMAILdStHazards(MI) > 0)
263 return HazardType;
264
265 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
266 return HazardType;
267
268 return NoHazard;
269}
270
272 unsigned Quantity) {
273 while (Quantity > 0) {
274 unsigned Arg = std::min(Quantity, 8u);
275 Quantity -= Arg;
276 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
277 .addImm(Arg - 1);
278 }
279}
280
281unsigned
282GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
283 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
284 assert(TSchedModel.getWriteProcResBegin(SC) !=
285 TSchedModel.getWriteProcResEnd(SC));
286 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
287}
288
289void GCNHazardRecognizer::processBundle() {
290 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
291 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
292 // Check bundled MachineInstr's for hazards.
293 for (; MI != E && MI->isInsideBundle(); ++MI) {
294 CurrCycleInstr = &*MI;
295 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
296
297 if (IsHazardRecognizerMode) {
298 fixHazards(CurrCycleInstr);
299
300 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
301 }
302
303 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
304 // include the bundled MI directly after, only add a maximum of
305 // (MaxLookAhead - 1) noops to EmittedInstrs.
306 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
307 EmittedInstrs.push_front(nullptr);
308
309 EmittedInstrs.push_front(CurrCycleInstr);
310 EmittedInstrs.resize(MaxLookAhead);
311 }
312 CurrCycleInstr = nullptr;
313}
314
315void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
316 assert(IsHazardRecognizerMode);
317
318 unsigned NumPreNoops = PreEmitNoops(MI);
319 EmitNoops(NumPreNoops);
320 if (MI->isInsideBundle())
321 insertNoopsInBundle(MI, TII, NumPreNoops);
322 else
323 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
324 NumPreNoops);
326 AdvanceCycle();
327}
328
330 IsHazardRecognizerMode = true;
331 CurrCycleInstr = MI;
332 unsigned W = PreEmitNoopsCommon(MI);
333 fixHazards(MI);
334 CurrCycleInstr = nullptr;
335 return std::max(W, NopPadding.getValue());
336}
337
341
343 if (MI->isBundle())
344 return 0;
345
346 int WaitStates = 0;
347
349 return std::max(WaitStates, checkSMRDHazards(MI));
350
351 if (ST.hasNSAtoVMEMBug())
352 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
353
354 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
355
356 if (ST.hasNoDataDepHazard())
357 return WaitStates;
358
360 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
361
363 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
364
366 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
367
368 if (isDivFMas(MI->getOpcode()))
369 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
370
371 if (isRWLane(MI->getOpcode()))
372 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
373
376 checkMAIVALUHazards(MI) > 0)
377 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
378
379 if (MI->isInlineAsm())
380 return std::max(WaitStates, checkInlineAsmHazards(MI));
381
382 if (isSGetReg(MI->getOpcode()))
383 return std::max(WaitStates, checkGetRegHazards(MI));
384
385 if (isSSetReg(MI->getOpcode()))
386 return std::max(WaitStates, checkSetRegHazards(MI));
387
388 if (isRFE(MI->getOpcode()))
389 return std::max(WaitStates, checkRFEHazards(MI));
390
391 if ((ST.hasReadM0MovRelInterpHazard() &&
392 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
393 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
394 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
395 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
396 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
397 (ST.hasReadM0LdsDirectHazard() &&
398 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
399 return std::max(WaitStates, checkReadM0Hazards(MI));
400
402 return std::max(WaitStates, checkMAIHazards(MI));
403
405 return std::max(WaitStates, checkMAILdStHazards(MI));
406
407 if (ST.hasGFX950Insts() && isPermlane(*MI))
408 return std::max(WaitStates, checkPermlaneHazards(MI));
409
410 return WaitStates;
411}
412
414 EmittedInstrs.push_front(nullptr);
415}
416
418 // When the scheduler detects a stall, it will call AdvanceCycle() without
419 // emitting any instructions.
420 if (!CurrCycleInstr) {
421 EmittedInstrs.push_front(nullptr);
422 return;
423 }
424
425 if (CurrCycleInstr->isBundle()) {
426 processBundle();
427 return;
428 }
429
430 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
431 if (!NumWaitStates) {
432 CurrCycleInstr = nullptr;
433 return;
434 }
435
436 // Keep track of emitted instructions
437 EmittedInstrs.push_front(CurrCycleInstr);
438
439 // Add a nullptr for each additional wait state after the first. Make sure
440 // not to add more than getMaxLookAhead() items to the list, since we
441 // truncate the list to that size right after this loop.
442 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
443 i < e; ++i) {
444 EmittedInstrs.push_front(nullptr);
445 }
446
447 // getMaxLookahead() is the largest number of wait states we will ever need
448 // to insert, so there is no point in keeping track of more than that many
449 // wait states.
450 EmittedInstrs.resize(getMaxLookAhead());
451
452 CurrCycleInstr = nullptr;
453}
454
456 assert(!IsHazardRecognizerMode &&
457 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
458}
459
460//===----------------------------------------------------------------------===//
461// Helper Functions
462//===----------------------------------------------------------------------===//
463
465
466// Search for a hazard in a block and its predecessors.
467template <typename StateT>
468static bool
469hasHazard(StateT InitialState,
470 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
471 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
472 const MachineBasicBlock *InitialMBB,
474 struct StateMapKey {
476 unsigned Idx;
477 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
478 return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
479 }
480 };
481 struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
482 static unsigned getHashValue(const StateMapKey &Key) {
483 return StateT::getHashValue((*Key.States)[Key.Idx]);
484 }
485 static unsigned getHashValue(const StateT &State) {
486 return StateT::getHashValue(State);
487 }
488 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
489 return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
490 }
491 static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
492 return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
493 }
494 };
495
498
500 const MachineBasicBlock *MBB = InitialMBB;
501 StateT State = InitialState;
502
504 unsigned WorkIdx = 0;
505 for (;;) {
506 bool Expired = false;
507 for (auto E = MBB->instr_rend(); I != E; ++I) {
508 // No need to look at parent BUNDLE instructions.
509 if (I->isBundle())
510 continue;
511
512 auto Result = IsHazard(State, *I);
513 if (Result == HazardFound)
514 return true;
515 if (Result == HazardExpired) {
516 Expired = true;
517 break;
518 }
519
520 if (I->isInlineAsm() || I->isMetaInstruction())
521 continue;
522
523 UpdateState(State, *I);
524 }
525
526 if (!Expired) {
527 unsigned StateIdx = States.size();
528 StateMapKey Key = {&States, StateIdx};
529 auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
530 if (Insertion.second) {
531 States.emplace_back(State);
532 } else {
533 StateIdx = Insertion.first->second;
534 }
535 for (MachineBasicBlock *Pred : MBB->predecessors())
536 Worklist.insert(std::pair(Pred, StateIdx));
537 }
538
539 if (WorkIdx == Worklist.size())
540 break;
541
542 unsigned StateIdx;
543 std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
544 State = States[StateIdx];
545 I = MBB->instr_rbegin();
546 }
547
548 return false;
549}
550
551// Returns the minimum number of wait states since \p I, walking all
552// predecessors. Only scans while \p IsExpired returns false.
553// Can only be run in a hazard recognizer mode.
554static int
556 const MachineBasicBlock *MBB,
558 int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
562 for (auto E = MBB->instr_rend(); I != E; ++I) {
563 // Don't add WaitStates for parent BUNDLE instructions.
564 if (I->isBundle())
565 continue;
566
567 if (IsHazard(*I))
568 return WaitStates;
569
570 if (I->isInlineAsm())
571 continue;
572
573 WaitStates += GetNumWaitStates(*I);
574
575 if (IsExpired(*I, WaitStates))
576 return std::numeric_limits<int>::max();
577 }
578
579 int MinWaitStates = std::numeric_limits<int>::max();
580 for (MachineBasicBlock *Pred : MBB->predecessors()) {
581 if (!Visited.insert(Pred).second)
582 continue;
583
584 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
585 IsExpired, Visited, GetNumWaitStates);
586
587 MinWaitStates = std::min(MinWaitStates, W);
588 }
589
590 return MinWaitStates;
591}
592
593static int
595 const MachineInstr *MI,
600 return getWaitStatesSince(IsHazard, MI->getParent(),
601 std::next(MI->getReverseIterator()), 0, IsExpired,
602 Visited, GetNumWaitStates);
603}
604
605int GCNHazardRecognizer::getWaitStatesSince(
606 IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) const {
607 if (IsHazardRecognizerMode) {
608 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
609 return WaitStates >= Limit;
610 };
611 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
612 GetNumWaitStates);
613 }
614
615 int WaitStates = 0;
616 for (MachineInstr *MI : EmittedInstrs) {
617 if (MI) {
618 if (IsHazard(*MI))
619 return WaitStates;
620
621 if (MI->isInlineAsm())
622 continue;
623 }
624 WaitStates += MI ? GetNumWaitStates(*MI) : 1;
625
626 if (WaitStates >= Limit)
627 break;
628 }
629 return std::numeric_limits<int>::max();
630}
631
632int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
633 int Limit) const {
634 return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates);
635}
636
637int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
638 IsHazardFn IsHazardDef,
639 int Limit) const {
640 const SIRegisterInfo *TRI = ST.getRegisterInfo();
641
642 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
643 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
644 };
645
646 return getWaitStatesSince(IsHazardFn, Limit);
647}
648
649int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
650 int Limit) const {
651 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
652 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
653 };
654
655 return getWaitStatesSince(IsHazardFn, Limit);
656}
657
658//===----------------------------------------------------------------------===//
659// No-op Hazard Detection
660//===----------------------------------------------------------------------===//
661
662static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
663 MCRegister Reg) {
664 for (MCRegUnit Unit : TRI.regunits(Reg))
665 BV.set(static_cast<unsigned>(Unit));
666}
667
668static void addRegsToSet(const SIRegisterInfo &TRI,
670 BitVector &DefSet, BitVector &UseSet) {
671 for (const MachineOperand &Op : Ops) {
672 if (Op.isReg())
673 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
674 }
675}
676
677void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) const {
678 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
679}
680
682 return !SIInstrInfo::isSMRD(*MI);
683}
684
686 return !SIInstrInfo::isVMEM(*MI);
687}
688
689int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) const {
690 // SMEM soft clause are only present on VI+, and only matter if xnack is
691 // enabled.
692 if (!ST.isXNACKEnabled())
693 return 0;
694
695 bool IsSMRD = TII.isSMRD(*MEM);
696
697 resetClause();
698
699 // A soft-clause is any group of consecutive SMEM instructions. The
700 // instructions in this group may return out of order and/or may be
701 // replayed (i.e. the same instruction issued more than once).
702 //
703 // In order to handle these situations correctly we need to make sure that
704 // when a clause has more than one instruction, no instruction in the clause
705 // writes to a register that is read by another instruction in the clause
706 // (including itself). If we encounter this situation, we need to break the
707 // clause by inserting a non SMEM instruction.
708
709 for (MachineInstr *MI : EmittedInstrs) {
710 // When we hit a non-SMEM instruction then we have passed the start of the
711 // clause and we can stop.
712 if (!MI)
713 break;
714
716 break;
717
718 addClauseInst(*MI);
719 }
720
721 if (ClauseDefs.none())
722 return 0;
723
724 // We need to make sure not to put loads and stores in the same clause if they
725 // use the same address. For now, just start a new clause whenever we see a
726 // store.
727 if (MEM->mayStore())
728 return 1;
729
730 addClauseInst(*MEM);
731
732 // If the set of defs and uses intersect then we cannot add this instruction
733 // to the clause, so we have a hazard.
734 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
735}
736
737int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) const {
738 int WaitStatesNeeded = 0;
739
740 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
741
742 // This SMRD hazard only affects SI.
743 if (!ST.hasSMRDReadVALUDefHazard())
744 return WaitStatesNeeded;
745
746 // A read of an SGPR by SMRD instruction requires 4 wait states when the
747 // SGPR was written by a VALU instruction.
748 int SmrdSgprWaitStates = 4;
749 auto IsHazardDefFn = [this](const MachineInstr &MI) {
750 return TII.isVALU(MI);
751 };
752 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
753 return TII.isSALU(MI);
754 };
755
756 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
757
758 for (const MachineOperand &Use : SMRD->uses()) {
759 if (!Use.isReg())
760 continue;
761 int WaitStatesNeededForUse =
762 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
763 SmrdSgprWaitStates);
764 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
765
766 // This fixes what appears to be undocumented hardware behavior in SI where
767 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
768 // needs some number of nops in between. We don't know how many we need, but
769 // let's use 4. This wasn't discovered before probably because the only
770 // case when this happens is when we expand a 64-bit pointer into a full
771 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
772 // probably never encountered in the closed-source land.
773 if (IsBufferSMRD) {
774 int WaitStatesNeededForUse =
775 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
776 IsBufferHazardDefFn,
777 SmrdSgprWaitStates);
778 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
779 }
780 }
781
782 return WaitStatesNeeded;
783}
784
785int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) const {
786 if (!ST.hasVMEMReadSGPRVALUDefHazard())
787 return 0;
788
789 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
790
791 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
792 // SGPR was written by a VALU Instruction.
793 const int VmemSgprWaitStates = 5;
794 auto IsHazardDefFn = [this](const MachineInstr &MI) {
795 return TII.isVALU(MI);
796 };
797 for (const MachineOperand &Use : VMEM->uses()) {
798 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
799 continue;
800
801 int WaitStatesNeededForUse =
802 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
803 VmemSgprWaitStates);
804 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
805 }
806 return WaitStatesNeeded;
807}
808
809int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) const {
810 const SIRegisterInfo *TRI = ST.getRegisterInfo();
811 const SIInstrInfo *TII = ST.getInstrInfo();
812
813 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
814 int DppVgprWaitStates = 2;
815 int DppExecWaitStates = 5;
816 int WaitStatesNeeded = 0;
817 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
818 return TII->isVALU(MI);
819 };
820
821 for (const MachineOperand &Use : DPP->uses()) {
822 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
823 continue;
824 int WaitStatesNeededForUse =
825 DppVgprWaitStates - getWaitStatesSinceDef(
826 Use.getReg(),
827 [](const MachineInstr &) { return true; },
828 DppVgprWaitStates);
829 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
830 }
831
832 WaitStatesNeeded = std::max(
833 WaitStatesNeeded,
834 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
835 DppExecWaitStates));
836
837 return WaitStatesNeeded;
838}
839
840int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) const {
841 const SIInstrInfo *TII = ST.getInstrInfo();
842
843 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
844 // instruction.
845 const int DivFMasWaitStates = 4;
846 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
847 return TII->isVALU(MI);
848 };
849 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
850 DivFMasWaitStates);
851
852 return DivFMasWaitStates - WaitStatesNeeded;
853}
854
855int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) const {
856 const SIInstrInfo *TII = ST.getInstrInfo();
857 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
858
859 const int GetRegWaitStates = 2;
860 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
861 return GetRegHWReg == getHWReg(TII, MI);
862 };
863 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
864
865 return GetRegWaitStates - WaitStatesNeeded;
866}
867
868int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) const {
869 const SIInstrInfo *TII = ST.getInstrInfo();
870 unsigned HWReg = getHWReg(TII, *SetRegInstr);
871
872 const int SetRegWaitStates = ST.getSetRegWaitStates();
873 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
874 return HWReg == getHWReg(TII, MI);
875 };
876 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
877 return SetRegWaitStates - WaitStatesNeeded;
878}
879
880int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
881 if (!MI.mayStore())
882 return -1;
883
884 const SIInstrInfo *TII = ST.getInstrInfo();
885 unsigned Opcode = MI.getOpcode();
886 const MCInstrDesc &Desc = MI.getDesc();
887
888 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
889 int VDataRCID = -1;
890 if (VDataIdx != -1)
891 VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);
892
893 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
894 // There is no hazard if the instruction does not use vector regs
895 // (like wbinvl1)
896 if (VDataIdx == -1)
897 return -1;
898 if (AMDGPU::getRegBitWidth(VDataRCID) > 64) {
899 // On gfx940-family the BUFFER_STORE source-vgpr WAR hazard exists for
900 // every SOFFSET shape; the wait-state count differs by SOFFSET, and is
901 // computed in checkVALUHazardsHelper. Pre-gfx940 the hazard only exists
902 // if soffset is not an SGPR.
903 if (ST.hasGFX940Insts())
904 return VDataIdx;
905 const MachineOperand *SOffset =
906 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
907 if (!SOffset || !SOffset->isReg())
908 return VDataIdx;
909 }
910 }
911
912 // MIMG instructions create a hazard if they don't use a 256-bit T# and
913 // the store size is greater than 8 bytes and they have more than two bits
914 // of their dmask set.
915 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
916 if (TII->isMIMG(MI)) {
917 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
918 assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
919 Desc.operands()[SRsrcIdx])) == 256);
920 (void)SRsrcIdx;
921 }
922
923 if (TII->isFLAT(MI)) {
924 // There is no hazard if the instruction does not use vector regs
925 if (VDataIdx == -1)
926 return -1;
927
928 if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
929 return VDataIdx;
930 }
931
932 return -1;
933}
934
935int GCNHazardRecognizer::checkVALUHazardsHelper(
936 const MachineOperand &Def, const MachineRegisterInfo &MRI) const {
937 // Helper to check for the hazard where VMEM instructions that store more
938 // than 8 bytes can have their store data overwritten by the next
939 // instruction. On gfx940-family the window depends on the producer's
940 // SOFFSET shape:
941 // - MUBUF/MTBUF wide store with sgpr SOFFSET: 1 wait state.
942 // - MUBUF/MTBUF wide store with literal/absent SOFFSET, and FLAT wide
943 // store: 2 wait states.
944 // Pre-gfx940 keeps a single 1-wait-state window. The 1-cycle sgpr-SOFFSET
945 // window was measured on gfx950 (MI350X); the same gate is applied to the
946 // rest of the gfx940 family to match the existing rule's granularity.
947 const SIRegisterInfo *TRI = ST.getRegisterInfo();
948 const SIInstrInfo *TII = ST.getInstrInfo();
949
950 int WaitStatesNeeded = 0;
951 if (!TRI->isVectorRegister(MRI, Def.getReg()))
952 return WaitStatesNeeded;
953 const Register Reg = Def.getReg();
954
955 const int MaxWaitStates = ST.hasGFX940Insts() ? 2 : 1;
956
957 // Per-producer required wait-state window. On pre-gfx940 every producer
958 // uses 1; on gfx940-family MUBUF/MTBUF stores with an SGPR SOFFSET use 1
959 // and everything else (literal/absent SOFFSET, FLAT) uses 2.
960 auto WindowFor = [this, TII](const MachineInstr &MI) -> int {
961 if (!ST.hasGFX940Insts())
962 return 1;
963 if (TII->isBUF(MI)) {
964 const MachineOperand *SOffset =
965 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
966 if (SOffset && SOffset->isReg())
967 return 1;
968 }
969 return 2;
970 };
971
972 // For each hazard producer reached, accumulate the wait states still
973 // needed using that producer's own window. The predicate always returns
974 // false so the walk runs to MaxWaitStates.
975 int Distance = 0;
976 auto Counter = [&](const MachineInstr &MI) {
977 int DataIdx = createsVALUHazard(MI);
978 if (DataIdx >= 0 &&
979 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg)) {
980 int Need = WindowFor(MI) - Distance;
981 WaitStatesNeeded = std::max(WaitStatesNeeded, Need);
982 }
983 // Mirror getWaitStatesSince's accounting, which does not count inline asm
984 // towards the wait-state distance.
985 if (!MI.isInlineAsm())
987 return false;
988 };
989 getWaitStatesSince(Counter, MaxWaitStates);
990
991 return WaitStatesNeeded;
992}
993
994/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
995/// pack the computed value into correct bit position of the dest register. This
996/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
997/// dst_sel that is not aligned to the register. This function analyzes the \p
998/// MI and \returns an operand with dst forwarding issue, or nullptr if
999/// none exists.
1000static const MachineOperand *
1002 if (!SIInstrInfo::isVALU(MI))
1003 return nullptr;
1004
1005 const SIInstrInfo *TII = ST.getInstrInfo();
1006
1007 unsigned Opcode = MI.getOpcode();
1008
1009 // There are three different types of instructions
1010 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
1011 // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
1012 // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
1013 // op_sel[3:2]
1014 // != 0
1015 if (SIInstrInfo::isSDWA(MI)) {
1016 // Type 1: SDWA with dst_sel != DWORD
1017 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
1018 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
1019 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1020 }
1021
1022 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
1023 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
1024 // Type 2: VOP3 which write the hi bits
1025 if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
1027 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1028
1029 // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
1030 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
1031 (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
1033 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1034 }
1035
1036 // Special case: nop is required for all the opsel values for fp4 sr variant
1037 // cvt scale instructions
1038 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
1039 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1040
1041 return nullptr;
1042}
1043
1044/// Checks whether the provided \p VALU "consumes" the operand with a Dest sel
1045/// forwarding issue \p Dst . We may "consume" the Dst via a standard explicit
1046/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
///
/// \returns true if any register operand yielded by VALU->operands() overlaps
/// the register of \p Dst according to \p TRI, false otherwise.
// NOTE(review): the line numbered 1047 (return type, function name and first
// parameter) is absent from this listing; the body refers to that parameter as
// `VALU`. Confirm the exact declarator against the upstream file.
1048 const MachineOperand *Dst,
1049 const SIRegisterInfo *TRI) {
1050 // We must consider implicit reads of the VALU. SDWA with dst_sel and
1051 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
1052 // and we must account for that hazard.
1053 // We also must account for WAW hazards. In particular, WAW with dest
1054 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
1055 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
1056 // check for ECC. Without accounting for this hazard, the ECC will be
1057 // wrong.
1058 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
1059 // complete zeroesHigh16BitsOfDest)
 // No use/def distinction is made here: any overlapping register operand
 // counts as a consumer.
1060 for (auto &Operand : VALU->operands()) {
1061 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1062 return true;
1063 }
1064 }
1065 return false;
1066}
1067
/// Computes how many wait states are still required before \p VALU.
///
/// The result starts at 0 and is raised to the maximum demanded by each of the
/// subtarget-gated checks below (trans forwarding, dst_sel / cvt-scale
/// forwarding, the VDecCoExec group, and the 12-dword store check).
// NOTE(review): the lines numbered 1075 and 1130 are absent from this listing
// (the condition guarding the first `return false` in IsTransDefFn, and the
// declaration of `UseReg` that IsVALUDefSGPRFn captures by reference).
1068int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) const {
1069 int WaitStatesNeeded = 0;
1070
 // Trans forwarding: only checked when VALU itself is not a TRANS instruction.
1071 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
1072 const int TransDefWaitstates = 1;
1073
 // Matches a producer whose vdst overlaps one of VALU's explicit register uses.
1074 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
1076 return false;
1077 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1078 const SIInstrInfo *TII = ST.getInstrInfo();
1079 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
1080
1081 for (const MachineOperand &Use : VALU->explicit_uses()) {
1082 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
1083 return true;
1084 }
1085
1086 return false;
1087 };
1088
1089 int WaitStatesNeededForDef =
1090 TransDefWaitstates -
1091 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1092 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1093 }
1094
 // Dst-sel / cvt-scale forwarding: one wait state after a producer whose
 // forwarded destination is consumed by VALU.
1095 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1096 const int Shift16DefWaitstates = 1;
1097
1098 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1099 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1100 const MachineOperand *ForwardedDst =
1101 getDstSelForwardingOperand(ProducerMI, ST);
1102 if (ForwardedDst) {
1103 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
1104 }
1105
1106 if (ProducerMI.isInlineAsm()) {
1107 // Assume inline asm has dst forwarding hazard
1108 for (auto &Def : ProducerMI.all_defs()) {
1109 if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1110 return true;
1111 }
1112 }
1113
1114 return false;
1115 };
1116
1117 int WaitStatesNeededForDef =
1118 Shift16DefWaitstates -
1119 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1120 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1121 }
1122
1123 if (ST.hasVDecCoExecHazard()) {
1124 const int VALUWriteSGPRVALUReadWaitstates = 2;
1125 const int VALUWriteEXECRWLane = 4;
1126 const int VALUWriteVGPRReadlaneRead = 1;
1127
1128 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1129 const MachineRegisterInfo &MRI = MF.getRegInfo();
 // UseReg is captured by reference, so every query below re-targets this one
 // predicate simply by assigning UseReg before calling getWaitStatesSince.
1131 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1132 if (!SIInstrInfo::isVALU(MI))
1133 return false;
1134 return MI.modifiesRegister(UseReg, TRI);
1135 };
1136
 // Each explicit SGPR use of VALU: 2 wait states after a VALU write.
1137 for (const MachineOperand &Use : VALU->explicit_uses()) {
1138 if (!Use.isReg())
1139 continue;
1140
1141 UseReg = Use.getReg();
1142 if (TRI->isSGPRReg(MRI, UseReg)) {
1143 int WaitStatesNeededForDef =
1144 VALUWriteSGPRVALUReadWaitstates -
1145 getWaitStatesSince(IsVALUDefSGPRFn,
1146 VALUWriteSGPRVALUReadWaitstates);
1147 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1148 }
1149 }
1150
 // VCC reads (readsRegister, so not limited to explicit operands) get the same
 // 2-wait-state requirement.
1151 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1152 UseReg = AMDGPU::VCC;
1153 int WaitStatesNeededForDef =
1154 VALUWriteSGPRVALUReadWaitstates -
1155 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1156 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1157 }
1158
 // Lane instructions: readlane/readfirstlane first check their src0 against a
 // prior VALU write, then fall through to the EXEC check shared with writelane.
1159 switch (VALU->getOpcode()) {
1160 case AMDGPU::V_READLANE_B32:
1161 case AMDGPU::V_READFIRSTLANE_B32: {
1162 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1163 UseReg = Src->getReg();
1164 int WaitStatesNeededForDef =
1165 VALUWriteVGPRReadlaneRead -
1166 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1167 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1168 }
1169 [[fallthrough]];
1170 case AMDGPU::V_WRITELANE_B32: {
1171 UseReg = AMDGPU::EXEC;
1172 int WaitStatesNeededForDef =
1173 VALUWriteEXECRWLane -
1174 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1175 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1176 break;
1177 }
1178 default:
1179 break;
1180 }
1181 }
1182
1183 // This checks for the hazard where VMEM instructions that store more than
1184 // 8 bytes can have their store data overwritten by the next instruction.
1185 if (!ST.has12DWordStoreHazard())
1186 return WaitStatesNeeded;
1187
1188 const MachineRegisterInfo &MRI = MF.getRegInfo();
1189
 // Every explicit def of VALU is checked individually by the helper.
1190 for (const MachineOperand &Def : VALU->defs()) {
1191 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1192 }
1193
1194 return WaitStatesNeeded;
1195}
1196
/// Computes how many wait states are still required before the inline asm
/// statement \p IA, reusing the VALU checks that apply to its operands.
///
/// \returns 0 when none of the relevant subtarget hazards is present, otherwise
/// the maximum requirement found.
// NOTE(review): the line numbered 1214 (the range expression of the operand
// loop) is absent from this listing.
1197int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) const {
1198 // This checks for hazards associated with inline asm statements.
1199 // Since inline asms can contain just about anything, we use this
1200 // to call/leverage other check*Hazard routines. Note that
1201 // this function doesn't attempt to address all possible inline asm
1202 // hazards (good luck), but is a collection of what has been
1203 // problematic thus far.
1204
1205 // see checkVALUHazards()
1206 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1207 !ST.hasCvtScaleForwardingHazard())
1208 return 0;
1209
1210 const MachineRegisterInfo &MRI = MF.getRegInfo();
1211 int WaitStatesNeeded = 0;
1212
 // 12-dword store check: applied to every vector-register def of the asm.
1213 for (const MachineOperand &Op :
1215 if (Op.isReg() && Op.isDef()) {
1216 if (!TRI.isVectorRegister(MRI, Op.getReg()))
1217 continue;
1218
1219 if (ST.has12DWordStoreHazard()) {
1220 WaitStatesNeeded =
1221 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1222 }
1223 }
1224 }
1225
 // NOTE(review): the early-out above also tests hasCvtScaleForwardingHazard(),
 // and checkVALUHazards() gates its equivalent check on
 // hasDstSelForwardingHazard() || hasCvtScaleForwardingHazard(), but this
 // block is gated on hasDstSelForwardingHazard() alone -- confirm whether
 // skipping the check on cvt-scale-only subtargets is intended.
1226 if (ST.hasDstSelForwardingHazard()) {
1227 const int Shift16DefWaitstates = 1;
1228
 // A producer is hazardous when the asm reads or modifies its forwarded dst.
1229 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1230 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1231 // Assume inline asm reads the dst
1232 if (Dst)
1233 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1234 IA->readsRegister(Dst->getReg(), &TRI);
1235
1236 if (ProducerMI.isInlineAsm()) {
1237 // If MI is inline asm, assume it has dst forwarding hazard
1238 for (auto &Def : ProducerMI.all_defs()) {
1239 if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1240 IA->readsRegister(Def.getReg(), &TRI)) {
1241 return true;
1242 }
1243 }
1244 }
1245
1246 return false;
1247 };
1248
1249 int WaitStatesNeededForDef =
1250 Shift16DefWaitstates -
1251 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1252 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1253 }
1254
1255 return WaitStatesNeeded;
1256}
1257
1258int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) const {
1259 const SIInstrInfo *TII = ST.getInstrInfo();
1260 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1261 const MachineRegisterInfo &MRI = MF.getRegInfo();
1262
1263 const MachineOperand *LaneSelectOp =
1264 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1265
1266 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1267 return 0;
1268
1269 Register LaneSelectReg = LaneSelectOp->getReg();
1270 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1271
1272 const int RWLaneWaitStates = 4;
1273 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1274 RWLaneWaitStates);
1275 return RWLaneWaitStates - WaitStatesSince;
1276}
1277
1278int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) const {
1279 if (!ST.hasRFEHazards())
1280 return 0;
1281
1282 const SIInstrInfo *TII = ST.getInstrInfo();
1283
1284 const int RFEWaitStates = 1;
1285
1286 auto IsHazardFn = [TII](const MachineInstr &MI) {
1287 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1288 };
1289 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1290 return RFEWaitStates - WaitStatesNeeded;
1291}
1292
1293int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) const {
1294 const SIInstrInfo *TII = ST.getInstrInfo();
1295 const int ReadM0WaitStates = 1;
1296 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1297 return ReadM0WaitStates -
1298 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1299}
1300
/// Inserts \p WaitStatesNeeded V_NOP_e32 instructions into \p MBB before
/// InsertPt; nothing is inserted when the count is zero or negative.
///
/// When \p IsHoisting is true the nops get an empty DebugLoc, otherwise they
/// take the debug location of the instruction at InsertPt.
// NOTE(review): the line numbered 1302 (the declaration of the InsertPt
// parameter) is absent from this listing.
1301void GCNHazardRecognizer::emitVNops(MachineBasicBlock &MBB,
1303 int WaitStatesNeeded, bool IsHoisting) {
1304 const DebugLoc &DL = IsHoisting ? DebugLoc() : InsertPt->getDebugLoc();
1305 for (int I = 0; I < WaitStatesNeeded; ++I)
1306 BuildMI(MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
1307}
1308
1309void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1310 fixVMEMtoScalarWriteHazards(MI);
1311 fixVcmpxPermlaneHazards(MI);
1312 fixSMEMtoVectorWriteHazards(MI);
1313 fixVcmpxExecWARHazard(MI);
1314 fixLdsBranchVmemWARHazard(MI);
1315 if (ST.hasLdsDirect()) {
1316 fixLdsDirectVALUHazard(MI);
1317 fixLdsDirectVMEMHazard(MI);
1318 }
1319 fixVALUPartialForwardingHazard(MI);
1320 fixVALUTransUseHazard(MI);
1321 fixVALUTransCoexecutionHazards(MI);
1322 fixWMMAHazards(MI); // fall-through if co-execution is enabled.
1323 fixWMMACoexecutionHazards(MI);
1324 fixShift64HighRegBug(MI);
1325 fixVALUMaskWriteHazard(MI);
1326 fixRequiredExportPriority(MI);
1327 if (ST.requiresWaitIdleBeforeGetReg())
1328 fixGetRegWaitIdle(MI);
1329 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1330 fixDsAtomicAsyncBarrierArriveB64(MI);
1331 if (ST.hasScratchBaseForwardingHazard())
1332 fixScratchBaseForwardingHazard(MI);
1333 if (ST.setRegModeNeedsVNOPs())
1334 fixSetRegMode(MI);
1335}
1336
// Returns true when MI is a VOPC instruction, or a compare encoded as VOP3 or
// SDWA, and it also modifies EXEC.
// NOTE(review): the line numbered 1337 (return type, function name and the
// parameters that the body uses as `TII` and `TRI`) is absent from this listing.
1338 const MachineInstr &MI) {
1339 return (TII.isVOPC(MI) ||
1340 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1341 MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1342}
1343
/// Inserts a V_MOV_B32 before the permlane instruction \p MI when an
/// EXEC-writing compare precedes it with no other non-nop VALU in between.
///
/// \returns true if an instruction was inserted, false otherwise.
// NOTE(review): the lines numbered 1372-1373 (the operands appended to the
// BuildMI call, presumably using `Reg` and `IsUndef`) are absent from this
// listing -- confirm against the upstream file.
1344bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1345 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1346 return false;
1347
1348 const SIInstrInfo *TII = ST.getInstrInfo();
1349 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1350 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1351 return isVCmpXWritesExec(*TII, *TRI, MI);
1352 };
1353
 // Any VALU other than the three V_NOP encodings ends the search.
1354 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1355 unsigned Opc = MI.getOpcode();
1356 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1357 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1358 };
1359
 // int max is treated as "no hazard found" by this function.
1360 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1361 std::numeric_limits<int>::max())
1362 return false;
1363
1364 // V_NOP will be discarded by SQ.
1365 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1366 // which is always a VGPR and available.
1367 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1368 Register Reg = Src0->getReg();
1369 bool IsUndef = Src0->isUndef();
1370 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1371 TII->get(AMDGPU::V_MOV_B32_e32))
1374
1375 return true;
1376}
1377
/// Inserts an S_WAITCNT_DEPCTR before \p MI when one of MI's defs is still used
/// by an earlier hazardous instruction and nothing in between has expired the
/// hazard.
///
/// \returns true if an instruction was inserted, false otherwise.
// NOTE(review): the lines numbered 1383, 1392 and 1420 are absent from this
// listing (the condition guarding the second early return, the condition
// guarding the first `return false` in IsHazardFn, and the operand appended to
// the BuildMI call).
1378bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1379 if (!ST.hasVMEMtoScalarWriteHazard())
1380 return false;
1381 assert(!ST.hasExtendedWaitCounts());
1382
1384 return false;
1385
 // Nothing to protect when MI defines no registers.
1386 if (MI->getNumDefs() == 0)
1387 return false;
1388
1389 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1390
 // Hazard: an earlier instruction has a use operand for one of MI's defs.
1391 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1393 return false;
1394
1395 for (const MachineOperand &Def : MI->defs()) {
1396 const MachineOperand *Op =
1397 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1398 if (!Op)
1399 continue;
1400 return true;
1401 }
1402 return false;
1403 };
1404
 // Expired by any VALU, by S_WAITCNT with an all-zero immediate, or by
 // S_WAITCNT_DEPCTR whose decoded vm_vsrc field is 0.
1405 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1406 return SIInstrInfo::isVALU(MI) ||
1407 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1408 !MI.getOperand(0).getImm()) ||
1409 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1410 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1411 };
1412
1413 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1414 std::numeric_limits<int>::max())
1415 return false;
1416
1417 const SIInstrInfo *TII = ST.getInstrInfo();
1418 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1419 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1421 return true;
1422}
1423
1424bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1425 if (!ST.hasSMEMtoVectorWriteHazard())
1426 return false;
1427 assert(!ST.hasExtendedWaitCounts());
1428
1429 if (!SIInstrInfo::isVALU(*MI))
1430 return false;
1431
1432 AMDGPU::OpName SDSTName;
1433 switch (MI->getOpcode()) {
1434 case AMDGPU::V_READLANE_B32:
1435 case AMDGPU::V_READFIRSTLANE_B32:
1436 SDSTName = AMDGPU::OpName::vdst;
1437 break;
1438 default:
1439 SDSTName = AMDGPU::OpName::sdst;
1440 break;
1441 }
1442
1443 const SIInstrInfo *TII = ST.getInstrInfo();
1444 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1445 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1446 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1447 if (!SDST) {
1448 for (const auto &MO : MI->implicit_operands()) {
1449 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1450 SDST = &MO;
1451 break;
1452 }
1453 }
1454 }
1455
1456 if (!SDST)
1457 return false;
1458
1459 const Register SDSTReg = SDST->getReg();
1460 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1461 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1462 };
1463
1464 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1465 if (TII->isSALU(MI)) {
1466 switch (MI.getOpcode()) {
1467 case AMDGPU::S_SETVSKIP:
1468 case AMDGPU::S_VERSION:
1469 case AMDGPU::S_WAITCNT_VSCNT:
1470 case AMDGPU::S_WAITCNT_VMCNT:
1471 case AMDGPU::S_WAITCNT_EXPCNT:
1472 // These instructions cannot not mitigate the hazard.
1473 return false;
1474 case AMDGPU::S_WAITCNT_LGKMCNT:
1475 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1476 return (MI.getOperand(1).getImm() == 0) &&
1477 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1478 case AMDGPU::S_WAITCNT: {
1479 const int64_t Imm = MI.getOperand(0).getImm();
1480 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1481 // DsCnt corresponds to LGKMCnt here.
1482 return Decoded.get(AMDGPU::DS_CNT) == 0;
1483 }
1484 default:
1485 assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
1486 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1487 "unexpected wait count instruction");
1488 // SOPP instructions cannot mitigate the hazard.
1489 if (TII->isSOPP(MI))
1490 return false;
1491 // At this point the SALU can be assumed to mitigate the hazard
1492 // because either:
1493 // (a) it is independent of the at risk SMEM (breaking chain),
1494 // or
1495 // (b) it is dependent on the SMEM, in which case an appropriate
1496 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1497 // SMEM instruction.
1498 return true;
1499 }
1500 }
1501 return false;
1502 };
1503
1504 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1505 std::numeric_limits<int>::max())
1506 return false;
1507
1508 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1509 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1510 .addImm(0);
1511 return true;
1512}
1513
/// Inserts an S_WAITCNT_DEPCTR before the EXEC-writing VALU \p MI when an
/// earlier hazardous instruction reads EXEC and nothing in between has expired
/// the hazard.
///
/// \returns true if an instruction was inserted, false otherwise.
// NOTE(review): the lines numbered 1527 and 1553 are absent from this listing
// (the condition guarding the first `return false` in IsHazardFn, and the
// operand appended to the BuildMI call).
1514bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1515 if (!ST.hasVcmpxExecWARHazard())
1516 return false;
1517 assert(!ST.hasExtendedWaitCounts());
1518
1519 if (!SIInstrInfo::isVALU(*MI))
1520 return false;
1521
1522 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1523 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1524 return false;
1525
1526 auto IsHazardFn = [TRI](const MachineInstr &I) {
1528 return false;
1529 return I.readsRegister(AMDGPU::EXEC, TRI);
1530 };
1531
1532 const SIInstrInfo *TII = ST.getInstrInfo();
 // Expired by a VALU that has an sdst operand or an implicit SGPR def, or by
 // S_WAITCNT_DEPCTR whose decoded sa_sdst field is 0.
1533 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1534 if (SIInstrInfo::isVALU(MI)) {
1535 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1536 return true;
1537 for (auto MO : MI.implicit_operands())
1538 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1539 return true;
1540 }
1541 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1542 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1543 return true;
1544 return false;
1545 };
1546
 // int max is treated as "no hazard found" by this function.
1547 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1548 std::numeric_limits<int>::max())
1549 return false;
1550
1551 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1552 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1554 return true;
1555}
1556
// Returns true only if the subtarget has the LDS-branch-VMEM WAR hazard and the
// function contains both kinds of instruction involved; the scan stops as soon
// as both have been seen.
// NOTE(review): the lines numbered 1557 and 1568 are absent from this listing
// (the declarator with the function name and the `MF` parameter, and the
// statement that updates `HasLds`).
1558 const GCNSubtarget &ST) {
1559 if (!ST.hasLdsBranchVmemWARHazard())
1560 return false;
1561
1562 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1563 // instructions need to appear in the same function.
1564 bool HasLds = false;
1565 bool HasVmem = false;
1566 for (auto &MBB : MF) {
1567 for (auto &MI : MBB) {
1569 HasVmem |= SIInstrInfo::isVMEM(MI);
1570 if (HasLds && HasVmem)
1571 return true;
1572 }
1573 }
1574 return false;
1575}
1576
// Returns true when I is S_WAITCNT_VSCNT with operand 0 equal to SGPR_NULL and
// an immediate operand 1 of zero.
// NOTE(review): the line numbered 1577 (the declarator with the function name
// and the parameter used as `I`) is absent from this listing.
1578 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1579 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1580 !I.getOperand(1).getImm();
1581}
1582
/// Inserts `S_WAITCNT_VSCNT null, 0` before \p MI when MI is one of the two
/// hazard instruction kinds, a branch precedes it, and an instruction of the
/// other kind precedes that branch, with no expiring instruction in between.
///
/// Does nothing unless RunLdsBranchVmemWARHazardFixup is set.
///
/// \returns true if an instruction was inserted, false otherwise.
// NOTE(review): the lines numbered 1591 and 1593 (the two conditions that
// classify an instruction as kind 1 or kind 2 in IsHazardInst) are absent from
// this listing.
1583bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1584 if (!RunLdsBranchVmemWARHazardFixup)
1585 return false;
1586
1587 assert(ST.hasLdsBranchVmemWARHazard());
1588 assert(!ST.hasExtendedWaitCounts());
1589
 // Classifier: 1 or 2 for the two hazard kinds, 0 for everything else.
1590 auto IsHazardInst = [](const MachineInstr &MI) {
1592 return 1;
1594 return 2;
1595 return 0;
1596 };
1597
1598 auto InstType = IsHazardInst(*MI);
1599 if (!InstType)
1600 return false;
1601
 // Outer search expires at any hazard-kind instruction or a zero store-count
 // wait.
1602 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1603 return IsHazardInst(I) || isStoreCountWaitZero(I);
1604 };
1605
 // Outer hazard: a branch that itself has an instruction of the *other* kind
 // before it (found by the nested search started at the branch).
1606 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1607 if (!I.isBranch())
1608 return false;
1609
1610 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1611 auto InstType2 = IsHazardInst(I);
1612 return InstType2 && InstType != InstType2;
1613 };
1614
 // The nested search expires at an instruction of the same kind as MI or at a
 // zero store-count wait.
1615 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1616 auto InstType2 = IsHazardInst(I);
1617 if (InstType == InstType2)
1618 return true;
1619
1620 return isStoreCountWaitZero(I);
1621 };
1622
1623 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1624 std::numeric_limits<int>::max();
1625 };
1626
1627 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1628 std::numeric_limits<int>::max())
1629 return false;
1630
1631 const SIInstrInfo *TII = ST.getInstrInfo();
1632 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1633 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1634 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1635 .addImm(0);
1636
1637 return true;
1638}
1639
/// Sets the waitvdst operand of \p MI from the number of VALU instructions
/// found between MI and the nearest earlier VALU that reads or writes MI's
/// vdst, clamped to at most 15 and forced to 0 if any TRANS was visited.
///
/// \returns true once the operand has been updated; false from the early guard.
// NOTE(review): the lines numbered 1641 and 1660-1661 are absent from this
// listing (the condition guarding the early return, and the return expression
// of IsExpiredFn after the wait-state limit test).
1640bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1642 return false;
1643
1644 const int NoHazardWaitStates = 15;
1645 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1646 const Register VDSTReg = VDST->getReg();
1647
 // VisitedTrans is updated as a side effect for every VALU the search inspects,
 // including the one that finally matches.
1648 bool VisitedTrans = false;
1649 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1650 if (!SIInstrInfo::isVALU(I))
1651 return false;
1652 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1653 // Cover both WAR and WAW
1654 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1655 };
1656 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1657 if (WaitStates >= NoHazardWaitStates)
1658 return true;
1659 // Instructions which cause va_vdst==0 expire hazard
1662 };
 // Only VALU instructions contribute to the wait-state count.
1663 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1664 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1665 };
1666
 // The search begins at the instruction immediately before MI.
1667 DenseSet<const MachineBasicBlock *> Visited;
1668 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1669 std::next(MI->getReverseIterator()), 0,
1670 IsExpiredFn, Visited, GetWaitStatesFn);
1671
1672 // Transcendentals can execute in parallel to other VALUs.
1673 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1674 if (VisitedTrans)
1675 Count = 0;
1676
1677 MachineOperand *WaitVdstOp =
1678 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1679 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1680
1681 return true;
1682}
1683
/// Resolves a hazard between \p MI and an earlier hazardous instruction that
/// reads or writes MI's vdst: when the subtarget supports it, MI's waitvsrc
/// operand is set to 0; otherwise an S_WAITCNT_DEPCTR is inserted before MI.
///
/// \returns true if MI was modified or an instruction inserted, false
/// otherwise.
// NOTE(review): the lines numbered 1685, 1692, 1700 and 1717 are absent from
// this listing (the early-return condition, the condition guarding the first
// `return false` in IsHazardFn, the first term of IsExpiredFn's return
// expression, and the operand appended to the BuildMI call).
1684bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1686 return false;
1687
1688 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1689 const Register VDSTReg = VDST->getReg();
1690
1691 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1693 return false;
1694 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1695 };
1696 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1697 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1698 // according to the type of VMEM instruction.
 // Visible expiry conditions: S_WAITCNT with an all-zero immediate,
 // S_WAITCNT_DEPCTR with decoded vm_vsrc == 0, or (when LdsdirCanWait) an
 // LDSDIR instruction whose waitvsrc immediate is 0.
1699 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1701 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1702 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1703 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1704 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1705 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1706 };
1707
 // int max is treated as "no hazard found" by this function.
1708 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1709 std::numeric_limits<int>::max())
1710 return false;
1711
1712 if (LdsdirCanWait) {
1713 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1714 } else {
1715 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1716 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1718 }
1719
1720 return true;
1721}
1722
/// Detects the VALU partial-forwarding pattern described in the comment below
/// for the wave64 VALU \p MI and, if found, inserts an S_WAITCNT_DEPCTR before
/// it.
///
/// \returns true if an instruction was inserted, false otherwise.
// NOTE(review): the lines numbered 1786-1787, 1865 and 1875 are absent from
// this listing (the start of the "va_vdst==0" expiry condition, the condition
// under which UpdateStateFn increments the VALU count, and the operand
// appended to the BuildMI call).
1723bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1724 if (!ST.hasVALUPartialForwardingHazard())
1725 return false;
1726 assert(!ST.hasExtendedWaitCounts());
1727
1728 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1729 return false;
1730
 // Collect the distinct VGPRs that MI reads explicitly.
1731 SmallSetVector<Register, 4> SrcVGPRs;
1732
1733 for (const MachineOperand &Use : MI->explicit_uses()) {
1734 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1735 SrcVGPRs.insert(Use.getReg());
1736 }
1737
1738 // Only applies with >= 2 unique VGPR sources
1739 if (SrcVGPRs.size() <= 1)
1740 return false;
1741
1742 // Look for the following pattern:
1743 // Va <- VALU [PreExecPos]
1744 // intv1
1745 // Exec <- SALU [ExecPos]
1746 // intv2
1747 // Vb <- VALU [PostExecPos]
1748 // intv3
1749 // MI Va, Vb (WaitState = 0)
1750 //
1751 // Where:
1752 // intv1 + intv2 <= 2 VALUs
1753 // intv3 <= 4 VALUs
1754 //
1755 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1756
1757 const int Intv1plus2MaxVALUs = 2;
1758 const int Intv3MaxVALUs = 4;
1759 const int IntvMaxVALUs = 6;
1760 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1761
 // Search state carried along each path of the walk. Positions are values of
 // the VALUs counter at the time of the event; the walk starts just before MI,
 // so larger positions are farther from MI. getHashValue/isEqual are
 // presumably what hasHazard uses to de-duplicate states -- TODO confirm.
1762 struct StateType {
1763 SmallDenseMap<Register, int, 4> DefPos;
1764 int ExecPos = std::numeric_limits<int>::max();
1765 int VALUs = 0;
1766
1767 static unsigned getHashValue(const StateType &State) {
1768 return hash_combine(State.ExecPos, State.VALUs,
1769 hash_combine_range(State.DefPos));
1770 }
1771 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1772 return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
1773 LHS.VALUs == RHS.VALUs;
1774 }
1775 };
1776
1777 StateType State;
1778
1779 // This overloads expiry testing with all the hazard detection
1780 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1781 // Too many VALU states have passed
1782 if (State.VALUs > NoHazardVALUWaitStates)
1783 return HazardExpired;
1784
1785 // Instructions which cause va_vdst==0 expire hazard
1788 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1789 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1790 return HazardExpired;
1791
1792 // Track registers writes
 // Only the first (nearest to MI) VALU write of each source VGPR is recorded,
 // and an EXEC write by a SALU is only recorded once at least one such VGPR
 // write has been seen.
1793 bool Changed = false;
1794 if (SIInstrInfo::isVALU(I)) {
1795 for (Register Src : SrcVGPRs) {
1796 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1797 State.DefPos[Src] = State.VALUs;
1798 Changed = true;
1799 }
1800 }
1801 } else if (SIInstrInfo::isSALU(I)) {
1802 if (State.ExecPos == std::numeric_limits<int>::max()) {
1803 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1804 State.ExecPos = State.VALUs;
1805 Changed = true;
1806 }
1807 }
1808 }
1809
1810 // Early expiration: too many VALUs in intv3
1811 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1812 return HazardExpired;
1813
1814 // Only evaluate state if something changed
1815 if (!Changed)
1816 return NoHazardFound;
1817
1818 // Determine positions of VALUs pre/post exec change
1819 if (State.ExecPos == std::numeric_limits<int>::max())
1820 return NoHazardFound;
1821
1822 int PreExecPos = std::numeric_limits<int>::max();
1823 int PostExecPos = std::numeric_limits<int>::max();
1824
 // A def at a position >= ExecPos lies before the EXEC write in program order
 // ("pre"); a smaller position lies after it ("post"). The minimum of each
 // group is kept.
1825 for (auto Entry : State.DefPos) {
1826 int DefVALUs = Entry.second;
1827 if (DefVALUs != std::numeric_limits<int>::max()) {
1828 if (DefVALUs >= State.ExecPos)
1829 PreExecPos = std::min(PreExecPos, DefVALUs);
1830 else
1831 PostExecPos = std::min(PostExecPos, DefVALUs);
1832 }
1833 }
1834
1835 // Need a VALUs post exec change
1836 if (PostExecPos == std::numeric_limits<int>::max())
1837 return NoHazardFound;
1838
1839 // Too many VALUs in intv3?
1840 int Intv3VALUs = PostExecPos;
1841 if (Intv3VALUs > Intv3MaxVALUs)
1842 return HazardExpired;
1843
1844 // Too many VALUs in intv2?
1845 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1846 if (Intv2VALUs > Intv1plus2MaxVALUs)
1847 return HazardExpired;
1848
1849 // Need a VALUs pre exec change
1850 if (PreExecPos == std::numeric_limits<int>::max())
1851 return NoHazardFound;
1852
1853 // Too many VALUs in intv1?
1854 int Intv1VALUs = PreExecPos - State.ExecPos;
1855 if (Intv1VALUs > Intv1plus2MaxVALUs)
1856 return HazardExpired;
1857
1858 // Too many VALUs in intv1 + intv2
1859 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1860 return HazardExpired;
1861
1862 return HazardFound;
1863 };
1864 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1866 State.VALUs += 1;
1867 };
1868
 // The walk starts at the instruction immediately before MI.
1869 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1870 std::next(MI->getReverseIterator())))
1871 return false;
1872
1873 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1874 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1876
1877 return true;
1878}
1879
/// Detects a TRANS instruction that writes one of the VGPRs the VALU \p MI
/// reads explicitly, within the window described below, and if found inserts
/// an S_WAITCNT_DEPCTR before MI.
///
/// \returns true if an instruction was inserted, false otherwise.
// NOTE(review): the lines numbered 1929-1930, 1947, 1949 and 1961 are absent
// from this listing (the start of the "va_vdst==0" expiry condition, the two
// conditions under which UpdateStateFn increments its counters, and the
// operand appended to the BuildMI call).
1880bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1881 if (!ST.hasVALUTransUseHazard())
1882 return false;
1883 assert(!ST.hasExtendedWaitCounts());
1884
1885 if (!SIInstrInfo::isVALU(*MI))
1886 return false;
1887
 // Collect the VGPRs that MI reads explicitly.
1888 SmallSet<Register, 4> SrcVGPRs;
1889
1890 for (const MachineOperand &Use : MI->explicit_uses()) {
1891 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1892 SrcVGPRs.insert(Use.getReg());
1893 }
1894
1895 // Look for the following pattern:
1896 // Va <- TRANS VALU
1897 // intv
1898 // MI Va (WaitState = 0)
1899 //
1900 // Where:
1901 // intv <= 5 VALUs / 1 TRANS
1902 //
1903 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1904
1905 const int IntvMaxVALUs = 5;
1906 const int IntvMaxTRANS = 1;
1907
 // Search state: counters of VALU and TRANS instructions passed so far.
1908 struct StateType {
1909 int VALUs = 0;
1910 int TRANS = 0;
1911
1912 static unsigned getHashValue(const StateType &State) {
1913 return hash_combine(State.VALUs, State.TRANS);
1914 }
1915 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1916 return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
1917 }
1918 };
1919
1920 StateType State;
1921
1922 // This overloads expiry testing with all the hazard detection
1923 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1924 // Too many VALU states have passed
1925 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1926 return HazardExpired;
1927
1928 // Instructions which cause va_vdst==0 expire hazard
1931 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1932 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1933 return HazardExpired;
1934
1935 // Track registers writes
 // A TRANS that modifies any of MI's source VGPRs is the hazard.
1936 if (SIInstrInfo::isTRANS(I)) {
1937 for (Register Src : SrcVGPRs) {
1938 if (I.modifiesRegister(Src, &TRI)) {
1939 return HazardFound;
1940 }
1941 }
1942 }
1943
1944 return NoHazardFound;
1945 };
1946 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1948 State.VALUs += 1;
1950 State.TRANS += 1;
1951 };
1952
 // The walk starts at the instruction immediately before MI.
1953 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1954 std::next(MI->getReverseIterator())))
1955 return false;
1956
1957 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1958 // avoided.
1959 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1960 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1962
1963 return true;
1964}
1965
/// Inserts a V_NOP_e32 before \p MI when the nearest preceding VALU is a TRANS
/// instruction with a RAW or WAR register dependency on MI.
///
/// \returns true if a nop was inserted, false otherwise.
// NOTE(review): the line numbered 1968 (the remainder of the early-return
// condition) is absent from this listing.
1966bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
1967 if (!ST.hasGFX1250Insts() || // Coexecution disabled.
1969 return false;
1970
1971 const SIInstrInfo *TII = ST.getInstrInfo();
1972 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1973
1974 auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1975 if (!SIInstrInfo::isTRANS(I))
1976 return false;
1977
1978 // RAW: Trans(I) writes, VALU(MI) reads.
1979 Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1980 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
1981 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
1982 return true;
1983 }
1984
 // Without a register vdst on MI there is nothing for the WAR check to test.
1985 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
1986 if (!ValuDst || !ValuDst->isReg())
1987 return false;
1988
1989 // WAR: Trans(I) reads, VALU(MI) writes.
1990 Register ValuDef = ValuDst->getReg();
1991 for (const MachineOperand &TransUse : I.explicit_uses()) {
1992 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
1993 return true;
1994 }
1995
1996 return false;
1997 };
1998
 // Any VALU ends the search.
1999 auto IsExpiredFn = [](const MachineInstr &I, int) {
2000 return SIInstrInfo::isVALU(I);
2001 };
2002
 // HasVALU is int max, which this function treats as "no hazard found".
2003 const int HasVALU = std::numeric_limits<int>::max();
2004 if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
2005 return false;
2006
2007 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2008 return true;
2009}
2010
/// Inserts a V_NOP_e32 before \p MI when the nearest preceding VALU is a
/// hazardous instruction whose vdst overlaps MI's src0 or src1 (or, on GFX12+
/// for SWMMAC, MI's src2 index operand).
///
/// \returns true if a nop was inserted, false otherwise.
// NOTE(review): the lines numbered 2012 and 2019 are absent from this listing
// (the early-return condition, and the condition guarding the first
// `return false` in IsHazardFn).
2011bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
2013 return false;
2014
2015 const SIInstrInfo *TII = ST.getInstrInfo();
2016 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2017
2018 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
2020 return false;
2021
2022 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
2023 // with the dest(matrix D) of the previous wmma.
2024 const Register CurSrc0Reg =
2025 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
2026 const Register CurSrc1Reg =
2027 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
2028
2029 const Register PrevDstReg =
2030 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2031
2032 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2033 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2034 return true;
2035 }
2036
2037 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
2038 // but Index can't overlap with PrevDstReg.
2039 if (AMDGPU::isGFX12Plus(ST)) {
2040 if (SIInstrInfo::isSWMMAC(*MI)) {
2041 const Register CurIndex =
2042 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2043 if (TRI->regsOverlap(PrevDstReg, CurIndex))
2044 return true;
2045 }
2046 return false;
2047 }
2048
 // As written, subtargets that are not GFX12+ also end up here with no
 // further check.
2049 return false;
2050 };
2051
 // Any VALU ends the search.
2052 auto IsExpiredFn = [](const MachineInstr &I, int) {
2053 return SIInstrInfo::isVALU(I);
2054 };
2055
 // int max is treated as "no hazard found" by this function.
2056 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2057 std::numeric_limits<int>::max())
2058 return false;
2059
2060 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2061
2062 return true;
2063}
2064
2069
2070// Classify XDL WMMA instructions into co-execution hazard categories
2071// (Refer to SPG 4.6.12.1), mainly based on instruction latency.
2072//
2073// Category 0: WMMA with Latency 8
2074// WMMA_*F16, WMMA_*BF16
2075// WMMA_*FP8FP8
2076// WMMA_*FP8BF8
2077// WMMA_*BF8FP8
2078// WMMA_*BF8BF8
2079// WMMA_*F8F6F4 if SRCA & SRCB != F8
2080//
2081// Category 1: WMMA Latency 16
2082// WMMA_IU8
2083// WMMA_*F8F6F4 if SRCA OR SRCB == F8
2084//
2085// Category 2: SWMMAC with Latency 8
2086// SWMMAC_*F16, SWMMAC_*BF16,
2087// SWMMAC_*FP8FP8
2088// SWMMAC_*BF8FP8
2089// SWMMAC_*FP8BF8
2090// SWMMAC_*BF8BF8
2091//
2092// Category 3: SWMMAC with Latency 16
2093// SWMMAC_IU8
2094static unsigned
2096 const TargetSchedModel &SchedModel) {
2097 assert(TII->isXDLWMMA(MI) && "must be xdl wmma");
2098 bool IsSWMMAC = SIInstrInfo::isSWMMAC(MI);
2099 unsigned Category = 0;
2100
2101 unsigned Latency = SchedModel.computeInstrLatency(&MI);
2102 switch (Latency) {
2103 case 8:
2104 Category = IsSWMMAC ? 2 : 0;
2105 break;
2106 case 16:
2107 Category = IsSWMMAC ? 3 : 1;
2108 break;
2109 default:
2110 llvm_unreachable("unexpected xdl wmma latency");
2111 } // end switch.
2112
2113 return Category;
2114}
2115
2116int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
2117 if (!ST.hasGFX1250Insts())
2118 return 0;
2119
2120 const SIInstrInfo *TII = ST.getInstrInfo();
2121 if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
2122 return 0;
2123
2124 // WaitStates here is the number of V_NOPs or unrelated VALU instructions must
2125 // be in between the first WMMA and the second instruction to cover the hazard
2126 // (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
2127 // is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data hazards" for
2128 // numbers, which depends on the category of the first WMMA.
2129 const int WMMAWaitStates[] = {5, 9, 3, 5};
2130 const int VALUWaitStates[] = {4, 8, 2, 4};
2131 unsigned Category = 0;
2132
2133 auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2134 if (!TII->isXDLWMMA(I))
2135 return false;
2136
2137 Category = getWMMAHazardInstInCategory(I, TII, TSchedModel);
2138 return hasWMMAToWMMARegOverlap(I, *MI);
2139 };
2140
2141 auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2142 if (!TII->isXDLWMMA(I))
2143 return false;
2144
2145 Category = getWMMAHazardInstInCategory(I, TII, TSchedModel);
2146 return hasWMMAToVALURegOverlap(I, *MI);
2147 };
2148
2149 auto GetWaitStatesFn = [](const MachineInstr &I) {
2150 return SIInstrInfo::isVALU(I) ? 1 : 0;
2151 };
2152
2153 int WaitStatesNeeded = -1;
2154 int ExistingVALUs = 0; // Existing number of VALU ops in between.
2155
2156 // getWaitStatesSince checks for a hazard between instruction 'I' and 'MI':
2157 // - If a hazard exists: returns the number of VALUs in between and sets
2158 // 'Category' via IsWMMAHazardFn/IsVALUHazardFn for instruction 'I'.
2159 // - If no hazard exists: returns INT_MAX, making WaitStatesNeeded negative,
2160 // so no V_NOP insertion is needed.
2161 if (TII->isXDLWMMA(*MI)) {
2162 const int WMMAWaitsLimit = 9; // Maximum of WMMAWaitStates
2163 ExistingVALUs =
2164 getWaitStatesSince(IsWMMAHazardFn, WMMAWaitsLimit, GetWaitStatesFn);
2165 WaitStatesNeeded = WMMAWaitStates[Category] - ExistingVALUs;
2166 } else { // Must be a co-executable VALU.
2167 const int VALUWaitsLimit = 8; // Maximum of VALUWaitStates
2168 ExistingVALUs =
2169 getWaitStatesSince(IsVALUHazardFn, VALUWaitsLimit, GetWaitStatesFn);
2170 WaitStatesNeeded = VALUWaitStates[Category] - ExistingVALUs;
2171 }
2172
2173 return WaitStatesNeeded;
2174}
2175
2176bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2177 const MachineInstr &WMMA, const MachineInstr &MI) const {
2178 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2179 Register A1 = TII.getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
2180 Register B1 = TII.getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();
2181
2182 // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2183 if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
2184 return true;
2185
2187 Register Idx1 = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2188 if (TRI.regsOverlap(D0, Idx1))
2189 return true;
2190 }
2191 return false;
2192}
2193
2194bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2195 const MachineInstr &WMMA, const MachineInstr &MI) const {
2196 // WMMA writes, VALU reads.
2197 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2198 for (const MachineOperand &ValuUse : MI.explicit_uses()) {
2199 if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
2200 return true;
2201 }
2202
2203 // WMMA reads or writes, VALU writes.
2204 Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
2205 Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
2206 SmallVector<Register, 4> WMMARegs({D0, A0, B0});
2207
2208 if (SIInstrInfo::isSWMMAC(WMMA)) {
2209 Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
2210 WMMARegs.push_back(Idx0);
2211 }
2212
2213 for (const MachineOperand &ValuDef : MI.defs()) {
2214 Register VDstReg = ValuDef.getReg();
2215 for (Register WMMAReg : WMMARegs) {
2216 if (TRI.regsOverlap(VDstReg, WMMAReg))
2217 return true;
2218 }
2219 }
2220 return false;
2221}
2222
2223bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
2224 const MachineInstr &MI) const {
2225 // I is the potential WMMA hazard source, MI is the instruction being checked
2226 // for hazard.
2227 if (!TII.isXDLWMMA(I))
2228 return false;
2229
2230 // Dispatch based on MI type
2231 if (TII.isXDLWMMA(MI))
2232 return hasWMMAToWMMARegOverlap(I, MI);
2234 return hasWMMAToVALURegOverlap(I, MI);
2235
2236 return false;
2237}
2238
2239bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
2240 bool IncludeSubloops) {
2241 // Scan loop for any WMMA that hazards MI.
2242 // TODO: Avoid full loop scan when WMMA is beyond VALU distance.
2243 for (MachineBasicBlock *MBB : L->getBlocks()) {
2244 if (!IncludeSubloops && MLI->getLoopFor(MBB) != L)
2245 continue;
2246 for (MachineInstr &I : *MBB) {
2247 if (&I == MI)
2248 continue;
2249 if (isCoexecutionHazardFor(I, *MI))
2250 return true;
2251 }
2252 }
2253 return false;
2254}
2255
2256bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
2257 int WaitStatesNeeded) {
2258 if (!MLI)
2259 return false;
2260
2261 MachineLoop *L = MLI->getLoopFor(MI->getParent());
2262 if (!L) {
2263 ++NumWMMAHoistingBailed;
2264 return false;
2265 }
2266
2267 // If innermost loop has WMMA hazard, we can't hoist at all
2268 if (hasWMMAHazardInLoop(L, MI)) {
2269 ++NumWMMAHoistingBailed;
2270 return false;
2271 }
2272
2273 // Find outermost loop with no internal hazard
2274 MachineLoop *TargetLoop = L;
2275 while (MachineLoop *Parent = TargetLoop->getParentLoop()) {
2276 if (hasWMMAHazardInLoop(Parent, MI, false))
2277 break; // Parent has hazard in its own blocks, stop here
2278 TargetLoop = Parent; // Safe to hoist further out
2279 }
2280
2281 // Need valid preheader to insert V_NOPs
2282 MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
2283 if (!Preheader) {
2284 ++NumWMMAHoistingBailed;
2285 return false;
2286 }
2287
2288 LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
2289 << " V_NOPs from loop to " << printMBBReference(*Preheader)
2290 << "\n");
2291
2292 emitVNops(*Preheader, Preheader->getFirstTerminator(), WaitStatesNeeded,
2293 /*IsHoisting=*/true);
2294 NumWMMANopsHoisted += WaitStatesNeeded;
2295 return true;
2296}
2297
2298bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
2299 int WaitStatesNeeded = checkWMMACoexecutionHazards(MI);
2300 if (WaitStatesNeeded <= 0)
2301 return false;
2302
2303 if (EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
2304 return true;
2305
2306 emitVNops(*MI->getParent(), MI->getIterator(), WaitStatesNeeded);
2307 return true;
2308}
2309
2310bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
2311 if (!ST.hasShift64HighRegBug())
2312 return false;
2313 assert(!ST.hasExtendedWaitCounts());
2314
2315 switch (MI->getOpcode()) {
2316 default:
2317 return false;
2318 case AMDGPU::V_LSHLREV_B64_e64:
2319 case AMDGPU::V_LSHRREV_B64_e64:
2320 case AMDGPU::V_ASHRREV_I64_e64:
2321 break;
2322 }
2323
2324 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
2325 if (!Amt->isReg())
2326 return false;
2327
2328 Register AmtReg = Amt->getReg();
2329 const MachineRegisterInfo &MRI = MF.getRegInfo();
2330 // Check if this is a last VGPR in the allocation block.
2331 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2332 return false;
2333
2334 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
2335 return false;
2336
2337 assert(ST.needsAlignedVGPRs());
2338 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2339
2340 const DebugLoc &DL = MI->getDebugLoc();
2341 MachineBasicBlock *MBB = MI->getParent();
2342 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
2343
2344 // In:
2345 //
2346 // Dst = shiftrev64 Amt, Src1
2347 //
2348 // if Dst!=Src1 then avoid the bug with:
2349 //
2350 // Dst.sub0 = Amt
2351 // Dst = shift64 Dst.sub0, Src1
2352
2353 Register DstReg = MI->getOperand(0).getReg();
2354 if (!Src1->isReg() || Src1->getReg() != DstReg) {
2355 Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
2356 runOnInstruction(
2357 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo).add(*Amt));
2358 Amt->setReg(DstLo);
2359 Amt->setIsKill(true);
2360 return true;
2361 }
2362
2363 bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);
2364 Register NewReg;
2365 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2366 : AMDGPU::VGPR_32RegClass) {
2367 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
2368 NewReg = Reg;
2369 break;
2370 }
2371 }
2372
2373 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2374 : NewReg;
2375 Register NewAmtLo;
2376
2377 if (Overlapped)
2378 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2379
2380 // Insert a full wait count because found register might be pending a wait.
2381 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
2382 .addImm(0);
2383
2384 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
2385 if (Overlapped)
2386 runOnInstruction(
2387 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
2388 .addDef(AmtReg - 1)
2389 .addReg(AmtReg - 1, RegState::Undef)
2390 .addReg(NewAmtLo, RegState::Undef));
2391 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
2392 .addDef(AmtReg)
2393 .addReg(AmtReg, RegState::Undef)
2394 .addReg(NewAmt, RegState::Undef));
2395
2396 // Instructions emitted after the current instruction will be processed by the
2397 // parent loop of the hazard recognizer in a natural way.
2398 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2399 AmtReg)
2400 .addDef(NewAmt)
2401 .addReg(NewAmt)
2402 .addReg(AmtReg);
2403 if (Overlapped)
2404 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2405 AmtReg - 1)
2406 .addDef(NewAmtLo)
2407 .addReg(NewAmtLo)
2408 .addReg(AmtReg - 1);
2409
2410 // Re-running hazard recognizer on the modified instruction is not necessary,
2411 // inserted V_SWAP_B32 has already both read and write new registers so
2412 // hazards related to these register has already been handled.
2413 Amt->setReg(NewAmt);
2414 Amt->setIsKill(false);
2415 // We do not update liveness, so verifier may see it as undef.
2416 Amt->setIsUndef();
2417 if (Overlapped) {
2418 MI->getOperand(0).setReg(NewReg);
2419 Src1->setReg(NewReg);
2420 Src1->setIsKill(false);
2421 Src1->setIsUndef();
2422 }
2423
2424 return true;
2425}
2426
2427int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) const {
2428 int NSAtoVMEMWaitStates = 1;
2429
2430 if (!ST.hasNSAtoVMEMBug())
2431 return 0;
2432
2434 return 0;
2435
2436 const SIInstrInfo *TII = ST.getInstrInfo();
2437 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2438 if (!Offset || (Offset->getImm() & 6) == 0)
2439 return 0;
2440
2441 auto IsHazardFn = [TII](const MachineInstr &I) {
2442 if (!SIInstrInfo::isMIMG(I))
2443 return false;
2444 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2445 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2446 TII->getInstSizeInBytes(I) >= 16;
2447 };
2448
2449 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2450}
2451
2452int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2453 MachineInstr *MI) const {
2454 int FPAtomicToDenormModeWaitStates = 3;
2455
2456 if (!ST.hasFPAtomicToDenormModeHazard())
2457 return 0;
2458 assert(!ST.hasExtendedWaitCounts());
2459
2460 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2461 return 0;
2462
2463 auto IsHazardFn = [](const MachineInstr &I) {
2464 if (!SIInstrInfo::isVMEM(I))
2465 return false;
2466 return SIInstrInfo::isFPAtomic(I);
2467 };
2468
2469 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2470 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2471 return true;
2472
2473 return SIInstrInfo::isWaitcnt(MI.getOpcode());
2474 };
2475
2476 return FPAtomicToDenormModeWaitStates -
2477 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2478}
2479
2480int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) const {
2482
2483 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2484}
2485
2486int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) const {
2487 // Early exit if no padding is requested.
2488 if (MFMAPaddingRatio == 0)
2489 return 0;
2490
2491 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2492 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2493 return 0;
2494
2495 int NeighborMFMALatency = 0;
2496 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2497 this](const MachineInstr &MI) {
2498 if (!SIInstrInfo::isMFMA(MI))
2499 return false;
2500
2501 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2502 return true;
2503 };
2504
2505 const int MaxMFMAPipelineWaitStates = 16;
2506 int WaitStatesSinceNeighborMFMA =
2507 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2508
2509 int NeighborMFMAPaddingNeeded =
2510 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2511 WaitStatesSinceNeighborMFMA;
2512
2513 return std::max(0, NeighborMFMAPaddingNeeded);
2514}
2515
2516int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) const {
2517 int WaitStatesNeeded = 0;
2518 unsigned Opc = MI->getOpcode();
2519
2520 auto IsVALUFn = [](const MachineInstr &MI) {
2521 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2522 };
2523
2524 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2525 const int LegacyVALUWritesVGPRWaitStates = 2;
2526 const int VALUWritesExecWaitStates = 4;
2527 const int MaxWaitStates = 4;
2528
2529 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2530 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2531 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2532
2533 if (WaitStatesNeeded < MaxWaitStates) {
2534 for (const MachineOperand &Use : MI->explicit_uses()) {
2535 const int MaxWaitStates = 2;
2536
2537 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2538 continue;
2539
2540 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2541 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2542 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2543
2544 if (WaitStatesNeeded == MaxWaitStates)
2545 break;
2546 }
2547 }
2548 }
2549
2550 for (const MachineOperand &Op : MI->explicit_operands()) {
2551 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2552 continue;
2553
2554 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2555 continue;
2556
2557 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2558 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2559 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2560 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2561 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2562 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2563 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2564 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2565 const int MaxWaitStates = 18;
2566 Register Reg = Op.getReg();
2567 unsigned HazardDefLatency = 0;
2568
2569 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2570 this](const MachineInstr &MI) {
2571 if (!SIInstrInfo::isMFMA(MI))
2572 return false;
2573 Register DstReg = MI.getOperand(0).getReg();
2574 if (DstReg == Reg)
2575 return false;
2576 HazardDefLatency =
2577 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2578 return TRI.regsOverlap(DstReg, Reg);
2579 };
2580
2581 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2582 MaxWaitStates);
2583 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2584 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2585 int OpNo = Op.getOperandNo();
2586 if (OpNo == SrcCIdx) {
2587 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2588 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2589 switch (HazardDefLatency) {
2590 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2591 break;
2592 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2593 break;
2594 case 16: [[fallthrough]];
2595 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2596 break;
2597 }
2598 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2599 switch (HazardDefLatency) {
2600 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2601 break;
2602 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2603 break;
2604 case 16: [[fallthrough]];
2605 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2606 break;
2607 }
2608 }
2609
2610 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2611 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2612
2613 if (WaitStatesNeeded == MaxWaitStates)
2614 return WaitStatesNeeded; // Early exit.
2615
2616 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2617 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2618 return false;
2619 Register DstReg = MI.getOperand(0).getReg();
2620 return TRI.regsOverlap(Reg, DstReg);
2621 };
2622
2623 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2624 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2625 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2626 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2627 if (OpNo == SrcCIdx)
2628 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2629 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2630 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2631
2632 WaitStatesNeededForUse = NeedWaitStates -
2633 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2634 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2635
2636 if (WaitStatesNeeded == MaxWaitStates)
2637 return WaitStatesNeeded; // Early exit.
2638 }
2639
2640 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2641 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2642 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2643 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2644 const int MaxWaitStates = 13;
2645 Register DstReg = MI->getOperand(0).getReg();
2646 unsigned HazardDefLatency = 0;
2647
2648 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2649 this](const MachineInstr &MI) {
2650 if (!SIInstrInfo::isMFMA(MI))
2651 return false;
2652 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2653 HazardDefLatency =
2654 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2655 return TRI.regsOverlap(Reg, DstReg);
2656 };
2657
2658 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2659 int NeedWaitStates;
2660 switch (HazardDefLatency) {
2661 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2662 break;
2663 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2664 break;
2665 case 16: [[fallthrough]];
2666 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2667 break;
2668 }
2669
2670 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2671 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2672 }
2673
2674 // Pad neighboring MFMA with noops for better inter-wave performance.
2675 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2676
2677 return WaitStatesNeeded;
2678}
2679
2680static int
2682 bool IsGFX950) {
2683 // xdl def cycles | gfx940 | gfx950
2684 // 2 pass | 3 4
2685 // 4 pass | 5 6
2686 // 8 pass | 9 10
2687 // 16 pass | 17 18
2688 return NumPasses + 1 + IsGFX950;
2689}
2690
2691static int
2693 bool IsGFX950) {
2694 // xdl def cycles | gfx940 | gfx950
2695 // 2 pass | 3 3
2696 // 4 pass | 5 6
2697 // 8 pass | 9 10
2698 // 16 pass | 17 18
2699 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2700}
2701
2702static int
2704 // 2 pass -> 2
2705 // 4 pass -> 4
2706 // 8 pass -> 8
2707 // 16 pass -> 16
2708 return NumPasses;
2709}
2710
2711static int
2713 // 2 pass -> 4
2714 // 4 pass -> 6
2715 // 8 pass -> 10
2716 // 16 pass -> 18
2717 return NumPasses + 2;
2718}
2719
2721 bool IsGFX950) {
2722 // xdl def cycles | gfx942 | gfx950
2723 // 2 pass | 5 5
2724 // 4 pass | 7 8
2725 // 8 pass | 11 12
2726 // 16 pass | 19 20
2727 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2728}
2729
2730int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) const {
2731 int WaitStatesNeeded = 0;
2732 unsigned Opc = MI->getOpcode();
2733
2734 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2736 };
2737
2738 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2741 };
2742
2743 if (!SIInstrInfo::isMFMA(*MI))
2744 return WaitStatesNeeded;
2745
2746 const int VALUWritesExecWaitStates = 4;
2747 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2748 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2749 VALUWritesExecWaitStates);
2750 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2751
2752 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2753
2754 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2755 for (const MachineOperand &Use : MI->explicit_uses()) {
2756 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2757 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2758 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2759 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2760 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2761 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2762 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2763 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2764 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2765 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2766 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2767 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2768 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2769 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2770 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2771 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2772 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2773 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2774 const int MaxWaitStates = 19;
2775
2776 if (!Use.isReg())
2777 continue;
2778 Register Reg = Use.getReg();
2779 bool FullReg;
2780 const MachineInstr *MI1;
2781
2782 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2783 this](const MachineInstr &MI) {
2784 if (!SIInstrInfo::isMFMA(MI))
2785 return false;
2786 Register DstReg = MI.getOperand(0).getReg();
2787 FullReg = (DstReg == Reg);
2788 MI1 = &MI;
2789 return TRI.regsOverlap(DstReg, Reg);
2790 };
2791
2792 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2793 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2794 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2795
2796 int NumWaitStates =
2797 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2798 if (NumWaitStates == std::numeric_limits<int>::max())
2799 continue;
2800
2801 int OpNo = Use.getOperandNo();
2802 unsigned Opc1 = MI1->getOpcode();
2803 int NeedWaitStates = 0;
2804 if (OpNo == SrcCIdx) {
2805 if (!SIInstrInfo::isDGEMM(Opc) &&
2806 (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opc1))) {
2807 NeedWaitStates = 0;
2808 } else if (FullReg) {
2809 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2810 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2811 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2812 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2813 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2814 else if (ST.hasGFX940Insts() &&
2815 TSchedModel.computeInstrLatency(MI1) == 2)
2816 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2817 } else {
2818 switch (Opc1) {
2819 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2820 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2821 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2822 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2823 if (!TII.isXDL(*MI))
2824 NeedWaitStates =
2825 ST.hasGFX950Insts()
2826 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2827 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2828 break;
2829 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2830 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2831 if (!TII.isXDL(*MI))
2832 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2833 break;
2834 default:
2835 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2836 if (ST.hasGFX940Insts()) {
2837 if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
2838 break;
2839
2840 NeedWaitStates =
2841 TII.isXDL(*MI1)
2842 ? (TII.isXDL(*MI)
2844 NumPasses, ST.hasGFX950Insts())
2846 NumPasses, ST.hasGFX950Insts()))
2848 NumPasses);
2849 break;
2850 }
2851
2852 switch (NumPasses) {
2853 case 2:
2854 NeedWaitStates =
2856 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2857 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2858 break;
2859 case 8:
2860 NeedWaitStates =
2862 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2863 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2864 break;
2865 case 16:
2866 NeedWaitStates =
2868 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2869 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2870 break;
2871 default:
2872 llvm_unreachable("unexpected number of passes");
2873 }
2874 }
2875 }
2876 } else {
2877 switch (Opc1) {
2878 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2879 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2880 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2881 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2882 NeedWaitStates =
2883 ST.hasGFX950Insts()
2884 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2885 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2886 break;
2887 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2888 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2889 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2890 break;
2891 default:
2892 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2893
2894 if (ST.hasGFX940Insts()) {
2895 NeedWaitStates =
2896 TII.isXDL(*MI1)
2898 NumPasses, ST.hasGFX950Insts())
2900 NumPasses);
2901 break;
2902 }
2903
2904 switch (NumPasses) {
2905 case 2:
2906 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2907 break;
2908 case 4:
2909 llvm_unreachable("unexpected number of passes for mfma");
2910 case 8:
2911 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2912 break;
2913 case 16:
2914 default:
2915 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2916 }
2917 }
2918 }
2919 if (WaitStatesNeeded >= NeedWaitStates)
2920 continue;
2921
2922 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2923 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2924
2925 if (WaitStatesNeeded == MaxWaitStates)
2926 break;
2927 }
2928
2929 // Pad neighboring MFMA with noops for better inter-wave performance.
2930 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2931
2932 return WaitStatesNeeded;
2933}
2934
2935int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) const {
2936 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2937 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2938 return 0;
2939
2940 int WaitStatesNeeded = 0;
2941
2942 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2943 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2944 };
2945
2946 for (const MachineOperand &Op : MI->explicit_uses()) {
2947 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2948 continue;
2949
2950 Register Reg = Op.getReg();
2951
2952 const int AccVgprReadLdStWaitStates = 2;
2953 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2954 const int MaxWaitStates = 2;
2955
2956 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2957 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2958 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2959
2960 if (WaitStatesNeeded == MaxWaitStates)
2961 return WaitStatesNeeded; // Early exit.
2962
2963 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2964 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2965 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2966 return false;
2967 auto IsVALUFn = [](const MachineInstr &MI) {
2969 };
2970 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2971 std::numeric_limits<int>::max();
2972 };
2973
2974 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2975 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2976 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2977 }
2978
2979 return WaitStatesNeeded;
2980}
2981
2982int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) const {
2983 assert(!ST.hasVcmpxPermlaneHazard() &&
2984 "this is a different vcmpx+permlane hazard");
2985 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2986 const SIInstrInfo *TII = ST.getInstrInfo();
2987
2988 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2989 return isVCmpXWritesExec(*TII, *TRI, MI);
2990 };
2991
2992 auto IsVALUFn = [](const MachineInstr &MI) {
2993 return SIInstrInfo::isVALU(MI);
2994 };
2995
2996 const int VCmpXWritesExecWaitStates = 4;
2997 const int VALUWritesVDstWaitStates = 2;
2998 int WaitStatesNeeded = 0;
2999
3000 for (const MachineOperand &Op : MI->explicit_uses()) {
3001 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
3002 continue;
3003 Register Reg = Op.getReg();
3004
3005 int WaitStatesSinceDef =
3006 VALUWritesVDstWaitStates -
3007 getWaitStatesSinceDef(Reg, IsVALUFn,
3008 /*MaxWaitStates=*/VALUWritesVDstWaitStates);
3009 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
3010 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
3011 break;
3012 }
3013
3014 int VCmpXHazardWaits =
3015 VCmpXWritesExecWaitStates -
3016 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
3017
3018 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
3019 return WaitStatesNeeded;
3020}
3021
/// Map the pass count of an SMFMA to a wait-state count using the table in
/// the body (always \p NumPasses + 2).
///
/// NOTE(review): the declaration line was missing from this listing and has
/// been restored from the file's function index; this helper and
/// GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates have identical
/// bodies, so confirm which name belongs at this position.
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
3029
/// Map the pass count of an XDL instruction to a wait-state count using the
/// table in the body: \p NumPasses + 3, plus one more when \p IsGFX950 is set
/// and the instruction is not a 2-pass one.
///
/// NOTE(review): the first line of the declaration was missing from this
/// listing and has been restored from the file's function index; this helper
/// and GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates have identical
/// bodies, so confirm which name belongs at this position.
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
                                                       bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass         |  5        5
  // 4 pass         |  7        8
  // 8 pass         |  11       12
  // 16 pass        |  19       20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
3039
/// Map the pass count of an XDL instruction to a wait-state count using the
/// table in the body: \p NumPasses + 3, plus one more when \p IsGFX950 is set
/// and the instruction is not a 2-pass one.
///
/// NOTE(review): the first line of the declaration was missing from this
/// listing and has been restored from the file's function index; this helper
/// and GFX940_XDL_N_PassWriteVgprVALUWawWaitStates have identical bodies, so
/// confirm which name belongs at this position.
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass         |  5        5
  // 4 pass         |  7        8
  // 8 pass         |  11       12
  // 16 pass        |  19       20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
3049
/// Map the pass count of an SMFMA to a wait-state count using the table in
/// the body (always \p NumPasses + 2).
///
/// NOTE(review): the declaration line was missing from this listing and has
/// been restored from the file's function index; this helper and
/// GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates have identical bodies, so
/// confirm which name belongs at this position.
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
3057
3058int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) const {
3059 if (!ST.hasGFX90AInsts())
3060 return 0;
3061
3062 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
3063 return SIInstrInfo::isDGEMM(MI.getOpcode());
3064 };
3065
3066 // This is checked in checkMAIHazards90A()
3067 if (SIInstrInfo::isMFMA(*MI))
3068 return 0;
3069
3070 const MachineRegisterInfo &MRI = MF.getRegInfo();
3071
3072 int WaitStatesNeeded = 0;
3073
3074 bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
3075 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
3076 bool IsVALU = SIInstrInfo::isVALU(*MI);
3077
3078 const MachineInstr *MFMA = nullptr;
3079 unsigned Reg;
3080 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3081 if (!SIInstrInfo::isMFMA(MI) ||
3082 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
3083 return false;
3084 MFMA = &MI;
3085 return true;
3086 };
3087
3088 const MachineInstr *DOT = nullptr;
3089 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
3090 if (!SIInstrInfo::isDOT(MI) ||
3091 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
3092 return false;
3093 DOT = &MI;
3094 return true;
3095 };
3096
3097 bool DGEMMAfterVALUWrite = false;
3098 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
3099 // Found DGEMM on reverse traversal to def.
3100 if (SIInstrInfo::isDGEMM(MI.getOpcode()))
3101 DGEMMAfterVALUWrite = true;
3102
3103 // Only hazard if register is defined by a VALU and a DGEMM is found after
3104 // after the def.
3105 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
3106 return false;
3107
3108 return true;
3109 };
3110
3111 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
3112 AMDGPU::OpName::src2);
3113
3114 if (IsMemOrExport || IsVALU) {
3115 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
3116 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
3117 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3118 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3119 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3120 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3121 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3122 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3123 const int DotWriteSameDotReadSrcAB = 3;
3124 const int DotWriteDifferentVALURead = 3;
3125 const int DMFMABetweenVALUWriteVMEMRead = 2;
3126 const int MaxWaitStates = 19;
3127
3128 for (const MachineOperand &Use : MI->explicit_uses()) {
3129 if (!Use.isReg())
3130 continue;
3131 Reg = Use.getReg();
3132
3133 DOT = nullptr;
3134 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3135 MaxWaitStates);
3136 if (DOT) {
3137 int NeedWaitStates = 0;
3138 if (DOT->getOpcode() == MI->getOpcode()) {
3139 if (&Use - &MI->getOperand(0) != SrcCIdx)
3140 NeedWaitStates = DotWriteSameDotReadSrcAB;
3141 } else {
3142 NeedWaitStates = DotWriteDifferentVALURead;
3143 }
3144
3145 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3146 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3147 }
3148
3149 // Workaround for HW data hazard bug observed only in GFX90A. When there
3150 // is a DGEMM instruction in-between a VALU and a VMEM instruction it
3151 // causes the SQ to incorrectly not insert two wait states between the two
3152 // instructions needed to avoid data hazard.
3153 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3154 DGEMMAfterVALUWrite = false;
3155 if (TRI.isVectorRegister(MRI, Reg)) {
3156 int WaitStatesNeededForUse =
3157 DMFMABetweenVALUWriteVMEMRead -
3158 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
3159 DMFMABetweenVALUWriteVMEMRead);
3160
3161 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3162 }
3163 }
3164
3165 MFMA = nullptr;
3166 WaitStatesSinceDef =
3167 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3168 if (!MFMA)
3169 continue;
3170
3171 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3172 int NumPasses = HazardDefLatency;
3173 int NeedWaitStates = MaxWaitStates;
3174
3175 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3176 switch (HazardDefLatency) {
3177 case 4:
3178 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3179 : DMFMA4x4WriteVgprVALUReadWaitStates;
3180 break;
3181 case 8:
3182 case 16:
3183 NeedWaitStates =
3184 IsMemOrExport
3185 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3186 : (ST.hasGFX950Insts()
3187 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3188 : DMFMA16x16WriteVgprVALUReadWaitStates);
3189 break;
3190 default:
3191 llvm_unreachable("unexpected dgemm");
3192 }
3193 } else if (ST.hasGFX940Insts()) {
3194 NeedWaitStates =
3195 TII.isXDL(*MFMA)
3197 NumPasses, ST.hasGFX950Insts())
3199 NumPasses);
3200 } else {
3201 switch (HazardDefLatency) {
3202 case 2:
3203 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3204 break;
3205 case 8:
3206 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3207 break;
3208 case 16:
3209 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3210 break;
3211 default:
3212 llvm_unreachable("unexpected number of passes for mfma");
3213 }
3214 }
3215
3216 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3217 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3218
3219 if (WaitStatesNeeded == MaxWaitStates)
3220 break;
3221 }
3222 }
3223
3224 unsigned Opc = MI->getOpcode();
3225 const int DMFMAToFMA64WaitStates = 2;
3226 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
3227 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
3228 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3229 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3230 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3231 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3232 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3233 }
3234
3235 if (!IsVALU && !IsMemOrExport)
3236 return WaitStatesNeeded;
3237
3238 for (const MachineOperand &Def : MI->defs()) {
3239 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3240 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3241 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3242 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3243 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3244 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3245 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3246 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3247 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3248 const int DotWriteDifferentVALUWrite = 3;
3249 const int MaxWaitStates = 19;
3250 const int MaxWarWaitStates = 15;
3251
3252 Reg = Def.getReg();
3253
3254 DOT = nullptr;
3255 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3256 MaxWaitStates);
3257 if (DOT && DOT->getOpcode() != MI->getOpcode())
3258 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3259 WaitStatesSinceDef);
3260
3261 MFMA = nullptr;
3262 WaitStatesSinceDef =
3263 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3264 if (MFMA) {
3265 int NeedWaitStates = MaxWaitStates;
3266 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
3267
3268 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3269 switch (NumPasses) {
3270 case 4:
3271 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3272 break;
3273 case 8:
3274 case 16:
3275 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3276 break;
3277 default:
3278 llvm_unreachable("unexpected number of cycles for dgemm");
3279 }
3280 } else if (ST.hasGFX940Insts()) {
3281 NeedWaitStates =
3282 TII.isXDL(*MFMA)
3284 NumPasses, ST.hasGFX950Insts())
3286 } else {
3287 switch (NumPasses) {
3288 case 2:
3289 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3290 break;
3291 case 8:
3292 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3293 break;
3294 case 16:
3295 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3296 break;
3297 default:
3298 llvm_unreachable("Unexpected number of passes for mfma");
3299 }
3300 }
3301
3302 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3303 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3304
3305 if (WaitStatesNeeded == MaxWaitStates)
3306 break;
3307 }
3308
3309 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3310 if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
3311 !MI.readsRegister(Reg, &TRI))
3312 return false;
3313
3314 if (ST.hasGFX940Insts() && !TII.isXDL(MI))
3315 return false;
3316
3317 const MachineOperand *SrcC =
3318 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
3319 assert(SrcC);
3320 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
3321 return false;
3322
3323 MFMA = &MI;
3324 return true;
3325 };
3326
3327 MFMA = nullptr;
3328 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3329 MaxWarWaitStates);
3330 if (!MFMA)
3331 continue;
3332
3333 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3334 int NeedWaitStates = MaxWaitStates;
3335 switch (HazardDefLatency) {
3336 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3337 break;
3338 case 4: assert(ST.hasGFX940Insts());
3339 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3340 break;
3341 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3342 break;
3343 case 16: [[fallthrough]];
3344 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3345 break;
3346 }
3347
3348 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3349 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3350 }
3351
3352 return WaitStatesNeeded;
3353}
3354
// NOTE(review): the line that opens this definition (return type, name,
// parameter list) is missing from this listing, as is listing line 3363
// inside the lambda below; only the remaining body is visible. The body reads
// a scheduling unit `SU` and returns a bool -- confirm the signature against
// the class declaration.
//
// Visible behavior: returns false unless SU carries an instruction that the
// IsMFMAFn predicate accepts and an earlier accepted instruction lies fewer
// wait states back than that earlier instruction's modelled latency.
3356 if (!SU->isInstr())
3357 return false;
3358
3359 const MachineInstr *MAI = nullptr;
3360
// Predicate with a side channel: MAI is cleared on entry and left pointing at
// the instruction when it is accepted.
// NOTE(review): the condition guarding `MAI = &MI` is on the missing line --
// presumably an MFMA test, judging by the lambda's name; confirm.
3361 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3362 MAI = nullptr;
3364 MAI = &MI;
3365 return MAI != nullptr;
3366 };
3367
3368 MachineInstr *MI = SU->getInstr();
3369 if (IsMFMAFn(*MI)) {
// Search backwards with a limit of 16; the search re-runs the predicate, so
// MAI afterwards refers to the instruction it last examined, not to SU's.
3370 int W = getWaitStatesSince(IsMFMAFn, 16);
3371 if (MAI)
3372 return W < (int)TSchedModel.computeInstrLatency(MAI);
3373 }
3374
3375 return false;
3376}
3377
3378// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3379// insertion of a new instruction.
3380static void updateGetPCBundle(MachineInstr *NewMI) {
3381 if (!NewMI->isBundled())
3382 return;
3383
3384 // Find start of bundle.
3385 auto I = NewMI->getIterator();
3386 while (I->isBundledWithPred())
3387 I--;
3388 if (I->isBundle())
3389 I++;
3390
3391 // Bail if this is not an S_GETPC bundle.
3392 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3393 return;
3394
3395 // Update offsets of any references in the bundle.
3396 const unsigned NewBytes = 4;
3397 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3398 "Unexpected instruction insertion in bundle");
3399 auto NextMI = std::next(NewMI->getIterator());
3400 auto End = NewMI->getParent()->end();
3401 while (NextMI != End && NextMI->isBundledWithPred()) {
3402 for (auto &Operand : NextMI->operands()) {
3403 if (Operand.isGlobal())
3404 Operand.setOffset(Operand.getOffset() + NewBytes);
3405 }
3406 NextMI++;
3407 }
3408}
3409
/// Insert an S_WAITCNT_DEPCTR directly after \p MI when MI (a SALU or VALU)
/// writes an SGPR or VCC that an earlier instruction in the opcode lists
/// below read as its mask operand. Only applies on wave64 subtargets that
/// report hasVALUMaskWriteHazard(). Earlier S_WAITCNT_DEPCTR instructions
/// collected during the search may be folded into the new wait and erased.
///
/// \returns true if a wait instruction was inserted.
3410bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
3411 if (!ST.hasVALUMaskWriteHazard())
3412 return false;
3413 assert(!ST.hasExtendedWaitCounts());
3414
3415 if (!ST.isWave64())
3416 return false;
3417
3418 const bool IsSALU = SIInstrInfo::isSALU(*MI);
3419 const bool IsVALU = SIInstrInfo::isVALU(*MI);
3420 if (!IsSALU && !IsVALU)
3421 return false;
3422
3423 // The hazard sequence is three instructions:
3424 // 1. VALU reads SGPR as mask
3425 // 2. VALU/SALU writes SGPR
3426 // 3. VALU/SALU reads SGPR
3427 // The hazard can expire if the distance between 2 and 3 is sufficient,
3428 // or (2) is VALU and (3) is SALU.
3429 // In practice this happens <10% of the time, hence always assume the hazard
3430 // exists if (1) and (2) are present to avoid searching all SGPR reads.
3431
3432 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3433 const MachineRegisterInfo &MRI = MF.getRegInfo();
3434
// Registers that are never tracked by this fix-up.
3435 auto IgnoreableSGPR = [](const Register Reg) {
3436 switch (Reg) {
3437 case AMDGPU::EXEC:
3438 case AMDGPU::EXEC_LO:
3439 case AMDGPU::EXEC_HI:
3440 case AMDGPU::M0:
3441 case AMDGPU::SGPR_NULL:
3442 case AMDGPU::SGPR_NULL64:
3443 case AMDGPU::SCC:
3444 return true;
3445 default:
3446 return false;
3447 }
3448 };
3449 auto IsVCC = [](const Register Reg) {
3450 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
3451 };
3452
// State carried through the backward search: the 32-bit SGPRs written by MI
// that have not yet been dropped by UpdateStateFn. States hash and compare by
// that set.
3453 struct StateType {
3454 SmallSet<Register, 2> HazardSGPRs;
3455
3456 static unsigned getHashValue(const StateType &State) {
3457 return hash_combine_range(State.HazardSGPRs);
3458 }
3459 static bool isEqual(const StateType &LHS, const StateType &RHS) {
3460 return LHS.HazardSGPRs == RHS.HazardSGPRs;
3461 }
3462 };
3463
// WaitInstrs collects earlier waits that are candidates for merging;
// HasSGPRRead becomes true as soon as MI or any instruction examined by
// UpdateStateFn reads a tracked SGPR, which stops further collection.
3464 SmallVector<const MachineInstr *> WaitInstrs;
3465 bool HasSGPRRead = false;
3466 StateType InitialState;
3467
3468 // Look for SGPR write.
// Only the first qualifying def is kept (later defs are skipped once
// HazardDef is set). Non-VCC registers must be explicit operands and SGPRs.
3469 MachineOperand *HazardDef = nullptr;
3470 for (MachineOperand &Op : MI->operands()) {
3471 if (!Op.isReg())
3472 continue;
3473 if (Op.isDef() && HazardDef)
3474 continue;
3475
3476 Register Reg = Op.getReg();
3477 if (IgnoreableSGPR(Reg))
3478 continue;
3479 if (!IsVCC(Reg)) {
3480 if (Op.isImplicit())
3481 continue;
3482 if (!TRI->isSGPRReg(MRI, Reg))
3483 continue;
3484 }
3485 // Also check for SGPR reads.
3486 if (Op.isUse()) {
3487 HasSGPRRead = true;
3488 continue;
3489 }
3490
3491 assert(!HazardDef);
3492 HazardDef = &Op;
3493 }
3494
3495 if (!HazardDef)
3496 return false;
3497
3498 // Setup to track writes to individual SGPRs
// A 64-bit def is tracked as its two 32-bit halves (sub0 and sub1).
3499 const Register HazardReg = HazardDef->getReg();
3500 if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
3501 InitialState.HazardSGPRs.insert(HazardReg);
3502 } else {
3503 assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
3504 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
3505 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
3506 }
3507
// Classifies one earlier instruction: the search is over once no SGPR is
// tracked any more; otherwise a hazard is reported when the instruction is one
// of the listed mask readers and its mask is HazardReg (VCC for the first
// list, the src2 operand for the second).
3508 auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
3509 if (State.HazardSGPRs.empty())
3510 return HazardExpired;
3511
3512 switch (I.getOpcode()) {
3513 case AMDGPU::V_ADDC_U32_e32:
3514 case AMDGPU::V_ADDC_U32_dpp:
3515 case AMDGPU::V_CNDMASK_B16_t16_e32:
3516 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3517 case AMDGPU::V_CNDMASK_B16_t16_dpp:
3518 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3519 case AMDGPU::V_CNDMASK_B32_e32:
3520 case AMDGPU::V_CNDMASK_B32_dpp:
3521 case AMDGPU::V_DIV_FMAS_F32_e64:
3522 case AMDGPU::V_DIV_FMAS_F64_e64:
3523 case AMDGPU::V_SUBB_U32_e32:
3524 case AMDGPU::V_SUBB_U32_dpp:
3525 case AMDGPU::V_SUBBREV_U32_e32:
3526 case AMDGPU::V_SUBBREV_U32_dpp: {
3527 // These implicitly read VCC as mask source.
3528 return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
3529 }
3530 case AMDGPU::V_ADDC_U32_e64:
3531 case AMDGPU::V_ADDC_U32_e64_dpp:
3532 case AMDGPU::V_CNDMASK_B16_t16_e64:
3533 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3534 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3535 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3536 case AMDGPU::V_CNDMASK_B32_e64:
3537 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3538 case AMDGPU::V_SUBB_U32_e64:
3539 case AMDGPU::V_SUBB_U32_e64_dpp:
3540 case AMDGPU::V_SUBBREV_U32_e64:
3541 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3542 // Only check mask register overlaps.
3543 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
3544 assert(SSRCOp);
3545 bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
3546 return Result ? HazardFound : NoHazardFound;
3547 }
3548 default:
3549 return NoHazardFound;
3550 }
3551 };
3552
// Bits an existing wait's immediate must have set to be considered mergeable.
// NOTE(review): one line of this initializer (listing line 3554) is missing
// here; the inner call(s) that the trailing `0),` and `0);` arguments close
// are not visible -- restore from upstream before building.
3553 const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
3555 0),
3556 0);
// Runs on each earlier instruction the search steps over: records mergeable
// waits and updates the read/write tracking.
3557 auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
3558 switch (I.getOpcode()) {
3559 case AMDGPU::S_WAITCNT_DEPCTR:
3560 // Record mergable waits within region of instructions free of SGPR reads.
3561 if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
3562 (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
3563 WaitInstrs.push_back(&I);
3564 break;
3565 default:
3566 // Update tracking of SGPR reads and writes.
3567 for (auto &Op : I.operands()) {
3568 if (!Op.isReg())
3569 continue;
3570
3571 Register Reg = Op.getReg();
3572 if (IgnoreableSGPR(Reg))
3573 continue;
3574 if (!IsVCC(Reg)) {
3575 if (Op.isImplicit())
3576 continue;
3577 if (!TRI->isSGPRReg(MRI, Reg))
3578 continue;
3579 }
3580 if (Op.isUse()) {
3581 HasSGPRRead = true;
3582 continue;
3583 }
3584
3585 // Stop tracking any SGPRs with writes on the basis that they will
3586 // already have an appropriate wait inserted afterwards.
// NOTE(review): the declaration of `Found` (listing line 3587) is missing
// here; it is used below as a container of Register values.
3588 for (Register SGPR : State.HazardSGPRs) {
3589 if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
3590 Found.push_back(SGPR);
3591 }
3592 for (Register SGPR : Found)
3593 State.HazardSGPRs.erase(SGPR);
3594 }
3595 break;
3596 }
3597 };
3598
3599 // Check for hazard
// The walk starts at the instruction immediately before MI.
3600 if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
3601 MI->getParent(),
3602 std::next(MI->getReverseIterator())))
3603 return false;
3604
3605 // Compute counter mask
// VALU writers wait on the va_vcc or va_sdst field (depending on whether the
// written register is VCC); SALU writers wait on sa_sdst.
3606 unsigned DepCtr =
3607 IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
3608 : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
3609 : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);
3610
3611 // Try to merge previous waits into this one for regions with no SGPR reads.
3612 if (!WaitInstrs.empty()) {
3613 // Note: WaitInstrs contains const pointers, so walk backward from MI to
3614 // obtain a mutable pointer to each instruction to be merged.
3615 // This is expected to be a very short walk within the same block.
3616 SmallVector<MachineInstr *> ToErase;
3617 unsigned Found = 0;
3618 for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
3619 End = MI->getParent()->rend();
3620 Found < WaitInstrs.size() && It != End; ++It) {
3621 MachineInstr *WaitMI = &*It;
3622 // Find next wait instruction.
3623 if (std::as_const(WaitMI) != WaitInstrs[Found])
3624 continue;
3625 Found++;
3626 unsigned WaitMask = WaitMI->getOperand(0).getImm();
3627 assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
// Keep the smaller value of each of the three fields.
3628 DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
3629 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
3630 AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
3631 DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
3632 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
3633 AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
3634 DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
3635 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
3636 AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
3637 ToErase.push_back(WaitMI);
3638 }
3639 assert(Found == WaitInstrs.size());
// Erase only after the walk so the reverse iterator is never invalidated.
3640 for (MachineInstr *WaitMI : ToErase)
3641 WaitMI->eraseFromParent();
3642 }
3643
3644 // Add s_waitcnt_depctr after SGPR write.
3645 auto NextMI = std::next(MI->getIterator());
3646 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3647 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3648 .addImm(DepCtr);
3649
3650 // SALU write may be s_getpc in a bundle.
3651 updateGetPCBundle(NewMI);
3652
3653 return true;
3654}
3655
3656static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3657 const SIInstrInfo &TII) {
3658 MachineBasicBlock &EntryMBB = MF->front();
3659 if (EntryMBB.begin() != EntryMBB.end()) {
3660 auto &EntryMI = *EntryMBB.begin();
3661 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3662 EntryMI.getOperand(0).getImm() >= Priority)
3663 return false;
3664 }
3665
3666 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3667 .addImm(Priority);
3668 return true;
3669}
3670
/// Apply the export-priority workaround around \p MI on subtargets that
/// report hasRequiredExportPriority(). Depending on MI's opcode this
///  - ensures an entry-block S_SETPRIO when a program-ending instruction is
///    seen in a function that makes calls,
///  - raises the immediate of a low S_SETPRIO, or
///  - after the last export of a run, inserts a priority drop, an optional
///    export-count wait, two S_NOPs and an optional priority restore.
///
/// \returns true if any instruction was inserted or modified.
3671bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3672 if (!ST.hasRequiredExportPriority())
3673 return false;
3674
3675 // Assume the following shader types will never have exports,
3676 // and avoid adding or adjusting S_SETPRIO.
3677 MachineBasicBlock *MBB = MI->getParent();
3678 MachineFunction *MF = MBB->getParent();
3679 auto CC = MF->getFunction().getCallingConv();
// NOTE(review): the case labels naming the exempt calling conventions
// (listing lines 3681-3684) are missing from this listing.
3680 switch (CC) {
3685 return false;
3686 default:
3687 break;
3688 }
3689
3690 const int MaxPriority = 3;
3691 const int NormalPriority = 2;
3692 const int PostExportPriority = 0;
3693
3694 auto It = MI->getIterator();
3695 switch (MI->getOpcode()) {
3696 case AMDGPU::S_ENDPGM:
3697 case AMDGPU::S_ENDPGM_SAVED:
3698 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3699 case AMDGPU::SI_RETURN_TO_EPILOG:
3700 // Ensure shader with calls raises priority at entry.
3701 // This ensures correct priority if exports exist in callee.
3702 if (MF->getFrameInfo().hasCalls())
3703 return ensureEntrySetPrio(MF, NormalPriority, TII);
3704 return false;
3705 case AMDGPU::S_SETPRIO: {
3706 // Raise minimum priority unless in workaround.
// "In workaround" means a PostExportPriority setprio placed directly after an
// export, i.e. the sequence this function itself emits further down.
3707 auto &PrioOp = MI->getOperand(0);
3708 int Prio = PrioOp.getImm();
3709 bool InWA = (Prio == PostExportPriority) &&
3710 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3711 if (InWA || Prio >= NormalPriority)
3712 return false;
3713 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3714 return true;
3715 }
3716 default:
// Anything else is only of interest if it is an export.
3717 if (!TII.isEXP(*MI))
3718 return false;
3719 break;
3720 }
3721
3722 // Check entry priority at each export (as there will only be a few).
3723 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3724 bool Changed = false;
// NOTE(review): the condition guarding the next statement (listing line 3725)
// is missing from this listing.
3726 Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3727
3728 auto NextMI = std::next(It);
3729 bool EndOfShader = false;
3730 if (NextMI != MBB->end()) {
3731 // Only need WA at end of sequence of exports.
3732 if (TII.isEXP(*NextMI))
3733 return Changed;
3734 // Assume appropriate S_SETPRIO after export means WA already applied.
3735 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3736 NextMI->getOperand(0).getImm() == PostExportPriority)
3737 return Changed;
3738 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3739 }
3740
3741 const DebugLoc &DL = MI->getDebugLoc();
3742
// Everything below is inserted before NextMI, so the emitted order is:
// setprio(PostExportPriority), [waitcnt_expcnt], nop, nop,
// [setprio(NormalPriority)]; the bracketed ones are skipped when S_ENDPGM
// follows directly.
3743 // Lower priority.
3744 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3745 .addImm(PostExportPriority);
3746
3747 if (!EndOfShader) {
3748 // Wait for exports to complete.
3749 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3750 .addReg(AMDGPU::SGPR_NULL)
3751 .addImm(0);
3752 }
3753
3754 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3755 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3756
3757 if (!EndOfShader) {
3758 // Return to normal (higher) priority.
3759 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3760 .addImm(NormalPriority);
3761 }
3762
3763 return true;
3764}
3765
/// Insert an S_WAITCNT_DEPCTR with immediate 0 in front of \p MI when MI is
/// an s_getreg (per isSGetReg) that reads one of a fixed set of hardware
/// registers.
///
/// \returns true if the wait was inserted.
3766bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
3767 if (!isSGetReg(MI->getOpcode()))
3768 return false;
3769
3770 const SIInstrInfo *TII = ST.getInstrInfo();
// NOTE(review): the case labels listing the affected hardware registers
// (listing lines 3774-3777) are missing from this listing; every other
// register takes the default branch and needs no wait.
3771 switch (getHWReg(TII, *MI)) {
3772 default:
3773 return false;
3778 break;
3779 }
3780
3781 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3782 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3783 .addImm(0);
3784 return true;
3785}
3786
/// Surround a DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 with two S_WAITCNT_DEPCTR
/// instructions: one inserted before \p MI and one inserted after it.
///
/// \returns true if \p MI had that opcode and the waits were inserted.
3787bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
3788 if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3789 return false;
3790
3791 const SIInstrInfo *TII = ST.getInstrInfo();
// NOTE(review): the line completing each BuildMI statement below (listing
// lines 3794 and 3797, presumably the immediate operand) is missing from this
// listing, so the counter mask being waited on is not visible here.
3792 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3793 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3795 BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
3796 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3798
3799 return true;
3800}
3801
/// Insert an S_WAITCNT_DEPCTR in front of \p MI when MI reads the flat
/// scratch base (low and/or high half) and the corresponding SGPR (SGPR102
/// for low, SGPR103 for high) was modified recently enough by an earlier
/// instruction. Only active in hazard-recognizer mode.
///
/// \returns true if the wait was inserted.
3802bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
3803 // No reason to check this in pre-RA scheduling, SGPRs have to be allocated
3804 // for hazard to trigger.
3805 if (!IsHazardRecognizerMode)
3806 return false;
3807
3808 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3809 const SIInstrInfo *TII = ST.getInstrInfo();
3810 // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
3811 const int FlatScrBaseWaitStates = 10;
3812
3813 bool ReadsFlatScrLo =
3814 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3815 bool ReadsFlatScrHi =
3816 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
// An s_getreg of the matching hardware register also counts as a read.
// NOTE(review): the two case labels (listing lines 3821 and 3824) are missing
// from this listing.
3817 if (isSGetReg(MI->getOpcode())) {
3818 switch (getHWReg(TII, *MI)) {
3819 default:
3820 break;
3822 ReadsFlatScrLo = true;
3823 break;
3825 ReadsFlatScrHi = true;
3826 break;
3827 }
3828 }
3829
3830 const MachineRegisterInfo &MRI = MF.getRegInfo();
3831
// Returns true when an instruction modifying Reg is found before the count of
// SGPR-writing SALU/VALU instructions walked over reaches
// FlatScrBaseWaitStates, or before the walk is ended by IsExpiredFn.
3832 auto IsRegDefHazard = [&](Register Reg) -> bool {
3833 DenseSet<const MachineBasicBlock *> Visited;
3834 auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
3835 return MI.modifiesRegister(Reg, TRI);
3836 };
3837
3838 // This literally abuses the idea of waitstates. Instead of waitstates it
3839 // returns 1 for SGPR written and 0 otherwise.
3840 auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
3841 if (!TII->isSALU(MI) && !TII->isVALU(MI))
3842 return 0;
3843 for (const MachineOperand &MO : MI.all_defs()) {
3844 if (TRI->isSGPRReg(MRI, MO.getReg()))
3845 return 1;
3846 }
3847 return 0;
3848 };
3849
// NOTE(review): the condition on `Wait` that guards `return true` (listing
// lines 3853-3854) is missing from this listing.
3850 auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
3851 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3852 unsigned Wait = MI.getOperand(0).getImm();
3855 return true;
3856 }
3857 return SgprWrites >= FlatScrBaseWaitStates;
3858 };
3859
3860 return ::getWaitStatesSince(
3861 IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
3862 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
3863 };
3864
// No fix is needed when, for each half that MI reads, the backing SGPR is
// either a constant physical register or has no recent definition.
3865 if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
3866 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3867 (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
3868 !IsRegDefHazard(AMDGPU::SGPR103)))
3869 return false;
3870
// NOTE(review): the line(s) completing this BuildMI statement (listing lines
// 3873-3874, presumably the immediate operand) are missing from this listing.
3871 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3872 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3875 return true;
3876}
3877
3878bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3879 if (!isSSetReg(MI->getOpcode()) ||
3880 MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
3881 return false;
3882
3883 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3884 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3885 return true;
3886}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static cl::opt< bool > EnableWMMAVnopHoisting("amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden, cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"))
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand Dst, which has a dest sel forwarding issue.
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isLdsDma(const MachineInstr &MI)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static unsigned getWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII, const TargetSchedModel &SchedModel)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static bool hasHazard(StateT InitialState, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *InitialMBB, MachineBasicBlock::const_reverse_instr_iterator InitialI)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool isCoexecutableVALUInst(const MachineInstr &MI)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden, cl::desc("Insert a s_nop x before every instruction"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static llvm::Error parse(GsymDataExtractor &Data, uint64_t BaseAddr, LineEntryCallback const &Callback)
Definition LineTable.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
unsigned get(InstCounterType T) const
BitVector & set()
Set all bits in the bitvector.
Definition BitVector.h:366
A debug info location.
Definition DebugLoc.h:124
std::pair< iterator, bool > insert_as(std::pair< KeyT, ValueT > &&KV, const LookupKeyT &Val)
Alternate version of insert() which allows a different, and possibly less expensive,...
Definition DenseMap.h:319
Implements a dense probed hash-table based set.
Definition DenseSet.h:289
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272
unsigned getHazardWaitStates(MachineInstr *MI) const
Returns the number of wait states until all hazards for MI are resolved.
unsigned PreEmitNoopsCommon(MachineInstr *) const
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be scheduled.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
bool ShouldPreferAnother(SUnit *SU) const override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
GCNHazardRecognizer(const MachineFunction &MF, MachineLoopInfo *MLI=nullptr)
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
Instructions::const_reverse_iterator const_reverse_instr_iterator
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
Instructions::iterator instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction is part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool isConstantPhysReg(MCRegister PhysReg) const
Returns true if PhysReg is unallocatable and constant throughout the function.
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isWaitcnt(unsigned Opcode)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isMAI(const MCInstrDesc &Desc)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:490
Provide an instruction scheduling machine model to CodeGen passes.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
A range adaptor for a pair of iterators.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc)
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
@ Entry
Definition COFF.h:862
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
This namespace contains all of the command line option processing machinery.
Definition MCSchedule.h:35
initializer< Ty > init(const Ty &Val)
constexpr double e
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Wait
Definition Threading.h:60
constexpr RegState getDeadRegState(bool B)
Op::Description Desc
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
DWARFExpression::Operation Op
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:305
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:285
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
An information struct used to provide DenseMap with the various necessary components for a given valu...