LLVM 23.0.0git
GCNHazardRecognizer.cpp
Go to the documentation of this file.
1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "AMDGPUWaitcntUtils.h"
15#include "GCNSubtarget.h"
18#include "llvm/ADT/Statistic.h"
23#include "llvm/Support/Debug.h"
25
26using namespace llvm;
27
28#define DEBUG_TYPE "gcn-hazard-recognizer"
29
30STATISTIC(NumWMMANopsHoisted,
31 "Number of WMMA hazard V_NOPs hoisted from loops");
32STATISTIC(NumWMMAHoistingBailed,
33 "Number of WMMA hazards where V_NOP hoisting was not possible");
34
35namespace {
36
37struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
38 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
39
40 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
41 if (Arg.getAsInteger(0, Value))
42 return O.error("'" + Arg + "' value invalid for uint argument!");
43
44 if (Value > 100)
45 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
46
47 return false;
48 }
49};
50
51} // end anonymous namespace
52
54 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
55 cl::desc("Fill a percentage of the latency between "
56 "neighboring MFMA with s_nops."));
57
58// This is intended for debugging purposes only.
60 NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
61 cl::desc("Insert a s_nop x before every instruction"));
62
64 "amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden,
65 cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
66
67//===----------------------------------------------------------------------===//
68// Hazard Recognizer Implementation
69//===----------------------------------------------------------------------===//
70
72 const GCNSubtarget &ST);
73
75 MachineLoopInfo *MLI)
76 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
77 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
78 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
79 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
80 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
81 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
82}
83
85 EmittedInstrs.clear();
86}
87
91
93 CurrCycleInstr = MI;
94}
95
96static bool isDivFMas(unsigned Opcode) {
97 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
98}
99
100static bool isSGetReg(unsigned Opcode) {
101 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
102}
103
104static bool isSSetReg(unsigned Opcode) {
105 switch (Opcode) {
106 case AMDGPU::S_SETREG_B32:
107 case AMDGPU::S_SETREG_B32_mode:
108 case AMDGPU::S_SETREG_IMM32_B32:
109 case AMDGPU::S_SETREG_IMM32_B32_mode:
110 return true;
111 }
112 return false;
113}
114
115static bool isRWLane(unsigned Opcode) {
116 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
117}
118
119static bool isRFE(unsigned Opcode) {
120 return Opcode == AMDGPU::S_RFE_B64;
121}
122
123static bool isSMovRel(unsigned Opcode) {
124 switch (Opcode) {
125 case AMDGPU::S_MOVRELS_B32:
126 case AMDGPU::S_MOVRELS_B64:
127 case AMDGPU::S_MOVRELD_B32:
128 case AMDGPU::S_MOVRELD_B64:
129 return true;
130 default:
131 return false;
132 }
133}
134
136 const MachineInstr &MI) {
137 if (TII.isAlwaysGDS(MI.getOpcode()))
138 return true;
139
140 switch (MI.getOpcode()) {
141 case AMDGPU::S_SENDMSG:
142 case AMDGPU::S_SENDMSGHALT:
143 case AMDGPU::S_TTRACEDATA:
144 return true;
145 // These DS opcodes don't support GDS.
146 case AMDGPU::DS_NOP:
147 case AMDGPU::DS_PERMUTE_B32:
148 case AMDGPU::DS_BPERMUTE_B32:
149 return false;
150 default:
151 if (TII.isDS(MI.getOpcode())) {
152 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
153 AMDGPU::OpName::gds);
154 if (MI.getOperand(GDS).getImm())
155 return true;
156 }
157 return false;
158 }
159}
160
161static bool isPermlane(const MachineInstr &MI) {
162 unsigned Opcode = MI.getOpcode();
163 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
164 Opcode == AMDGPU::V_PERMLANE64_B32 ||
165 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
167 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
169 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
170 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
171 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
173 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
175 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
176 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
177}
178
179static bool isLdsDma(const MachineInstr &MI) {
180 return SIInstrInfo::isVALU(MI) &&
182}
183
184static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
185 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
186 AMDGPU::OpName::simm16);
187 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
188}
189
192 MachineInstr *MI = SU->getInstr();
193 // If we are not in "HazardRecognizerMode" and therefore not being run from
194 // the scheduler, track possible stalls from hazards but don't insert noops.
195 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
196
197 if (MI->isBundle())
198 return NoHazard;
199
200 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
201 return HazardType;
202
203 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
204 return HazardType;
205
206 if (checkFPAtomicToDenormModeHazard(MI) > 0)
207 return HazardType;
208
209 // Hazards which cannot be mitigated with S_NOPs.
210 if (!IsHazardRecognizerMode) {
211 if (checkWMMACoexecutionHazards(MI) > 0)
212 return Hazard;
213 }
214
215 if (ST.hasNoDataDepHazard())
216 return NoHazard;
217
218 if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
219 return HazardType;
220
221 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
222 return HazardType;
223
224 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
225 return HazardType;
226
227 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
228 return HazardType;
229
230 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
231 return HazardType;
232
235 checkMAIVALUHazards(MI) > 0)
236 return HazardType;
237
238 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
239 return HazardType;
240
241 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
242 return HazardType;
243
244 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
245 return HazardType;
246
247 if (((ST.hasReadM0MovRelInterpHazard() &&
248 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
249 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
250 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
251 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
252 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
253 (ST.hasReadM0LdsDirectHazard() &&
254 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
255 checkReadM0Hazards(MI) > 0)
256 return HazardType;
257
258 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
259 return HazardType;
260
262 checkMAILdStHazards(MI) > 0)
263 return HazardType;
264
265 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
266 return HazardType;
267
268 return NoHazard;
269}
270
272 unsigned Quantity) {
273 while (Quantity > 0) {
274 unsigned Arg = std::min(Quantity, 8u);
275 Quantity -= Arg;
276 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
277 .addImm(Arg - 1);
278 }
279}
280
281unsigned
282GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
283 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
284 assert(TSchedModel.getWriteProcResBegin(SC) !=
285 TSchedModel.getWriteProcResEnd(SC));
286 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
287}
288
289void GCNHazardRecognizer::processBundle() {
290 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
291 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
292 // Check bundled MachineInstr's for hazards.
293 for (; MI != E && MI->isInsideBundle(); ++MI) {
294 CurrCycleInstr = &*MI;
295 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
296
297 if (IsHazardRecognizerMode) {
298 fixHazards(CurrCycleInstr);
299
300 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
301 }
302
303 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
304 // include the bundled MI directly after, only add a maximum of
305 // (MaxLookAhead - 1) noops to EmittedInstrs.
306 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
307 EmittedInstrs.push_front(nullptr);
308
309 EmittedInstrs.push_front(CurrCycleInstr);
310 EmittedInstrs.resize(MaxLookAhead);
311 }
312 CurrCycleInstr = nullptr;
313}
314
315void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
316 assert(IsHazardRecognizerMode);
317
318 unsigned NumPreNoops = PreEmitNoops(MI);
319 EmitNoops(NumPreNoops);
320 if (MI->isInsideBundle())
321 insertNoopsInBundle(MI, TII, NumPreNoops);
322 else
323 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
324 NumPreNoops);
326 AdvanceCycle();
327}
328
330 IsHazardRecognizerMode = true;
331 CurrCycleInstr = MI;
332 unsigned W = PreEmitNoopsCommon(MI);
333 fixHazards(MI);
334 CurrCycleInstr = nullptr;
335 return std::max(W, NopPadding.getValue());
336}
337
341
343 if (MI->isBundle())
344 return 0;
345
346 int WaitStates = 0;
347
349 return std::max(WaitStates, checkSMRDHazards(MI));
350
351 if (ST.hasNSAtoVMEMBug())
352 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
353
354 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
355
356 if (ST.hasNoDataDepHazard())
357 return WaitStates;
358
360 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
361
363 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
364
366 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
367
368 if (isDivFMas(MI->getOpcode()))
369 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
370
371 if (isRWLane(MI->getOpcode()))
372 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
373
376 checkMAIVALUHazards(MI) > 0)
377 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
378
379 if (MI->isInlineAsm())
380 return std::max(WaitStates, checkInlineAsmHazards(MI));
381
382 if (isSGetReg(MI->getOpcode()))
383 return std::max(WaitStates, checkGetRegHazards(MI));
384
385 if (isSSetReg(MI->getOpcode()))
386 return std::max(WaitStates, checkSetRegHazards(MI));
387
388 if (isRFE(MI->getOpcode()))
389 return std::max(WaitStates, checkRFEHazards(MI));
390
391 if ((ST.hasReadM0MovRelInterpHazard() &&
392 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
393 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
394 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
395 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
396 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
397 (ST.hasReadM0LdsDirectHazard() &&
398 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
399 return std::max(WaitStates, checkReadM0Hazards(MI));
400
402 return std::max(WaitStates, checkMAIHazards(MI));
403
405 return std::max(WaitStates, checkMAILdStHazards(MI));
406
407 if (ST.hasGFX950Insts() && isPermlane(*MI))
408 return std::max(WaitStates, checkPermlaneHazards(MI));
409
410 return WaitStates;
411}
412
414 EmittedInstrs.push_front(nullptr);
415}
416
418 // When the scheduler detects a stall, it will call AdvanceCycle() without
419 // emitting any instructions.
420 if (!CurrCycleInstr) {
421 EmittedInstrs.push_front(nullptr);
422 return;
423 }
424
425 if (CurrCycleInstr->isBundle()) {
426 processBundle();
427 return;
428 }
429
430 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
431 if (!NumWaitStates) {
432 CurrCycleInstr = nullptr;
433 return;
434 }
435
436 // Keep track of emitted instructions
437 EmittedInstrs.push_front(CurrCycleInstr);
438
439 // Add a nullptr for each additional wait state after the first. Make sure
440 // not to add more than getMaxLookAhead() items to the list, since we
441 // truncate the list to that size right after this loop.
442 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
443 i < e; ++i) {
444 EmittedInstrs.push_front(nullptr);
445 }
446
447 // getMaxLookahead() is the largest number of wait states we will ever need
448 // to insert, so there is no point in keeping track of more than that many
449 // wait states.
450 EmittedInstrs.resize(getMaxLookAhead());
451
452 CurrCycleInstr = nullptr;
453}
454
456 assert(!IsHazardRecognizerMode &&
457 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
458}
459
460//===----------------------------------------------------------------------===//
461// Helper Functions
462//===----------------------------------------------------------------------===//
463
465
466// Search for a hazard in a block and its predecessors.
467template <typename StateT>
468static bool
469hasHazard(StateT InitialState,
470 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
471 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
472 const MachineBasicBlock *InitialMBB,
474 struct StateMapKey {
476 unsigned Idx;
477 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
478 return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
479 }
480 };
481 struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
482 static inline StateMapKey getEmptyKey() {
483 return {static_cast<SmallVectorImpl<StateT> *>(
486 }
487 static inline StateMapKey getTombstoneKey() {
488 return {static_cast<SmallVectorImpl<StateT> *>(
491 }
492 static unsigned getHashValue(const StateMapKey &Key) {
493 return StateT::getHashValue((*Key.States)[Key.Idx]);
494 }
495 static unsigned getHashValue(const StateT &State) {
496 return StateT::getHashValue(State);
497 }
498 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
499 const auto EKey = getEmptyKey();
500 const auto TKey = getTombstoneKey();
501 if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) ||
502 StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey))
503 return StateMapKey::isEqual(LHS, RHS);
504 return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
505 }
506 static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
507 if (StateMapKey::isEqual(RHS, getEmptyKey()) ||
508 StateMapKey::isEqual(RHS, getTombstoneKey()))
509 return false;
510 return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
511 }
512 };
513
516
518 const MachineBasicBlock *MBB = InitialMBB;
519 StateT State = InitialState;
520
522 unsigned WorkIdx = 0;
523 for (;;) {
524 bool Expired = false;
525 for (auto E = MBB->instr_rend(); I != E; ++I) {
526 // No need to look at parent BUNDLE instructions.
527 if (I->isBundle())
528 continue;
529
530 auto Result = IsHazard(State, *I);
531 if (Result == HazardFound)
532 return true;
533 if (Result == HazardExpired) {
534 Expired = true;
535 break;
536 }
537
538 if (I->isInlineAsm() || I->isMetaInstruction())
539 continue;
540
541 UpdateState(State, *I);
542 }
543
544 if (!Expired) {
545 unsigned StateIdx = States.size();
546 StateMapKey Key = {&States, StateIdx};
547 auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
548 if (Insertion.second) {
549 States.emplace_back(State);
550 } else {
551 StateIdx = Insertion.first->second;
552 }
553 for (MachineBasicBlock *Pred : MBB->predecessors())
554 Worklist.insert(std::pair(Pred, StateIdx));
555 }
556
557 if (WorkIdx == Worklist.size())
558 break;
559
560 unsigned StateIdx;
561 std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
562 State = States[StateIdx];
563 I = MBB->instr_rbegin();
564 }
565
566 return false;
567}
568
569// Returns a minimum wait states since \p I walking all predecessors.
570// Only scans until \p IsExpired does not return true.
571// Can only be run in a hazard recognizer mode.
572static int
574 const MachineBasicBlock *MBB,
576 int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
580 for (auto E = MBB->instr_rend(); I != E; ++I) {
581 // Don't add WaitStates for parent BUNDLE instructions.
582 if (I->isBundle())
583 continue;
584
585 if (IsHazard(*I))
586 return WaitStates;
587
588 if (I->isInlineAsm())
589 continue;
590
591 WaitStates += GetNumWaitStates(*I);
592
593 if (IsExpired(*I, WaitStates))
594 return std::numeric_limits<int>::max();
595 }
596
597 int MinWaitStates = std::numeric_limits<int>::max();
598 for (MachineBasicBlock *Pred : MBB->predecessors()) {
599 if (!Visited.insert(Pred).second)
600 continue;
601
602 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
603 IsExpired, Visited, GetNumWaitStates);
604
605 MinWaitStates = std::min(MinWaitStates, W);
606 }
607
608 return MinWaitStates;
609}
610
611static int
613 const MachineInstr *MI,
618 return getWaitStatesSince(IsHazard, MI->getParent(),
619 std::next(MI->getReverseIterator()), 0, IsExpired,
620 Visited, GetNumWaitStates);
621}
622
623int GCNHazardRecognizer::getWaitStatesSince(
624 IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) const {
625 if (IsHazardRecognizerMode) {
626 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
627 return WaitStates >= Limit;
628 };
629 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
630 GetNumWaitStates);
631 }
632
633 int WaitStates = 0;
634 for (MachineInstr *MI : EmittedInstrs) {
635 if (MI) {
636 if (IsHazard(*MI))
637 return WaitStates;
638
639 if (MI->isInlineAsm())
640 continue;
641 }
642 WaitStates += MI ? GetNumWaitStates(*MI) : 1;
643
644 if (WaitStates >= Limit)
645 break;
646 }
647 return std::numeric_limits<int>::max();
648}
649
650int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
651 int Limit) const {
652 return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates);
653}
654
655int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
656 IsHazardFn IsHazardDef,
657 int Limit) const {
658 const SIRegisterInfo *TRI = ST.getRegisterInfo();
659
660 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
661 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
662 };
663
664 return getWaitStatesSince(IsHazardFn, Limit);
665}
666
667int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
668 int Limit) const {
669 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
670 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
671 };
672
673 return getWaitStatesSince(IsHazardFn, Limit);
674}
675
676//===----------------------------------------------------------------------===//
677// No-op Hazard Detection
678//===----------------------------------------------------------------------===//
679
680static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
681 MCRegister Reg) {
682 for (MCRegUnit Unit : TRI.regunits(Reg))
683 BV.set(static_cast<unsigned>(Unit));
684}
685
686static void addRegsToSet(const SIRegisterInfo &TRI,
688 BitVector &DefSet, BitVector &UseSet) {
689 for (const MachineOperand &Op : Ops) {
690 if (Op.isReg())
691 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
692 }
693}
694
695void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) const {
696 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
697}
698
700 return !SIInstrInfo::isSMRD(*MI);
701}
702
704 return !SIInstrInfo::isVMEM(*MI);
705}
706
707int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) const {
708 // SMEM soft clause are only present on VI+, and only matter if xnack is
709 // enabled.
710 if (!ST.isXNACKEnabled())
711 return 0;
712
713 bool IsSMRD = TII.isSMRD(*MEM);
714
715 resetClause();
716
717 // A soft-clause is any group of consecutive SMEM instructions. The
718 // instructions in this group may return out of order and/or may be
719 // replayed (i.e. the same instruction issued more than once).
720 //
721 // In order to handle these situations correctly we need to make sure that
722 // when a clause has more than one instruction, no instruction in the clause
723 // writes to a register that is read by another instruction in the clause
724 // (including itself). If we encounter this situation, we need to break the
725 // clause by inserting a non SMEM instruction.
726
727 for (MachineInstr *MI : EmittedInstrs) {
728 // When we hit a non-SMEM instruction then we have passed the start of the
729 // clause and we can stop.
730 if (!MI)
731 break;
732
734 break;
735
736 addClauseInst(*MI);
737 }
738
739 if (ClauseDefs.none())
740 return 0;
741
742 // We need to make sure not to put loads and stores in the same clause if they
743 // use the same address. For now, just start a new clause whenever we see a
744 // store.
745 if (MEM->mayStore())
746 return 1;
747
748 addClauseInst(*MEM);
749
750 // If the set of defs and uses intersect then we cannot add this instruction
751 // to the clause, so we have a hazard.
752 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
753}
754
755int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) const {
756 int WaitStatesNeeded = 0;
757
758 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
759
760 // This SMRD hazard only affects SI.
761 if (!ST.hasSMRDReadVALUDefHazard())
762 return WaitStatesNeeded;
763
764 // A read of an SGPR by SMRD instruction requires 4 wait states when the
765 // SGPR was written by a VALU instruction.
766 int SmrdSgprWaitStates = 4;
767 auto IsHazardDefFn = [this](const MachineInstr &MI) {
768 return TII.isVALU(MI);
769 };
770 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
771 return TII.isSALU(MI);
772 };
773
774 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
775
776 for (const MachineOperand &Use : SMRD->uses()) {
777 if (!Use.isReg())
778 continue;
779 int WaitStatesNeededForUse =
780 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
781 SmrdSgprWaitStates);
782 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
783
784 // This fixes what appears to be undocumented hardware behavior in SI where
785 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
786 // needs some number of nops in between. We don't know how many we need, but
787 // let's use 4. This wasn't discovered before probably because the only
788 // case when this happens is when we expand a 64-bit pointer into a full
789 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
790 // probably never encountered in the closed-source land.
791 if (IsBufferSMRD) {
792 int WaitStatesNeededForUse =
793 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
794 IsBufferHazardDefFn,
795 SmrdSgprWaitStates);
796 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
797 }
798 }
799
800 return WaitStatesNeeded;
801}
802
803int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) const {
804 if (!ST.hasVMEMReadSGPRVALUDefHazard())
805 return 0;
806
807 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
808
809 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
810 // SGPR was written by a VALU Instruction.
811 const int VmemSgprWaitStates = 5;
812 auto IsHazardDefFn = [this](const MachineInstr &MI) {
813 return TII.isVALU(MI);
814 };
815 for (const MachineOperand &Use : VMEM->uses()) {
816 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
817 continue;
818
819 int WaitStatesNeededForUse =
820 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
821 VmemSgprWaitStates);
822 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
823 }
824 return WaitStatesNeeded;
825}
826
827int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) const {
828 const SIRegisterInfo *TRI = ST.getRegisterInfo();
829 const SIInstrInfo *TII = ST.getInstrInfo();
830
831 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
832 int DppVgprWaitStates = 2;
833 int DppExecWaitStates = 5;
834 int WaitStatesNeeded = 0;
835 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
836 return TII->isVALU(MI);
837 };
838
839 for (const MachineOperand &Use : DPP->uses()) {
840 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
841 continue;
842 int WaitStatesNeededForUse =
843 DppVgprWaitStates - getWaitStatesSinceDef(
844 Use.getReg(),
845 [](const MachineInstr &) { return true; },
846 DppVgprWaitStates);
847 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
848 }
849
850 WaitStatesNeeded = std::max(
851 WaitStatesNeeded,
852 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
853 DppExecWaitStates));
854
855 return WaitStatesNeeded;
856}
857
858int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) const {
859 const SIInstrInfo *TII = ST.getInstrInfo();
860
861 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
862 // instruction.
863 const int DivFMasWaitStates = 4;
864 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
865 return TII->isVALU(MI);
866 };
867 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
868 DivFMasWaitStates);
869
870 return DivFMasWaitStates - WaitStatesNeeded;
871}
872
873int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) const {
874 const SIInstrInfo *TII = ST.getInstrInfo();
875 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
876
877 const int GetRegWaitStates = 2;
878 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
879 return GetRegHWReg == getHWReg(TII, MI);
880 };
881 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
882
883 return GetRegWaitStates - WaitStatesNeeded;
884}
885
886int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) const {
887 const SIInstrInfo *TII = ST.getInstrInfo();
888 unsigned HWReg = getHWReg(TII, *SetRegInstr);
889
890 const int SetRegWaitStates = ST.getSetRegWaitStates();
891 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
892 return HWReg == getHWReg(TII, MI);
893 };
894 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
895 return SetRegWaitStates - WaitStatesNeeded;
896}
897
898int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
899 if (!MI.mayStore())
900 return -1;
901
902 const SIInstrInfo *TII = ST.getInstrInfo();
903 unsigned Opcode = MI.getOpcode();
904 const MCInstrDesc &Desc = MI.getDesc();
905
906 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
907 int VDataRCID = -1;
908 if (VDataIdx != -1)
909 VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);
910
911 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
912 // There is no hazard if the instruction does not use vector regs
913 // (like wbinvl1)
914 if (VDataIdx == -1)
915 return -1;
916 // For MUBUF/MTBUF instructions this hazard only exists if the
917 // instruction is not using a register in the soffset field.
918 const MachineOperand *SOffset =
919 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
920 // If we have no soffset operand, then assume this field has been
921 // hardcoded to zero.
922 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
923 (!SOffset || !SOffset->isReg()))
924 return VDataIdx;
925 }
926
927 // MIMG instructions create a hazard if they don't use a 256-bit T# and
928 // the store size is greater than 8 bytes and they have more than two bits
929 // of their dmask set.
930 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
931 if (TII->isMIMG(MI)) {
932 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
933 assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
934 Desc.operands()[SRsrcIdx])) == 256);
935 (void)SRsrcIdx;
936 }
937
938 if (TII->isFLAT(MI)) {
939 // There is no hazard if the instruction does not use vector regs
940 if (VDataIdx == -1)
941 return -1;
942
943 if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
944 return VDataIdx;
945 }
946
947 return -1;
948}
949
950int GCNHazardRecognizer::checkVALUHazardsHelper(
951 const MachineOperand &Def, const MachineRegisterInfo &MRI) const {
952 // Helper to check for the hazard where VMEM instructions that store more than
953 // 8 bytes can have there store data over written by the next instruction.
954 const SIRegisterInfo *TRI = ST.getRegisterInfo();
955
956 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
957 int WaitStatesNeeded = 0;
958
959 if (!TRI->isVectorRegister(MRI, Def.getReg()))
960 return WaitStatesNeeded;
961 Register Reg = Def.getReg();
962 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
963 int DataIdx = createsVALUHazard(MI);
964 return DataIdx >= 0 &&
965 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
966 };
967
968 int WaitStatesNeededForDef =
969 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
970 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
971
972 return WaitStatesNeeded;
973}
974
975/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
976/// pack the computed value into correct bit position of the dest register. This
977/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
978/// dst_sel that is not aligned to the register. This function analayzes the \p
979/// MI and \returns an operand with dst forwarding issue, or nullptr if
980/// none exists.
981static const MachineOperand *
984 return nullptr;
985
986 const SIInstrInfo *TII = ST.getInstrInfo();
987
988 unsigned Opcode = MI.getOpcode();
989
990 // There are three different types of instructions
991 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
992 // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
993 // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
994 // op_sel[3:2]
995 // != 0
996 if (SIInstrInfo::isSDWA(MI)) {
997 // Type 1: SDWA with dst_sel != DWORD
998 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
999 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
1000 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1001 }
1002
1003 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
1004 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
1005 // Type 2: VOP3 which write the hi bits
1006 if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
1008 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1009
1010 // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
1011 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
1012 (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
1014 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1015 }
1016
1017 // Special case: nop is required for all the opsel values for fp4 sr variant
1018 // cvt scale instructions
1019 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
1020 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1021
1022 return nullptr;
1023}
1024
1025/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
1026/// forwarding issue \p Dst . We may "consume" the Dst via a standard explicit
1027/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW)
1029 const MachineOperand *Dst,
1030 const SIRegisterInfo *TRI) {
1031 // We must consider implicit reads of the VALU. SDWA with dst_sel and
1032 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
1033 // and we must account for that hazard.
1034 // We also must account for WAW hazards. In particular, WAW with dest
1035 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
1036 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
1037 // check for ECC. Without accounting for this hazard, the ECC will be
1038 // wrong.
1039 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
1040 // complete zeroesHigh16BitsOfDest)
 // Conservative: any register operand of the VALU overlapping Dst counts
 // as a consumer (covers explicit RAW, implicit RAW, and WAW).
1041 for (auto &Operand : VALU->operands()) {
1042 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1043 return true;
1044 }
1045 }
1046 return false;
1047}
1048
/// Compute the number of wait states required before \p VALU to cover the
/// VALU data hazards handled here: TRANS-result forwarding, dst_sel /
/// cvt-scale forwarding, VALU-writes-SGPR (incl. VCC/EXEC for readlane /
/// writelane) read hazards, and the 12-dword-store hazard.
1049int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) const {
1050 int WaitStatesNeeded = 0;
1051
1052 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
1053 const int TransDefWaitstates = 1;
1054
1055 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
1057 return false;
1058 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1059 const SIInstrInfo *TII = ST.getInstrInfo();
1060 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
1061
 // Hazard if any explicit use of the candidate VALU overlaps the
 // TRANS def.
1062 for (const MachineOperand &Use : VALU->explicit_uses()) {
1063 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
1064 return true;
1065 }
1066
1067 return false;
1068 };
1069
1070 int WaitStatesNeededForDef =
1071 TransDefWaitstates -
1072 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1073 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1074 }
1075
1076 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1077 const int Shift16DefWaitstates = 1;
1078
1079 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1080 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1081 const MachineOperand *ForwardedDst =
1082 getDstSelForwardingOperand(ProducerMI, ST);
1083 if (ForwardedDst) {
1084 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
1085 }
1086
1087 if (ProducerMI.isInlineAsm()) {
1088 // Assume inline asm has dst forwarding hazard
1089 for (auto &Def : ProducerMI.all_defs()) {
1090 if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1091 return true;
1092 }
1093 }
1094
1095 return false;
1096 };
1097
1098 int WaitStatesNeededForDef =
1099 Shift16DefWaitstates -
1100 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1101 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1102 }
1103
1104 if (ST.hasVDecCoExecHazard()) {
1105 const int VALUWriteSGPRVALUReadWaitstates = 2;
1106 const int VALUWriteEXECRWLane = 4;
1107 const int VALUWriteVGPRReadlaneRead = 1;
1108
1109 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1110 const MachineRegisterInfo &MRI = MF.getRegInfo();
 // UseReg is captured by reference so the same predicate can be reused
 // for each register checked below.
1112 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1113 if (!SIInstrInfo::isVALU(MI))
1114 return false;
1115 return MI.modifiesRegister(UseReg, TRI);
1116 };
1117
1118 for (const MachineOperand &Use : VALU->explicit_uses()) {
1119 if (!Use.isReg())
1120 continue;
1121
1122 UseReg = Use.getReg();
1123 if (TRI->isSGPRReg(MRI, UseReg)) {
1124 int WaitStatesNeededForDef =
1125 VALUWriteSGPRVALUReadWaitstates -
1126 getWaitStatesSince(IsVALUDefSGPRFn,
1127 VALUWriteSGPRVALUReadWaitstates);
1128 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1129 }
1130 }
1131
 // VCC may be read implicitly; handle it separately from explicit uses.
1132 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1133 UseReg = AMDGPU::VCC;
1134 int WaitStatesNeededForDef =
1135 VALUWriteSGPRVALUReadWaitstates -
1136 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1137 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1138 }
1139
1140 switch (VALU->getOpcode()) {
1141 case AMDGPU::V_READLANE_B32:
1142 case AMDGPU::V_READFIRSTLANE_B32: {
1143 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1144 UseReg = Src->getReg();
1145 int WaitStatesNeededForDef =
1146 VALUWriteVGPRReadlaneRead -
1147 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1148 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1149 }
 // Readlane variants also read EXEC; fall through to the EXEC check.
1150 [[fallthrough]];
1151 case AMDGPU::V_WRITELANE_B32: {
1152 UseReg = AMDGPU::EXEC;
1153 int WaitStatesNeededForDef =
1154 VALUWriteEXECRWLane -
1155 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1156 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1157 break;
1158 }
1159 default:
1160 break;
1161 }
1162 }
1163
1164 // This checks for the hazard where VMEM instructions that store more than
1165 // 8 bytes can have their store data overwritten by the next instruction.
1166 if (!ST.has12DWordStoreHazard())
1167 return WaitStatesNeeded;
1168
1169 const MachineRegisterInfo &MRI = MF.getRegInfo();
1170
1171 for (const MachineOperand &Def : VALU->defs()) {
1172 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1173 }
1174
1175 return WaitStatesNeeded;
1176}
1177
1178int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) const {
1179 // This checks for hazards associated with inline asm statements.
1180 // Since inline asms can contain just about anything, we use this
1181 // to call/leverage other check*Hazard routines. Note that
1182 // this function doesn't attempt to address all possible inline asm
1183 // hazards (good luck), but is a collection of what has been
1184 // problematic thus far.
1185
1186 // see checkVALUHazards()
1187 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1188 !ST.hasCvtScaleForwardingHazard())
1189 return 0;
1190
1191 const MachineRegisterInfo &MRI = MF.getRegInfo();
1192 int WaitStatesNeeded = 0;
1193
 // Vector-register defs of the asm are subject to the 12-dword-store check.
1194 for (const MachineOperand &Op :
1196 if (Op.isReg() && Op.isDef()) {
1197 if (!TRI.isVectorRegister(MRI, Op.getReg()))
1198 continue;
1199
1200 if (ST.has12DWordStoreHazard()) {
1201 WaitStatesNeeded =
1202 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1203 }
1204 }
1205 }
1206
 // Also guard against dst_sel forwarding from producers feeding this asm.
1207 if (ST.hasDstSelForwardingHazard()) {
1208 const int Shift16DefWaitstates = 1;
1209
1210 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1211 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1212 // Assume inline asm reads the dst
1213 if (Dst)
1214 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1215 IA->readsRegister(Dst->getReg(), &TRI);
1216
1217 if (ProducerMI.isInlineAsm()) {
1218 // If MI is inline asm, assume it has dst forwarding hazard
1219 for (auto &Def : ProducerMI.all_defs()) {
1220 if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1221 IA->readsRegister(Def.getReg(), &TRI)) {
1222 return true;
1223 }
1224 }
1225 }
1226
1227 return false;
1228 };
1229
1230 int WaitStatesNeededForDef =
1231 Shift16DefWaitstates -
1232 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1233 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1234 }
1235
1236 return WaitStatesNeeded;
1237}
1238
1239int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) const {
1240 const SIInstrInfo *TII = ST.getInstrInfo();
1241 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1242 const MachineRegisterInfo &MRI = MF.getRegInfo();
1243
1244 const MachineOperand *LaneSelectOp =
1245 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1246
1247 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1248 return 0;
1249
1250 Register LaneSelectReg = LaneSelectOp->getReg();
1251 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1252
1253 const int RWLaneWaitStates = 4;
1254 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1255 RWLaneWaitStates);
1256 return RWLaneWaitStates - WaitStatesSince;
1257}
1258
1259int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) const {
1260 if (!ST.hasRFEHazards())
1261 return 0;
1262
1263 const SIInstrInfo *TII = ST.getInstrInfo();
1264
1265 const int RFEWaitStates = 1;
1266
1267 auto IsHazardFn = [TII](const MachineInstr &MI) {
1268 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1269 };
1270 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1271 return RFEWaitStates - WaitStatesNeeded;
1272}
1273
1274int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) const {
1275 const SIInstrInfo *TII = ST.getInstrInfo();
1276 const int ReadM0WaitStates = 1;
1277 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1278 return ReadM0WaitStates -
1279 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1280}
1281
/// Emit \p WaitStatesNeeded V_NOP_e32 instructions into \p MBB at the
/// insertion point. When \p IsHoisting, an empty DebugLoc is used instead of
/// the insertion point's debug location.
1282void GCNHazardRecognizer::emitVNops(MachineBasicBlock &MBB,
1284 int WaitStatesNeeded, bool IsHoisting) {
1285 const DebugLoc &DL = IsHoisting ? DebugLoc() : InsertPt->getDebugLoc();
1286 for (int I = 0; I < WaitStatesNeeded; ++I)
1287 BuildMI(MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
1288}
1289
/// Apply all pre-emit hazard fixups to \p MI. Fixups wrapped in subtarget
/// checks run only when the corresponding hardware quirk is present; the
/// others internally check their own applicability.
1290void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1291 fixVMEMtoScalarWriteHazards(MI);
1292 fixVcmpxPermlaneHazards(MI);
1293 fixSMEMtoVectorWriteHazards(MI);
1294 fixVcmpxExecWARHazard(MI);
1295 fixLdsBranchVmemWARHazard(MI);
1296 if (ST.hasLdsDirect()) {
1297 fixLdsDirectVALUHazard(MI);
1298 fixLdsDirectVMEMHazard(MI);
1299 }
1300 fixVALUPartialForwardingHazard(MI);
1301 fixVALUTransUseHazard(MI);
1302 fixVALUTransCoexecutionHazards(MI);
1303 fixWMMAHazards(MI); // fall-through if co-execution is enabled.
1304 fixWMMACoexecutionHazards(MI);
1305 fixShift64HighRegBug(MI);
1306 fixVALUMaskWriteHazard(MI);
1307 fixRequiredExportPriority(MI);
1308 if (ST.requiresWaitIdleBeforeGetReg())
1309 fixGetRegWaitIdle(MI);
1310 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1311 fixDsAtomicAsyncBarrierArriveB64(MI);
1312 if (ST.hasScratchBaseForwardingHazard())
1313 fixScratchBaseForwardingHazard(MI);
1314 if (ST.setRegModeNeedsVNOPs())
1315 fixSetRegMode(MI);
1316}
1317
/// \returns true if \p MI is a compare that writes EXEC, i.e. a VOPC
/// instruction, or a VOP3/SDWA-encoded compare, modifying EXEC (a
/// V_CMPX-style instruction).
1319 const MachineInstr &MI) {
1320 return (TII.isVOPC(MI) ||
1321 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1322 MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1323}
1324
/// Work around the V_CMPX-writes-EXEC followed by V_PERMLANE* hazard by
/// inserting a V_MOV_B32 of the permlane's src0 before \p MI (a plain V_NOP
/// would be discarded by the SQ, see below).
1325bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1326 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1327 return false;
1328
1329 const SIInstrInfo *TII = ST.getInstrInfo();
1330 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1331 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1332 return isVCmpXWritesExec(*TII, *TRI, MI);
1333 };
1334
 // Any other VALU instruction in between already breaks the hazard.
1335 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1336 unsigned Opc = MI.getOpcode();
1337 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1338 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1339 };
1340
1341 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1342 std::numeric_limits<int>::max())
1343 return false;
1344
1345 // V_NOP will be discarded by SQ.
1346 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1347 // which is always a VGPR and available.
1348 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1349 Register Reg = Src0->getReg();
1350 bool IsUndef = Src0->isUndef();
1351 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1352 TII->get(AMDGPU::V_MOV_B32_e32))
1355
1356 return true;
1357}
1358
/// If an in-flight VMEM-type instruction still reads a register that \p MI
/// defines, insert an S_WAITCNT_DEPCTR before \p MI to clear the hazard.
1359bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1360 if (!ST.hasVMEMtoScalarWriteHazard())
1361 return false;
1362 assert(!ST.hasExtendedWaitCounts());
1363
1365 return false;
1366
1367 if (MI->getNumDefs() == 0)
1368 return false;
1369
1370 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1371
 // Hazard: a prior instruction uses one of MI's def registers.
1372 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1374 return false;
1375
1376 for (const MachineOperand &Def : MI->defs()) {
1377 const MachineOperand *Op =
1378 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1379 if (!Op)
1380 continue;
1381 return true;
1382 }
1383 return false;
1384 };
1385
 // A VALU, a zero s_waitcnt, or a depctr wait with vm_vsrc == 0 already
 // resolves the hazard.
1386 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1387 return SIInstrInfo::isVALU(MI) ||
1388 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1389 !MI.getOperand(0).getImm()) ||
1390 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1391 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1392 };
1393
1394 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1395 std::numeric_limits<int>::max())
1396 return false;
1397
1398 const SIInstrInfo *TII = ST.getInstrInfo();
1399 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1400 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1402 return true;
1403}
1404
/// Mitigate the SMEM-to-vector-write hazard: if an in-flight SMRD reads the
/// SGPR this VALU writes, insert "s_mov_b32 null, 0" before \p MI.
1405bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1406 if (!ST.hasSMEMtoVectorWriteHazard())
1407 return false;
1408 assert(!ST.hasExtendedWaitCounts());
1409
1410 if (!SIInstrInfo::isVALU(*MI))
1411 return false;
1412
 // readlane/readfirstlane write their scalar result via vdst; everything
 // else uses sdst.
1413 AMDGPU::OpName SDSTName;
1414 switch (MI->getOpcode()) {
1415 case AMDGPU::V_READLANE_B32:
1416 case AMDGPU::V_READFIRSTLANE_B32:
1417 SDSTName = AMDGPU::OpName::vdst;
1418 break;
1419 default:
1420 SDSTName = AMDGPU::OpName::sdst;
1421 break;
1422 }
1423
1424 const SIInstrInfo *TII = ST.getInstrInfo();
1425 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1426 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1427 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1428 if (!SDST) {
 // Fall back to the first implicit SGPR def, if any.
1429 for (const auto &MO : MI->implicit_operands()) {
1430 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1431 SDST = &MO;
1432 break;
1433 }
1434 }
1435 }
1436
1437 if (!SDST)
1438 return false;
1439
1440 const Register SDSTReg = SDST->getReg();
1441 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1442 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1443 };
1444
1445 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1446 if (TII->isSALU(MI)) {
1447 switch (MI.getOpcode()) {
1448 case AMDGPU::S_SETVSKIP:
1449 case AMDGPU::S_VERSION:
1450 case AMDGPU::S_WAITCNT_VSCNT:
1451 case AMDGPU::S_WAITCNT_VMCNT:
1452 case AMDGPU::S_WAITCNT_EXPCNT:
1453 // These instructions cannot mitigate the hazard.
1454 return false;
1455 case AMDGPU::S_WAITCNT_LGKMCNT:
1456 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1457 return (MI.getOperand(1).getImm() == 0) &&
1458 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1459 case AMDGPU::S_WAITCNT: {
1460 const int64_t Imm = MI.getOperand(0).getImm();
1461 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1462 // DsCnt corresponds to LGKMCnt here.
1463 return Decoded.get(AMDGPU::DS_CNT) == 0;
1464 }
1465 default:
1466 assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
1467 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1468 "unexpected wait count instruction");
1469 // SOPP instructions cannot mitigate the hazard.
1470 if (TII->isSOPP(MI))
1471 return false;
1472 // At this point the SALU can be assumed to mitigate the hazard
1473 // because either:
1474 // (a) it is independent of the at risk SMEM (breaking chain),
1475 // or
1476 // (b) it is dependent on the SMEM, in which case an appropriate
1477 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1478 // SMEM instruction.
1479 return true;
1480 }
1481 }
1482 return false;
1483 };
1484
1485 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1486 std::numeric_limits<int>::max())
1487 return false;
1488
1489 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1490 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1491 .addImm(0);
1492 return true;
1493}
1494
/// Mitigate the V_CMPX EXEC WAR hazard: \p MI writes EXEC while an earlier
/// VALU still reads it; insert an S_WAITCNT_DEPCTR before \p MI.
1495bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1496 if (!ST.hasVcmpxExecWARHazard())
1497 return false;
1498 assert(!ST.hasExtendedWaitCounts());
1499
1500 if (!SIInstrInfo::isVALU(*MI))
1501 return false;
1502
1503 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1504 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1505 return false;
1506
1507 auto IsHazardFn = [TRI](const MachineInstr &I) {
1509 return false;
1510 return I.readsRegister(AMDGPU::EXEC, TRI);
1511 };
1512
1513 const SIInstrInfo *TII = ST.getInstrInfo();
 // A VALU with an SGPR def (explicit sdst or implicit), or a depctr wait
 // with sa_sdst == 0, already resolves the hazard.
1514 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1515 if (SIInstrInfo::isVALU(MI)) {
1516 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1517 return true;
1518 for (auto MO : MI.implicit_operands())
1519 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1520 return true;
1521 }
1522 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1523 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1524 return true;
1525 return false;
1526 };
1527
1528 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1529 std::numeric_limits<int>::max())
1530 return false;
1531
1532 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1533 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1535 return true;
1536}
1537
/// Precompute whether the LDS-branch-VMEM WAR hazard fixup needs to run:
/// the hazard can only occur when both LDS and VMEM instructions appear in
/// the function.
1539 const GCNSubtarget &ST) {
1540 if (!ST.hasLdsBranchVmemWARHazard())
1541 return false;
1542
1543 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1544 // instructions need to appear in the same function.
1545 bool HasLds = false;
1546 bool HasVmem = false;
1547 for (auto &MBB : MF) {
1548 for (auto &MI : MBB) {
1550 HasVmem |= SIInstrInfo::isVMEM(MI);
1551 if (HasLds && HasVmem)
1552 return true;
1553 }
1554 }
1555 return false;
1556}
1557
/// \returns true if \p I is "s_waitcnt_vscnt null, 0", i.e. a wait for the
/// store counter to reach zero.
1559 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1560 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1561 !I.getOperand(1).getImm();
1562}
1563
/// Fix the LDS/VMEM WAR-across-branch hazard by inserting
/// "s_waitcnt_vscnt null, 0" before \p MI when a branch separates \p MI from
/// an instruction of the other hazard type (per IsHazardInst below).
1564bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1565 if (!RunLdsBranchVmemWARHazardFixup)
1566 return false;
1567
1568 assert(ST.hasLdsBranchVmemWARHazard());
1569 assert(!ST.hasExtendedWaitCounts());
1570
 // Classify the instruction into one of the two hazard classes (1 or 2),
 // or 0 if it participates in neither.
1571 auto IsHazardInst = [](const MachineInstr &MI) {
1573 return 1;
1575 return 2;
1576 return 0;
1577 };
1578
1579 auto InstType = IsHazardInst(*MI);
1580 if (!InstType)
1581 return false;
1582
1583 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1584 return IsHazardInst(I) || isStoreCountWaitZero(I);
1585 };
1586
 // A hazard exists if, before a branch, there is an instruction of the
 // opposite class with no mitigating wait in between.
1587 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1588 if (!I.isBranch())
1589 return false;
1590
1591 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1592 auto InstType2 = IsHazardInst(I);
1593 return InstType2 && InstType != InstType2;
1594 };
1595
1596 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1597 auto InstType2 = IsHazardInst(I);
1598 if (InstType == InstType2)
1599 return true;
1600
1601 return isStoreCountWaitZero(I);
1602 };
1603
1604 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1605 std::numeric_limits<int>::max();
1606 };
1607
1608 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1609 std::numeric_limits<int>::max())
1610 return false;
1611
1612 const SIInstrInfo *TII = ST.getInstrInfo();
1613 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1614 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1615 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1616 .addImm(0);
1617
1618 return true;
1619}
1620
/// Compute and set the waitvdst operand of an LDSDIR instruction so it waits
/// for prior VALUs that read or write its VDST register (covers both WAR and
/// WAW).
1621bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1623 return false;
1624
1625 const int NoHazardWaitStates = 15;
1626 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1627 const Register VDSTReg = VDST->getReg();
1628
1629 bool VisitedTrans = false;
1630 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1631 if (!SIInstrInfo::isVALU(I))
1632 return false;
1633 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1634 // Cover both WAR and WAW
1635 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1636 };
1637 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1638 if (WaitStates >= NoHazardWaitStates)
1639 return true;
1640 // Instructions which cause va_vdst==0 expire hazard
1643 };
 // Only VALU instructions count toward the wait-state distance.
1644 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1645 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1646 };
1647
1648 DenseSet<const MachineBasicBlock *> Visited;
1649 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1650 std::next(MI->getReverseIterator()), 0,
1651 IsExpiredFn, Visited, GetWaitStatesFn);
1652
1653 // Transcendentals can execute in parallel to other VALUs.
1654 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1655 if (VisitedTrans)
1656 Count = 0;
1657
1658 MachineOperand *WaitVdstOp =
1659 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1660 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1661
1662 return true;
1663}
1664
/// Guard an LDSDIR's VDST against in-flight accesses of the same register:
/// either set the waitvsrc operand to 0 (when the target supports it) or
/// insert an S_WAITCNT_DEPCTR before \p MI.
1665bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1667 return false;
1668
1669 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1670 const Register VDSTReg = VDST->getReg();
1671
1672 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1674 return false;
1675 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1676 };
1677 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1678 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1679 // according to the type of VMEM instruction.
1680 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1682 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1683 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1684 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1685 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1686 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1687 };
1688
1689 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1690 std::numeric_limits<int>::max())
1691 return false;
1692
1693 if (LdsdirCanWait) {
1694 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1695 } else {
1696 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1697 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1699 }
1700
1701 return true;
1702}
1703
/// Detect the VALU partial-forwarding hazard pattern documented below and
/// insert an S_WAITCNT_DEPCTR before \p MI when it is found.
1704bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1705 if (!ST.hasVALUPartialForwardingHazard())
1706 return false;
1707 assert(!ST.hasExtendedWaitCounts());
1708
1709 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1710 return false;
1711
1712 SmallSetVector<Register, 4> SrcVGPRs;
1713
1714 for (const MachineOperand &Use : MI->explicit_uses()) {
1715 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1716 SrcVGPRs.insert(Use.getReg());
1717 }
1718
1719 // Only applies with >= 2 unique VGPR sources
1720 if (SrcVGPRs.size() <= 1)
1721 return false;
1722
1723 // Look for the following pattern:
1724 // Va <- VALU [PreExecPos]
1725 // intv1
1726 // Exec <- SALU [ExecPos]
1727 // intv2
1728 // Vb <- VALU [PostExecPos]
1729 // intv3
1730 // MI Va, Vb (WaitState = 0)
1731 //
1732 // Where:
1733 // intv1 + intv2 <= 2 VALUs
1734 // intv3 <= 4 VALUs
1735 //
1736 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1737
1738 const int Intv1plus2MaxVALUs = 2;
1739 const int Intv3MaxVALUs = 4;
1740 const int IntvMaxVALUs = 6;
1741 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1742
 // Search state carried across the backwards walk (hashable so visited
 // states can be pruned).
1743 struct StateType {
1744 SmallDenseMap<Register, int, 4> DefPos;
1745 int ExecPos = std::numeric_limits<int>::max();
1746 int VALUs = 0;
1747
1748 static unsigned getHashValue(const StateType &State) {
1749 return hash_combine(State.ExecPos, State.VALUs,
1750 hash_combine_range(State.DefPos));
1751 }
1752 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1753 return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
1754 LHS.VALUs == RHS.VALUs;
1755 }
1756 };
1757
1758 StateType State;
1759
1760 // This overloads expiry testing with all the hazard detection
1761 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1762 // Too many VALU states have passed
1763 if (State.VALUs > NoHazardVALUWaitStates)
1764 return HazardExpired;
1765
1766 // Instructions which cause va_vdst==0 expire hazard
1769 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1770 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1771 return HazardExpired;
1772
1773 // Track registers writes
1774 bool Changed = false;
1775 if (SIInstrInfo::isVALU(I)) {
1776 for (Register Src : SrcVGPRs) {
1777 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1778 State.DefPos[Src] = State.VALUs;
1779 Changed = true;
1780 }
1781 }
1782 } else if (SIInstrInfo::isSALU(I)) {
1783 if (State.ExecPos == std::numeric_limits<int>::max()) {
1784 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1785 State.ExecPos = State.VALUs;
1786 Changed = true;
1787 }
1788 }
1789 }
1790
1791 // Early expiration: too many VALUs in intv3
1792 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1793 return HazardExpired;
1794
1795 // Only evaluate state if something changed
1796 if (!Changed)
1797 return NoHazardFound;
1798
1799 // Determine positions of VALUs pre/post exec change
1800 if (State.ExecPos == std::numeric_limits<int>::max())
1801 return NoHazardFound;
1802
1803 int PreExecPos = std::numeric_limits<int>::max();
1804 int PostExecPos = std::numeric_limits<int>::max();
1805
1806 for (auto Entry : State.DefPos) {
1807 int DefVALUs = Entry.second;
1808 if (DefVALUs != std::numeric_limits<int>::max()) {
1809 if (DefVALUs >= State.ExecPos)
1810 PreExecPos = std::min(PreExecPos, DefVALUs);
1811 else
1812 PostExecPos = std::min(PostExecPos, DefVALUs);
1813 }
1814 }
1815
1816 // Need a VALU post exec change
1817 if (PostExecPos == std::numeric_limits<int>::max())
1818 return NoHazardFound;
1819
1820 // Too many VALUs in intv3?
1821 int Intv3VALUs = PostExecPos;
1822 if (Intv3VALUs > Intv3MaxVALUs)
1823 return HazardExpired;
1824
1825 // Too many VALUs in intv2?
1826 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1827 if (Intv2VALUs > Intv1plus2MaxVALUs)
1828 return HazardExpired;
1829
1830 // Need a VALU pre exec change
1831 if (PreExecPos == std::numeric_limits<int>::max())
1832 return NoHazardFound;
1833
1834 // Too many VALUs in intv1?
1835 int Intv1VALUs = PreExecPos - State.ExecPos;
1836 if (Intv1VALUs > Intv1plus2MaxVALUs)
1837 return HazardExpired;
1838
1839 // Too many VALUs in intv1 + intv2
1840 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1841 return HazardExpired;
1842
1843 return HazardFound;
1844 };
1845 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1847 State.VALUs += 1;
1848 };
1849
1850 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1851 std::next(MI->getReverseIterator())))
1852 return false;
1853
1854 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1855 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1857
1858 return true;
1859}
1860
/// Detect a TRANS result feeding \p MI within the hazardous interval
/// (documented below) and insert an S_WAITCNT_DEPCTR wait on the va_vdst
/// counter when found.
1861bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1862 if (!ST.hasVALUTransUseHazard())
1863 return false;
1864 assert(!ST.hasExtendedWaitCounts());
1865
1866 if (!SIInstrInfo::isVALU(*MI))
1867 return false;
1868
1869 SmallSet<Register, 4> SrcVGPRs;
1870
1871 for (const MachineOperand &Use : MI->explicit_uses()) {
1872 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1873 SrcVGPRs.insert(Use.getReg());
1874 }
1875
1876 // Look for the following pattern:
1877 // Va <- TRANS VALU
1878 // intv
1879 // MI Va (WaitState = 0)
1880 //
1881 // Where:
1882 // intv <= 5 VALUs / 1 TRANS
1883 //
1884 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1885
1886 const int IntvMaxVALUs = 5;
1887 const int IntvMaxTRANS = 1;
1888
 // Search state: how many VALU/TRANS instructions have been walked past.
1889 struct StateType {
1890 int VALUs = 0;
1891 int TRANS = 0;
1892
1893 static unsigned getHashValue(const StateType &State) {
1894 return hash_combine(State.VALUs, State.TRANS);
1895 }
1896 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1897 return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
1898 }
1899 };
1900
1901 StateType State;
1902
1903 // This overloads expiry testing with all the hazard detection
1904 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1905 // Too many VALU states have passed
1906 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1907 return HazardExpired;
1908
1909 // Instructions which cause va_vdst==0 expire hazard
1912 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1913 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1914 return HazardExpired;
1915
1916 // Track registers writes
1917 if (SIInstrInfo::isTRANS(I)) {
1918 for (Register Src : SrcVGPRs) {
1919 if (I.modifiesRegister(Src, &TRI)) {
1920 return HazardFound;
1921 }
1922 }
1923 }
1924
1925 return NoHazardFound;
1926 };
1927 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1929 State.VALUs += 1;
1931 State.TRANS += 1;
1932 };
1933
1934 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1935 std::next(MI->getReverseIterator())))
1936 return false;
1937
1938 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1939 // avoided.
1940 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1941 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1943
1944 return true;
1945}
1946
/// On targets with VALU/TRANS co-execution (GFX1250), insert a V_NOP between
/// a TRANS instruction and a dependent (RAW or WAR) VALU instruction \p MI.
1947bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
1948 if (!ST.hasGFX1250Insts() || // Coexecution disabled.
1950 return false;
1951
1952 const SIInstrInfo *TII = ST.getInstrInfo();
1953 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1954
1955 auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1956 if (!SIInstrInfo::isTRANS(I))
1957 return false;
1958
1959 // RAW: Trans(I) writes, VALU(MI) reads.
1960 Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1961 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
1962 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
1963 return true;
1964 }
1965
1966 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
1967 if (!ValuDst || !ValuDst->isReg())
1968 return false;
1969
1970 // WAR: Trans(I) reads, VALU(MI) writes.
1971 Register ValuDef = ValuDst->getReg();
1972 for (const MachineOperand &TransUse : I.explicit_uses()) {
1973 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
1974 return true;
1975 }
1976
1977 return false;
1978 };
1979
 // Any intervening VALU already breaks the co-execution hazard.
1980 auto IsExpiredFn = [](const MachineInstr &I, int) {
1981 return SIInstrInfo::isVALU(I);
1982 };
1983
1984 const int HasVALU = std::numeric_limits<int>::max();
1985 if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
1986 return false;
1987
1988 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1989 return true;
1990}
1991
/// Insert a V_NOP between two WMMA/SWMMAC instructions when the second one
/// reads (via src0/src1, or the SWMMAC index register on GFX12+) the first
/// one's destination.
1992bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1994 return false;
1995
1996 const SIInstrInfo *TII = ST.getInstrInfo();
1997 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1998
1999 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
2001 return false;
2002
2003 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
2004 // with the dest(matrix D) of the previous wmma.
2005 const Register CurSrc0Reg =
2006 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
2007 const Register CurSrc1Reg =
2008 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
2009
2010 const Register PrevDstReg =
2011 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2012
2013 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2014 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2015 return true;
2016 }
2017
2018 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
2019 // but Index can't overlap with PrevDstReg.
2020 if (AMDGPU::isGFX12Plus(ST)) {
2021 if (SIInstrInfo::isSWMMAC(*MI)) {
2022 const Register CurIndex =
2023 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2024 if (TRI->regsOverlap(PrevDstReg, CurIndex))
2025 return true;
2026 }
2027 return false;
2028 }
2029
2030 return false;
2031 };
2032
 // Any intervening VALU resolves the hazard.
2033 auto IsExpiredFn = [](const MachineInstr &I, int) {
2034 return SIInstrInfo::isVALU(I);
2035 };
2036
2037 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2038 std::numeric_limits<int>::max())
2039 return false;
2040
2041 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2042
2043 return true;
2044}
2045
2050
2052 const SIInstrInfo *TII, unsigned Latency,
2053 unsigned Category) {
2054 assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
2055 "Handle me if the xdl wmma instruction latency changes");
2056
2057 switch (Category) {
2058 case 0: // Dense WMMA Instructions:
2059 // WMMA_*F16, WMMA_*BF16
2060 // WMMA_*FP8FP8
2061 // WMMA_*FP8BF8
2062 // WMMA_*BF8FP8
2063 // WMMA_*BF8BF8
2064 // WMMA_*F8F6F4 if SRCA & SRCB != F8
2065 return Latency == 8 && SIInstrInfo::isWMMA(MI);
2066
2067 case 1: // Dense WMMA Instructions:
2068 // WMMA_IU8
2069 // WMMA_IU4
2070 // WMMA_*F8F6F4 if SRCA OR SRCB == F8
2071 return Latency == 16 && SIInstrInfo::isWMMA(MI);
2072
2073 case 2: // Dense SWMMAC Instructions
2074 // SWMMAC_*F16, SWMMAC_*BF16,
2075 // SWMMAC_*FP8FP8
2076 // SWMMAC_*BF8FP8
2077 // SWMMAC_*FP8BF8
2078 // SWMMAC_*BF8BF8
2079 return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
2080
2081 case 3: // Sparse WMMA Instructions:
2082 // SWMMAC_IU8
2083 // SWMMAC_IU4
2084 return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
2085 default:
2086 break;
2087 } // end switch.
2088
2089 return false;
2090}
2091
2092int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
2093 if (!ST.hasGFX1250Insts())
2094 return 0;
2095
2096 const SIInstrInfo *TII = ST.getInstrInfo();
2097 if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
2098 return 0;
2099
2100 // WaitStates here is the number of V_NOPs or unrelated VALU instructions must
2101 // be in between the first WMMA and the second instruction to cover the hazard
2102 // (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
2103 // is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data hazards" for
2104 // numbers, which depends on the category of the first WMMA.
2105 const int WMMAWaitStates[] = {5, 9, 3, 5};
2106 const int VALUWaitStates[] = {4, 8, 2, 4};
2107 unsigned Category = 0;
2108
2109 auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2110 if (!TII->isXDLWMMA(I))
2111 return false;
2112
2113 unsigned Latency = TSchedModel.computeInstrLatency(&I);
2114 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2115 return false;
2116
2117 return hasWMMAToWMMARegOverlap(I, *MI);
2118 };
2119
2120 auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2121 if (!TII->isXDLWMMA(I))
2122 return false;
2123
2124 unsigned Latency = TSchedModel.computeInstrLatency(&I);
2125 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2126 return false;
2127
2128 return hasWMMAToVALURegOverlap(I, *MI);
2129 };
2130
2131 int Limit = 0;
2132
2133 auto GetWaitStatesFn = [](const MachineInstr &I) {
2134 return SIInstrInfo::isVALU(I) ? 1 : 0;
2135 };
2136
2137 int WaitStatesNeeded = -1;
2138 if (TII->isXDLWMMA(*MI)) {
2139 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2140 Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
2141 // 'getWaitStatesSince' returns the number of VALUs in between if hazard
2142 // exists, and INT_MAX if there is no hazard. As a result, a negative
2143 // WaitStatesNeeded here means no hazard, and we will continue to search
2144 // for other categories.
2145 WaitStatesNeeded =
2146 Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);
2147 }
2148 } else { // Must be a co-executable VALU.
2149 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2150 Limit = VALUWaitStates[Category]; // for IsExpiredFn.
2151 // 'getWaitStatesSince' returns the number of VALUs in between if hazard
2152 // exists, and INT_MAX if there is no hazard. As a result, a negative
2153 // WaitStatesNeeded here means no hazard, and we will continue to search
2154 // for other categories.
2155 WaitStatesNeeded =
2156 Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);
2157 }
2158 }
2159
2160 return WaitStatesNeeded;
2161}
2162
2163bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2164 const MachineInstr &WMMA, const MachineInstr &MI) const {
2165 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2166 Register A1 = TII.getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
2167 Register B1 = TII.getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();
2168
2169 // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2170 if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
2171 return true;
2172
2174 Register Idx1 = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2175 if (TRI.regsOverlap(D0, Idx1))
2176 return true;
2177 }
2178 return false;
2179}
2180
2181bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2182 const MachineInstr &WMMA, const MachineInstr &MI) const {
2183 // WMMA writes, VALU reads.
2184 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2185 for (const MachineOperand &ValuUse : MI.explicit_uses()) {
2186 if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
2187 return true;
2188 }
2189
2190 // WMMA reads or writes, VALU writes.
2191 Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
2192 Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
2193 SmallVector<Register, 4> WMMARegs({D0, A0, B0});
2194
2195 if (SIInstrInfo::isSWMMAC(WMMA)) {
2196 Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
2197 WMMARegs.push_back(Idx0);
2198 }
2199
2200 for (const MachineOperand &ValuDef : MI.defs()) {
2201 Register VDstReg = ValuDef.getReg();
2202 for (Register WMMAReg : WMMARegs) {
2203 if (TRI.regsOverlap(VDstReg, WMMAReg))
2204 return true;
2205 }
2206 }
2207 return false;
2208}
2209
2210bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
2211 const MachineInstr &MI) const {
2212 // I is the potential WMMA hazard source, MI is the instruction being checked
2213 // for hazard.
2214 if (!TII.isXDLWMMA(I))
2215 return false;
2216
2217 // Dispatch based on MI type
2218 if (TII.isXDLWMMA(MI))
2219 return hasWMMAToWMMARegOverlap(I, MI);
2221 return hasWMMAToVALURegOverlap(I, MI);
2222
2223 return false;
2224}
2225
2226bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
2227 bool IncludeSubloops) {
2228 // Scan loop for any WMMA that hazards MI.
2229 // TODO: Avoid full loop scan when WMMA is beyond VALU distance.
2230 for (MachineBasicBlock *MBB : L->getBlocks()) {
2231 if (!IncludeSubloops && MLI->getLoopFor(MBB) != L)
2232 continue;
2233 for (MachineInstr &I : *MBB) {
2234 if (&I == MI)
2235 continue;
2236 if (isCoexecutionHazardFor(I, *MI))
2237 return true;
2238 }
2239 }
2240 return false;
2241}
2242
2243bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
2244 int WaitStatesNeeded) {
2245 if (!MLI)
2246 return false;
2247
2248 MachineLoop *L = MLI->getLoopFor(MI->getParent());
2249 if (!L) {
2250 ++NumWMMAHoistingBailed;
2251 return false;
2252 }
2253
2254 // If innermost loop has WMMA hazard, we can't hoist at all
2255 if (hasWMMAHazardInLoop(L, MI)) {
2256 ++NumWMMAHoistingBailed;
2257 return false;
2258 }
2259
2260 // Find outermost loop with no internal hazard
2261 MachineLoop *TargetLoop = L;
2262 while (MachineLoop *Parent = TargetLoop->getParentLoop()) {
2263 if (hasWMMAHazardInLoop(Parent, MI, false))
2264 break; // Parent has hazard in its own blocks, stop here
2265 TargetLoop = Parent; // Safe to hoist further out
2266 }
2267
2268 // Need valid preheader to insert V_NOPs
2269 MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
2270 if (!Preheader) {
2271 ++NumWMMAHoistingBailed;
2272 return false;
2273 }
2274
2275 LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
2276 << " V_NOPs from loop to " << printMBBReference(*Preheader)
2277 << "\n");
2278
2279 emitVNops(*Preheader, Preheader->getFirstTerminator(), WaitStatesNeeded,
2280 /*IsHoisting=*/true);
2281 NumWMMANopsHoisted += WaitStatesNeeded;
2282 return true;
2283}
2284
2285bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
2286 int WaitStatesNeeded = checkWMMACoexecutionHazards(MI);
2287 if (WaitStatesNeeded <= 0)
2288 return false;
2289
2290 if (EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
2291 return true;
2292
2293 emitVNops(*MI->getParent(), MI->getIterator(), WaitStatesNeeded);
2294 return true;
2295}
2296
// Work around the 64-bit shift hardware bug that triggers when the shift
// amount lives in the last VGPR of an allocation block. Rewrites the shift to
// read the amount from a safe register, inserting V_MOV/V_SWAP as needed.
// Returns true if the instruction was modified.
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  // Only the 64-bit reverse shifts are affected.
  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
  if (!Amt->isReg())
    return false;

  Register AmtReg = Amt->getReg();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if this is a last VGPR in the allocation block.
  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  // The workaround touches AmtReg+1, so that register must be free.
  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
    return false;

  assert(ST.needsAlignedVGPRs());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  const DebugLoc &DL = MI->getDebugLoc();
  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);

  // In:
  //
  //  Dst = shiftrev64 Amt, Src1
  //
  // if Dst!=Src1 then avoid the bug with:
  //
  //  Dst.sub0 = Amt
  //  Dst = shift64 Dst.sub0, Src1

  Register DstReg = MI->getOperand(0).getReg();
  if (!Src1->isReg() || Src1->getReg() != DstReg) {
    // Easy case: copy the amount into Dst.sub0 (which is not in the unsafe
    // position) and read it from there.
    Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
    runOnInstruction(
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo).add(*Amt));
    Amt->setReg(DstLo);
    Amt->setIsKill(true);
    return true;
  }

  // Hard case (Dst aliases Src1): swap the amount into a scratch register
  // around the shift. If the shift also writes AmtReg, a full aligned 64-bit
  // pair must be swapped.
  bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);
  Register NewReg;
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    // Pick any register the shift neither reads nor writes.
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
      NewReg = Reg;
      break;
    }
  }

  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
                               : NewReg;
  Register NewAmtLo;

  if (Overlapped)
    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  // Insert a full wait count because found register might be pending a wait.
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
      .addImm(0);

  // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
  if (Overlapped)
    runOnInstruction(
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
            .addDef(AmtReg - 1)
            .addReg(AmtReg - 1, RegState::Undef)
            .addReg(NewAmtLo, RegState::Undef));
  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
                       .addDef(AmtReg)
                       .addReg(AmtReg, RegState::Undef)
                       .addReg(NewAmt, RegState::Undef));

  // Instructions emitted after the current instruction will be processed by the
  // parent loop of the hazard recognizer in a natural way.
  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
          AmtReg)
      .addDef(NewAmt)
      .addReg(NewAmt)
      .addReg(AmtReg);
  if (Overlapped)
    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
            AmtReg - 1)
        .addDef(NewAmtLo)
        .addReg(NewAmtLo)
        .addReg(AmtReg - 1);

  // Re-running hazard recognizer on the modified instruction is not necessary,
  // inserted V_SWAP_B32 has already both read and write new registers so
  // hazards related to these register has already been handled.
  Amt->setReg(NewAmt);
  Amt->setIsKill(false);
  // We do not update liveness, so verifier may see it as undef.
  Amt->setIsUndef();
  if (Overlapped) {
    MI->getOperand(0).setReg(NewReg);
    Src1->setReg(NewReg);
    Src1->setIsKill(false);
    Src1->setIsUndef();
  }

  return true;
}
2413
2414int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) const {
2415 int NSAtoVMEMWaitStates = 1;
2416
2417 if (!ST.hasNSAtoVMEMBug())
2418 return 0;
2419
2421 return 0;
2422
2423 const SIInstrInfo *TII = ST.getInstrInfo();
2424 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2425 if (!Offset || (Offset->getImm() & 6) == 0)
2426 return 0;
2427
2428 auto IsHazardFn = [TII](const MachineInstr &I) {
2429 if (!SIInstrInfo::isMIMG(I))
2430 return false;
2431 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2432 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2433 TII->getInstSizeInBytes(I) >= 16;
2434 };
2435
2436 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2437}
2438
2439int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2440 MachineInstr *MI) const {
2441 int FPAtomicToDenormModeWaitStates = 3;
2442
2443 if (!ST.hasFPAtomicToDenormModeHazard())
2444 return 0;
2445 assert(!ST.hasExtendedWaitCounts());
2446
2447 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2448 return 0;
2449
2450 auto IsHazardFn = [](const MachineInstr &I) {
2451 if (!SIInstrInfo::isVMEM(I))
2452 return false;
2453 return SIInstrInfo::isFPAtomic(I);
2454 };
2455
2456 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2457 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2458 return true;
2459
2460 return SIInstrInfo::isWaitcnt(MI.getOpcode());
2461 };
2462
2463 return FPAtomicToDenormModeWaitStates -
2464 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2465}
2466
2467int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) const {
2469
2470 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2471}
2472
2473int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) const {
2474 // Early exit if no padding is requested.
2475 if (MFMAPaddingRatio == 0)
2476 return 0;
2477
2478 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2479 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2480 return 0;
2481
2482 int NeighborMFMALatency = 0;
2483 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2484 this](const MachineInstr &MI) {
2485 if (!SIInstrInfo::isMFMA(MI))
2486 return false;
2487
2488 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2489 return true;
2490 };
2491
2492 const int MaxMFMAPipelineWaitStates = 16;
2493 int WaitStatesSinceNeighborMFMA =
2494 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2495
2496 int NeighborMFMAPaddingNeeded =
2497 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2498 WaitStatesSinceNeighborMFMA;
2499
2500 return std::max(0, NeighborMFMAPaddingNeeded);
2501}
2502
// gfx908 MAI hazard check: returns the number of wait states needed before
// MI (an MFMA or v_accvgpr read/write) given recent VALU/MFMA/accvgpr
// activity. Wait-state constants encode the gfx908 hardware tables.
int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) const {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  // Inline asm is conservatively treated as a VALU producer.
  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    // A VALU writing EXEC shortly before requires up to 4 wait states.
    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
        getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      // A VALU writing any VGPR source requires up to 2 wait states.
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
            getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  // AGPR operand hazards: producers are MFMAs or v_accvgpr_write overlapping
  // the operand register.
  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    // Match an earlier MFMA whose dst overlaps (but does not equal) Reg;
    // record the worst producer latency seen on the way.
    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      // Producer latency (2/8/16 passes) selects the table row.
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
        break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
        break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
        break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
        break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
        break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
        break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    // Second producer kind: v_accvgpr_write defining this register.
    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
        getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  // v_accvgpr_write after an MFMA that reads SrcC overlapping the dst.
  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
      break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
      break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
2666
// Wait states for an XDL MFMA writing a VGPR that overlaps the SrcC of a
// following XDL consumer.
// NOTE(review): the function-name line was lost in extraction and
// reconstructed from checkMAIHazards90A() call sites; confirm upstream.
static int
GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // xdl def cycles | gfx940 | gfx950
  // 2 pass | 3 4
  // 4 pass | 5 6
  // 8 pass | 9 10
  // 16 pass | 17 18
  return NumPasses + 1 + IsGFX950;
}
2677
// Wait states for an XDL MFMA writing a VGPR that overlaps the SrcC of a
// following non-XDL (SGEMM/DGEMM) consumer.
// NOTE(review): the function-name line was lost in extraction and
// reconstructed from checkMAIHazards90A() call sites; confirm upstream.
static int
GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // xdl def cycles | gfx940 | gfx950
  // 2 pass | 3 3
  // 4 pass | 5 6
  // 8 pass | 9 10
  // 16 pass | 17 18
  return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
}
2688
// Wait states for an SMFMA writing a VGPR that overlaps the SrcC of a
// following SMFMA consumer.
// NOTE(review): the function-name line was lost in extraction and
// reconstructed from checkMAIHazards90A() call sites; confirm upstream.
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  // 2 pass -> 2
  // 4 pass -> 4
  // 8 pass -> 8
  // 16 pass -> 16
  return NumPasses;
}
2697
// Wait states for an SMFMA writing a VGPR that overlaps the SrcA/SrcB of a
// following MFMA consumer.
// NOTE(review): the function-name line was lost in extraction and
// reconstructed from checkMAIHazards90A() call sites; confirm upstream.
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
2706
// Wait states for an XDL MFMA writing a VGPR that overlaps the SrcA/SrcB of a
// following MFMA consumer.
// NOTE(review): the function-name line was lost in extraction and
// reconstructed from checkMAIHazards90A() call sites; confirm upstream.
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
                                                                bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass | 5 5
  // 4 pass | 7 8
  // 8 pass | 11 12
  // 16 pass | 19 20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
2716
2717int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) const {
2718 int WaitStatesNeeded = 0;
2719 unsigned Opc = MI->getOpcode();
2720
2721 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2723 };
2724
2725 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2728 };
2729
2730 if (!SIInstrInfo::isMFMA(*MI))
2731 return WaitStatesNeeded;
2732
2733 const int VALUWritesExecWaitStates = 4;
2734 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2735 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2736 VALUWritesExecWaitStates);
2737 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2738
2739 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2740
2741 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2742 for (const MachineOperand &Use : MI->explicit_uses()) {
2743 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2744 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2745 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2746 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2747 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2748 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2749 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2750 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2751 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2752 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2753 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2754 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2755 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2756 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2757 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2758 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2759 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2760 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2761 const int MaxWaitStates = 19;
2762
2763 if (!Use.isReg())
2764 continue;
2765 Register Reg = Use.getReg();
2766 bool FullReg;
2767 const MachineInstr *MI1;
2768
2769 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2770 this](const MachineInstr &MI) {
2771 if (!SIInstrInfo::isMFMA(MI))
2772 return false;
2773 Register DstReg = MI.getOperand(0).getReg();
2774 FullReg = (DstReg == Reg);
2775 MI1 = &MI;
2776 return TRI.regsOverlap(DstReg, Reg);
2777 };
2778
2779 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2780 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2781 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2782
2783 int NumWaitStates =
2784 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2785 if (NumWaitStates == std::numeric_limits<int>::max())
2786 continue;
2787
2788 int OpNo = Use.getOperandNo();
2789 unsigned Opc1 = MI1->getOpcode();
2790 int NeedWaitStates = 0;
2791 if (OpNo == SrcCIdx) {
2792 if (!SIInstrInfo::isDGEMM(Opc) &&
2793 (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opc1))) {
2794 NeedWaitStates = 0;
2795 } else if (FullReg) {
2796 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2797 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2798 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2799 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2800 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2801 else if (ST.hasGFX940Insts() &&
2802 TSchedModel.computeInstrLatency(MI1) == 2)
2803 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2804 } else {
2805 switch (Opc1) {
2806 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2807 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2808 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2809 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2810 if (!TII.isXDL(*MI))
2811 NeedWaitStates =
2812 ST.hasGFX950Insts()
2813 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2814 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2815 break;
2816 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2817 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2818 if (!TII.isXDL(*MI))
2819 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2820 break;
2821 default:
2822 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2823 if (ST.hasGFX940Insts()) {
2824 if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
2825 break;
2826
2827 NeedWaitStates =
2828 TII.isXDL(*MI1)
2829 ? (TII.isXDL(*MI)
2831 NumPasses, ST.hasGFX950Insts())
2833 NumPasses, ST.hasGFX950Insts()))
2835 NumPasses);
2836 break;
2837 }
2838
2839 switch (NumPasses) {
2840 case 2:
2841 NeedWaitStates =
2843 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2844 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2845 break;
2846 case 8:
2847 NeedWaitStates =
2849 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2850 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2851 break;
2852 case 16:
2853 NeedWaitStates =
2855 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2856 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2857 break;
2858 default:
2859 llvm_unreachable("unexpected number of passes");
2860 }
2861 }
2862 }
2863 } else {
2864 switch (Opc1) {
2865 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2866 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2867 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2868 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2869 NeedWaitStates =
2870 ST.hasGFX950Insts()
2871 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2872 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2873 break;
2874 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2875 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2876 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2877 break;
2878 default:
2879 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2880
2881 if (ST.hasGFX940Insts()) {
2882 NeedWaitStates =
2883 TII.isXDL(*MI1)
2885 NumPasses, ST.hasGFX950Insts())
2887 NumPasses);
2888 break;
2889 }
2890
2891 switch (NumPasses) {
2892 case 2:
2893 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2894 break;
2895 case 4:
2896 llvm_unreachable("unexpected number of passes for mfma");
2897 case 8:
2898 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2899 break;
2900 case 16:
2901 default:
2902 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2903 }
2904 }
2905 }
2906 if (WaitStatesNeeded >= NeedWaitStates)
2907 continue;
2908
2909 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2910 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2911
2912 if (WaitStatesNeeded == MaxWaitStates)
2913 break;
2914 }
2915
2916 // Pad neighboring MFMA with noops for better inter-wave performance.
2917 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2918
2919 return WaitStatesNeeded;
2920}
2921
2922int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) const {
2923 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2924 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2925 return 0;
2926
2927 int WaitStatesNeeded = 0;
2928
2929 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2930 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2931 };
2932
2933 for (const MachineOperand &Op : MI->explicit_uses()) {
2934 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2935 continue;
2936
2937 Register Reg = Op.getReg();
2938
2939 const int AccVgprReadLdStWaitStates = 2;
2940 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2941 const int MaxWaitStates = 2;
2942
2943 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2944 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2945 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2946
2947 if (WaitStatesNeeded == MaxWaitStates)
2948 return WaitStatesNeeded; // Early exit.
2949
2950 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2951 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2952 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2953 return false;
2954 auto IsVALUFn = [](const MachineInstr &MI) {
2956 };
2957 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2958 std::numeric_limits<int>::max();
2959 };
2960
2961 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2962 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2963 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2964 }
2965
2966 return WaitStatesNeeded;
2967}
2968
2969int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) const {
2970 assert(!ST.hasVcmpxPermlaneHazard() &&
2971 "this is a different vcmpx+permlane hazard");
2972 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2973 const SIInstrInfo *TII = ST.getInstrInfo();
2974
2975 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2976 return isVCmpXWritesExec(*TII, *TRI, MI);
2977 };
2978
2979 auto IsVALUFn = [](const MachineInstr &MI) {
2980 return SIInstrInfo::isVALU(MI);
2981 };
2982
2983 const int VCmpXWritesExecWaitStates = 4;
2984 const int VALUWritesVDstWaitStates = 2;
2985 int WaitStatesNeeded = 0;
2986
2987 for (const MachineOperand &Op : MI->explicit_uses()) {
2988 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2989 continue;
2990 Register Reg = Op.getReg();
2991
2992 int WaitStatesSinceDef =
2993 VALUWritesVDstWaitStates -
2994 getWaitStatesSinceDef(Reg, IsVALUFn,
2995 /*MaxWaitStates=*/VALUWritesVDstWaitStates);
2996 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2997 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2998 break;
2999 }
3000
3001 int VCmpXHazardWaits =
3002 VCmpXWritesExecWaitStates -
3003 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
3004
3005 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
3006 return WaitStatesNeeded;
3007}
3008
3010 // 2 pass -> 4
3011 // 4 pass -> 6
3012 // 8 pass -> 10
3013 // 16 pass -> 18
3014 return NumPasses + 2;
3015}
3016
3018 bool IsGFX950) {
3019 // xdl def cycles | gfx942 | gfx950
3020 // 2 pass | 5 5
3021 // 4 pass | 7 8
3022 // 8 pass | 11 12
3023 // 16 pass | 19 20
3024 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
3025}
3026
3028 bool IsGFX950) {
3029 // xdl def cycles | gfx942 | gfx950
3030 // 2 pass | 5 5
3031 // 4 pass | 7 8
3032 // 8 pass | 11 12
3033 // 16 pass | 19 20
3034 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
3035}
3036
3038 // 2 pass -> 4
3039 // 4 pass -> 6
3040 // 8 pass -> 10
3041 // 16 pass -> 18
3042 return NumPasses + 2;
3043}
3044
// Compute the wait states required before \p MI to resolve data hazards
// between MAI producers (MFMA / DOT) and VALU, VMEM, LDS, or export
// consumers on gfx90a and newer. Returns 0 when no wait is needed.
int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) const {
  if (!ST.hasGFX90AInsts())
    return 0;

  // Matches any double-precision GEMM (DGEMM) MFMA.
  auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
    return SIInstrInfo::isDGEMM(MI.getOpcode());
  };

  // This is checked in checkMAIHazards90A()
  if (SIInstrInfo::isMFMA(*MI))
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  int WaitStatesNeeded = 0;

  // Classify the candidate instruction once up front.
  bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
  bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
  bool IsVALU = SIInstrInfo::isVALU(*MI);

  // Matches an MFMA whose dst overlaps Reg; the match is recorded in MFMA
  // so the caller can inspect its latency.
  const MachineInstr *MFMA = nullptr;
  unsigned Reg;
  auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    MFMA = &MI;
    return true;
  };

  // Same idea for DOT instructions; records the match in DOT.
  const MachineInstr *DOT = nullptr;
  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isDOT(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    DOT = &MI;
    return true;
  };

  // Stateful predicate for the reverse walk: remembers whether a DGEMM was
  // seen between MI and the VALU def of the register.
  bool DGEMMAfterVALUWrite = false;
  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
    // Found DGEMM on reverse traversal to def.
    if (SIInstrInfo::isDGEMM(MI.getOpcode()))
      DGEMMAfterVALUWrite = true;

    // Only hazard if register is defined by a VALU and a DGEMM is found
    // after the def.
    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
      return false;

    return true;
  };

  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    // RAW hazards: MI reads a VGPR recently written by an MFMA or DOT.
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int DMFMABetweenVALUWriteVMEMRead = 2;
    const int MaxWaitStates = 19;

    for (const MachineOperand &Use : MI->explicit_uses()) {
      if (!Use.isReg())
        continue;
      Reg = Use.getReg();

      DOT = nullptr;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                     MaxWaitStates);
      if (DOT) {
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          // Same-opcode DOTs only hazard on the A/B sources, not src2 (C).
          if (&Use - &MI->getOperand(0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      }

      // Workaround for HW data hazard bug observed only in GFX90A. When there
      // is a DGEMM instruction in-between a VALU and a VMEM instruction it
      // causes the SQ to incorrectly not insert two wait states between the two
      // instructions needed to avoid data hazard.
      if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
        DGEMMAfterVALUWrite = false;
        if (TRI.isVectorRegister(MRI, Reg)) {
          int WaitStatesNeededForUse =
              DMFMABetweenVALUWriteVMEMRead -
              getWaitStatesSinceDef(Reg, IsDGEMMHazard,
                                    DMFMABetweenVALUWriteVMEMRead);

          WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
        }
      }

      MFMA = nullptr;
      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
      if (!MFMA)
        continue;

      // Latency of the producing MFMA selects the hazard distance below.
      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;

      if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
        switch (HazardDefLatency) {
        case 4:
          NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                         : DMFMA4x4WriteVgprVALUReadWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates =
              IsMemOrExport
                  ? DMFMA16x16WriteVgprMemExpReadWaitStates
                  : (ST.hasGFX950Insts()
                         ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
                         : DMFMA16x16WriteVgprVALUReadWaitStates);
          break;
        default:
          llvm_unreachable("unexpected dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        // gfx940+: wait states are derived from the pass count, with the XDL
        // and SMFMA helper formulas (ternary arms continue onto these lines).
        NeedWaitStates =
            TII.isXDL(*MFMA)
                  NumPasses, ST.hasGFX950Insts())
                  NumPasses);
      } else {
        switch (HazardDefLatency) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      // No hazard check can require more than MaxWaitStates; stop early.
      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
  }

  // A DGEMM followed closely by a double-precision FMA needs a small gap.
  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
      getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

  // WAW and WAR hazards on the defs of MI.
  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    // WAW against a recent DOT of a different opcode.
    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    // WAW against a recent MFMA writing an overlapping register.
    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of cycles for dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        // gfx940+ WAW formula selected by XDL-ness (ternary arms continue
        // onto these lines).
        NeedWaitStates =
            TII.isXDL(*MFMA)
                  NumPasses, ST.hasGFX950Insts())
      } else {
        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
          break;
        default:
          llvm_unreachable("Unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

    // WAR: MI overwrites a register that a recent non-DGEMM MFMA read as
    // its src2 (accumulator C) operand.
    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !TII.isXDL(MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
      break;
    case 4:  assert(ST.hasGFX940Insts());
      NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
      break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
      break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
3341
3343 if (!SU->isInstr())
3344 return false;
3345
3346 const MachineInstr *MAI = nullptr;
3347
3348 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3349 MAI = nullptr;
3351 MAI = &MI;
3352 return MAI != nullptr;
3353 };
3354
3355 MachineInstr *MI = SU->getInstr();
3356 if (IsMFMAFn(*MI)) {
3357 int W = getWaitStatesSince(IsMFMAFn, 16);
3358 if (MAI)
3359 return W < (int)TSchedModel.computeInstrLatency(MAI);
3360 }
3361
3362 return false;
3363}
3364
3365// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3366// insertion of a new instruction.
3367static void updateGetPCBundle(MachineInstr *NewMI) {
3368 if (!NewMI->isBundled())
3369 return;
3370
3371 // Find start of bundle.
3372 auto I = NewMI->getIterator();
3373 while (I->isBundledWithPred())
3374 I--;
3375 if (I->isBundle())
3376 I++;
3377
3378 // Bail if this is not an S_GETPC bundle.
3379 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3380 return;
3381
3382 // Update offsets of any references in the bundle.
3383 const unsigned NewBytes = 4;
3384 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3385 "Unexpected instruction insertion in bundle");
3386 auto NextMI = std::next(NewMI->getIterator());
3387 auto End = NewMI->getParent()->end();
3388 while (NextMI != End && NextMI->isBundledWithPred()) {
3389 for (auto &Operand : NextMI->operands()) {
3390 if (Operand.isGlobal())
3391 Operand.setOffset(Operand.getOffset() + NewBytes);
3392 }
3393 NextMI++;
3394 }
3395}
3396
// Break the VALU mask-write hazard by inserting an S_WAITCNT_DEPCTR after an
// SGPR write in \p MI (wave64 only). Also merges nearby mergable waits into
// the new one. Returns true if the code was changed.
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64())
    return false;

  const bool IsSALU = SIInstrInfo::isSALU(*MI);
  const bool IsVALU = SIInstrInfo::isVALU(*MI);
  if (!IsSALU && !IsVALU)
    return false;

  // The hazard sequence is three instructions:
  //   1. VALU reads SGPR as mask
  //   2. VALU/SALU writes SGPR
  //   3. VALU/SALU reads SGPR
  // The hazard can expire if the distance between 2 and 3 is sufficient,
  // or (2) is VALU and (3) is SALU.
  // In practice this happens <10% of the time, hence always assume the hazard
  // exists if (1) and (2) are present to avoid searching all SGPR reads.

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // SGPRs that can never participate in this hazard.
  auto IgnoreableSGPR = [](const Register Reg) {
    switch (Reg) {
    case AMDGPU::EXEC:
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::M0:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
    case AMDGPU::SCC:
      return true;
    default:
      return false;
    }
  };
  auto IsVCC = [](const Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
  };

  // Search state: the set of 32-bit SGPRs (sub-registers of the hazardous
  // def) that are still being tracked for mask reads.
  struct StateType {
    SmallSet<Register, 2> HazardSGPRs;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine_range(State.HazardSGPRs);
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.HazardSGPRs == RHS.HazardSGPRs;
    }
  };

  SmallVector<const MachineInstr *> WaitInstrs;
  bool HasSGPRRead = false;
  StateType InitialState;

  // Look for SGPR write.
  MachineOperand *HazardDef = nullptr;
  for (MachineOperand &Op : MI->operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef() && HazardDef)
      continue;

    Register Reg = Op.getReg();
    if (IgnoreableSGPR(Reg))
      continue;
    // VCC is tracked even when implicit; other SGPRs only as explicit ops.
    if (!IsVCC(Reg)) {
      if (Op.isImplicit())
        continue;
      if (!TRI->isSGPRReg(MRI, Reg))
        continue;
    }
    // Also check for SGPR reads.
    if (Op.isUse()) {
      HasSGPRRead = true;
      continue;
    }

    assert(!HazardDef);
    HazardDef = &Op;
  }

  if (!HazardDef)
    return false;

  // Setup to track writes to individual SGPRs
  const Register HazardReg = HazardDef->getReg();
  if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
    InitialState.HazardSGPRs.insert(HazardReg);
  } else {
    assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
  }

  // Forward scan: does any later instruction read HazardReg as a mask?
  auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
    if (State.HazardSGPRs.empty())
      return HazardExpired;

    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp: {
      // These implicitly read VCC as mask source.
      return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
    }
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Only check mask register overlaps.
      const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
      assert(SSRCOp);
      bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
      return Result ? HazardFound : NoHazardFound;
    }
    default:
      return NoHazardFound;
    }
  };

  // Baseline depctr field mask a wait must match to be mergable below.
  const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
          0),
      0);
  auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::S_WAITCNT_DEPCTR:
      // Record mergable waits within region of instructions free of SGPR reads.
      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
          (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
        WaitInstrs.push_back(&I);
      break;
    default:
      // Update tracking of SGPR reads and writes.
      for (auto &Op : I.operands()) {
        if (!Op.isReg())
          continue;

        Register Reg = Op.getReg();
        if (IgnoreableSGPR(Reg))
          continue;
        if (!IsVCC(Reg)) {
          if (Op.isImplicit())
            continue;
          if (!TRI->isSGPRReg(MRI, Reg))
            continue;
        }
        if (Op.isUse()) {
          HasSGPRRead = true;
          continue;
        }

        // Stop tracking any SGPRs with writes on the basis that they will
        // already have an appropriate wait inserted afterwards.
        for (Register SGPR : State.HazardSGPRs) {
          if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
            Found.push_back(SGPR);
        }
        for (Register SGPR : Found)
          State.HazardSGPRs.erase(SGPR);
      }
      break;
    }
  };

  // Check for hazard
  if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
                            MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  // Compute counter mask
  unsigned DepCtr =
      IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
                                 : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
             : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);

  // Try to merge previous waits into this one for regions with no SGPR reads.
  if (!WaitInstrs.empty()) {
    // Note: WaitInstrs contains const pointers, so walk backward from MI to
    // obtain a mutable pointer to each instruction to be merged.
    // This is expected to be a very short walk within the same block.
    SmallVector<MachineInstr *> ToErase;
    unsigned Found = 0;
    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
                                             End = MI->getParent()->rend();
         Found < WaitInstrs.size() && It != End; ++It) {
      MachineInstr *WaitMI = &*It;
      // Find next wait instruction.
      if (std::as_const(WaitMI) != WaitInstrs[Found])
        continue;
      Found++;
      // Fold the stricter (smaller) of each depctr field into DepCtr.
      unsigned WaitMask = WaitMI->getOperand(0).getImm();
      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
      ToErase.push_back(WaitMI);
    }
    assert(Found == WaitInstrs.size());
    for (MachineInstr *WaitMI : ToErase)
      WaitMI->eraseFromParent();
  }

  // Add s_waitcnt_depctr after SGPR write.
  auto NextMI = std::next(MI->getIterator());
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(DepCtr);

  // SALU write may be s_getpc in a bundle.
  updateGetPCBundle(NewMI);

  return true;
}
3642
3643static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3644 const SIInstrInfo &TII) {
3645 MachineBasicBlock &EntryMBB = MF->front();
3646 if (EntryMBB.begin() != EntryMBB.end()) {
3647 auto &EntryMI = *EntryMBB.begin();
3648 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3649 EntryMI.getOperand(0).getImm() >= Priority)
3650 return false;
3651 }
3652
3653 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3654 .addImm(Priority);
3655 return true;
3656}
3657
// Workaround for targets where exports must run at a lowered priority:
// adjusts S_SETPRIO instructions and brackets export sequences with
// priority changes, waits, and nops. Returns true if the code was changed.
bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types will never have exports,
  // and avoid adding or adjusting S_SETPRIO.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  auto CC = MF->getFunction().getCallingConv();
  switch (CC) {
    return false;
  default:
    break;
  }

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // Ensure shader with calls raises priority at entry.
    // This ensures correct priority if exports exist in callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise minimum priority unless in workaround.
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    // A setprio to PostExportPriority right after an export is part of the
    // workaround sequence itself; leave it alone.
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // Check entry priority at each export (as there will only be a few).
  // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
  bool Changed = false;
    Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // Only need WA at end of sequence of exports.
    if (TII.isEXP(*NextMI))
      return Changed;
    // Assume appropriate S_SETPRIO after export means WA already applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return Changed;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  // Lower priority.
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  if (!EndOfShader) {
    // Wait for exports to complete.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
  }

  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);

  if (!EndOfShader) {
    // Return to normal (higher) priority.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
        .addImm(NormalPriority);
  }

  return true;
}
3752
// For s_getreg of certain hardware registers, insert an S_WAITCNT_DEPCTR
// with immediate 0 before the read. Returns true if a wait was inserted.
bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
  if (!isSGetReg(MI->getOpcode()))
    return false;

  // Only specific hwreg ids (the non-default cases) need the wait.
  const SIInstrInfo *TII = ST.getInstrInfo();
  switch (getHWReg(TII, *MI)) {
  default:
    return false;
    break;
  }

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0);
  return true;
}
3773
// Bracket DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 with S_WAITCNT_DEPCTR
// instructions (one inserted before, one inserted immediately after).
// Always returns true for the matching opcode.
bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  // Wait placed before the barrier-arrive.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  // Wait placed immediately after the barrier-arrive.
  BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))

  return true;
}
3788
// Detect reads of the flat-scratch base registers shortly after SGPR102/103
// were written, and insert an S_WAITCNT_DEPCTR to break the forwarding
// hazard. Returns true if a wait was inserted.
// NOTE(review): the pairing of SGPR102/103 with the lo/hi base halves is
// taken from the checks below — confirm against the hardware docs.
bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
  // No reason to check this in pre-RA scheduling, SGPRs have to be allocated
  // for hazard to trigger.
  if (!IsHazardRecognizerMode)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
  const int FlatScrBaseWaitStates = 10;

  // Does MI read either half of the flat-scratch base, directly or via
  // s_getreg of the corresponding hwreg?
  bool ReadsFlatScrLo =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
  bool ReadsFlatScrHi =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
  if (isSGetReg(MI->getOpcode())) {
    switch (getHWReg(TII, *MI)) {
    default:
      break;
      ReadsFlatScrLo = true;
      break;
      ReadsFlatScrHi = true;
      break;
    }
  }

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // Returns true if Reg was modified within the hazard window before MI.
  auto IsRegDefHazard = [&](Register Reg) -> bool {
    DenseSet<const MachineBasicBlock *> Visited;
    auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
      return MI.modifiesRegister(Reg, TRI);
    };

    // This literally abuses the idea of waitstates. Instead of waitstates it
    // returns 1 for SGPR written and 0 otherwise.
    auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
      if (!TII->isSALU(MI) && !TII->isVALU(MI))
        return 0;
      for (const MachineOperand &MO : MI.all_defs()) {
        if (TRI->isSGPRReg(MRI, MO.getReg()))
          return 1;
      }
      return 0;
    };

    // The search also expires at a depctr wait covering the hazard.
    auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
      if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
        unsigned Wait = MI.getOperand(0).getImm();
          return true;
      }
      return SgprWrites >= FlatScrBaseWaitStates;
    };

    return ::getWaitStatesSince(
        IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
        0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
  };

  // No wait needed unless a read half is paired with a hazardous recent def
  // of the corresponding SGPR (constant regs cannot have been rewritten).
  if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
       !IsRegDefHazard(AMDGPU::SGPR102)) &&
      (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
       !IsRegDefHazard(AMDGPU::SGPR103)))
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  return true;
}
3864
3865bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3866 if (!isSSetReg(MI->getOpcode()) ||
3867 MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
3868 return false;
3869
3870 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3871 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3872 return true;
3873}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static cl::opt< bool > EnableWMMAVnopHoisting("amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden, cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"))
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand with a Dest sel fowarding issue Dst .
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isLdsDma(const MachineInstr &MI)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static bool IsWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII, unsigned Latency, unsigned Category)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static bool hasHazard(StateT InitialState, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *InitialMBB, MachineBasicBlock::const_reverse_instr_iterator InitialI)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool isCoexecutableVALUInst(const MachineInstr &MI)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden, cl::desc("Insert a s_nop x before every instruction"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static llvm::Error parse(DataExtractor &Data, uint64_t BaseAddr, LineEntryCallback const &Callback)
Definition LineTable.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
unsigned get(InstCounterType T) const
BitVector & set()
Definition BitVector.h:370
A debug info location.
Definition DebugLoc.h:123
std::pair< iterator, bool > insert_as(std::pair< KeyT, ValueT > &&KV, const LookupKeyT &Val)
Alternate version of insert() which allows a different, and possibly less expensive,...
Definition DenseMap.h:274
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
unsigned getHazardWaitStates(MachineInstr *MI) const
Returns the number of wait states until all hazards for MI are resolved.
unsigned PreEmitNoopsCommon(MachineInstr *) const
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be schedule.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
bool ShouldPreferAnother(SUnit *SU) const override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
GCNHazardRecognizer(const MachineFunction &MF, MachineLoopInfo *MLI=nullptr)
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
Instructions::const_reverse_iterator const_reverse_instr_iterator
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
Instructions::iterator instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool isConstantPhysReg(MCRegister PhysReg) const
Returns true if PhysReg is unallocatable and constant throughout the function.
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isWaitcnt(unsigned Opcode)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isMAI(const MCInstrDesc &Desc)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:490
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
A range adaptor for a pair of iterators.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc)
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
@ Entry
Definition COFF.h:862
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
This namespace contains all of the command line option processing machinery.
Definition CommandLine.h:52
initializer< Ty > init(const Ty &Val)
constexpr double e
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Wait
Definition Threading.h:60
constexpr RegState getDeadRegState(bool B)
Op::Description Desc
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
DWARFExpression::Operation Op
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
An information struct used to provide DenseMap with the various necessary components for a given valu...