GCNHazardRecognizer.cpp
1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
21
22using namespace llvm;
23
24namespace {
25
26struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
27 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
28
29 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
30 if (Arg.getAsInteger(0, Value))
31 return O.error("'" + Arg + "' value invalid for uint argument!");
32
33 if (Value > 100)
34 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
35
36 return false;
37 }
38};
39
40} // end anonymous namespace
41
42 static cl::opt<unsigned, false, MFMAPaddingRatioParser>
43 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
44 cl::desc("Fill a percentage of the latency between "
45 "neighboring MFMA with s_nops."));
46
47// This is intended for debugging purposes only.
48 static cl::opt<unsigned>
49 NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
50 cl::desc("Insert a s_nop x before every instruction"));
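// Both of these are ordinary cl::opt flags, so they can be passed to any tool
// that runs the AMDGPU backend. Illustrative invocation (assumed, not part of
// the original source):
//   llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-padding-ratio=50 kernel.ll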
51
52//===----------------------------------------------------------------------===//
53// Hazard Recognizer Implementation
54//===----------------------------------------------------------------------===//
55
56 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
57 const GCNSubtarget &ST);
58
59 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
60 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
61 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
62 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
63 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
64 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
65 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
66}
67
68 void GCNHazardRecognizer::Reset() {
69 EmittedInstrs.clear();
70}
71
72 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
73 EmitInstruction(SU->getInstr());
74 }
75
76 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
77 CurrCycleInstr = MI;
78}
79
80static bool isDivFMas(unsigned Opcode) {
81 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
82}
83
84static bool isSGetReg(unsigned Opcode) {
85 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
86}
87
88static bool isSSetReg(unsigned Opcode) {
89 switch (Opcode) {
90 case AMDGPU::S_SETREG_B32:
91 case AMDGPU::S_SETREG_B32_mode:
92 case AMDGPU::S_SETREG_IMM32_B32:
93 case AMDGPU::S_SETREG_IMM32_B32_mode:
94 return true;
95 }
96 return false;
97}
98
99static bool isRWLane(unsigned Opcode) {
100 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
101}
102
103static bool isRFE(unsigned Opcode) {
104 return Opcode == AMDGPU::S_RFE_B64;
105}
106
107static bool isSMovRel(unsigned Opcode) {
108 switch (Opcode) {
109 case AMDGPU::S_MOVRELS_B32:
110 case AMDGPU::S_MOVRELS_B64:
111 case AMDGPU::S_MOVRELD_B32:
112 case AMDGPU::S_MOVRELD_B64:
113 return true;
114 default:
115 return false;
116 }
117}
118
119 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
120 const MachineInstr &MI) {
121 if (TII.isAlwaysGDS(MI.getOpcode()))
122 return true;
123
124 switch (MI.getOpcode()) {
125 case AMDGPU::S_SENDMSG:
126 case AMDGPU::S_SENDMSGHALT:
127 case AMDGPU::S_TTRACEDATA:
128 return true;
129 // These DS opcodes don't support GDS.
130 case AMDGPU::DS_NOP:
131 case AMDGPU::DS_PERMUTE_B32:
132 case AMDGPU::DS_BPERMUTE_B32:
133 return false;
134 default:
135 if (TII.isDS(MI.getOpcode())) {
136 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
137 AMDGPU::OpName::gds);
138 if (MI.getOperand(GDS).getImm())
139 return true;
140 }
141 return false;
142 }
143}
144
145static bool isPermlane(const MachineInstr &MI) {
146 unsigned Opcode = MI.getOpcode();
147 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
148 Opcode == AMDGPU::V_PERMLANE64_B32 ||
149 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
150 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
151 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
152 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
153 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
154 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
155 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
156 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
157 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
158 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
159 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
160 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
161}
162
163static bool isLdsDma(const MachineInstr &MI) {
164 return SIInstrInfo::isVALU(MI) &&
165 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
166 }
167
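// Illustrative example (assumed, not part of the original source): for
//   s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0
// the simm16 operand packs (ID, offset, width); HwregEncoding::decode()
// unpacks that tuple and getHWReg() keeps only the ID field, which is what the
// set/get-reg hazard checks below compare.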
168static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
169 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
170 AMDGPU::OpName::simm16);
171 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
172}
173
174 ScheduleHazardRecognizer::HazardType
175 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
176 MachineInstr *MI = SU->getInstr();
177 // If we are not in "HazardRecognizerMode" and therefore not being run from
178 // the scheduler, track possible stalls from hazards but don't insert noops.
179 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
180
181 if (MI->isBundle())
182 return NoHazard;
183
184 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
185 return HazardType;
186
187 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
188 return HazardType;
189
190 if (checkFPAtomicToDenormModeHazard(MI) > 0)
191 return HazardType;
192
193 // Hazards which cannot be mitigated with S_NOPs.
194 if (!IsHazardRecognizerMode) {
195 if (checkWMMACoexecutionHazards(MI) > 0)
196 return Hazard;
197 }
198
199 if (ST.hasNoDataDepHazard())
200 return NoHazard;
201
202 if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
203 return HazardType;
204
205 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
206 return HazardType;
207
208 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
209 return HazardType;
210
211 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
212 return HazardType;
213
214 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
215 return HazardType;
216
217 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
218 SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
219 checkMAIVALUHazards(MI) > 0)
220 return HazardType;
221
222 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
223 return HazardType;
224
225 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
226 return HazardType;
227
228 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
229 return HazardType;
230
231 if (((ST.hasReadM0MovRelInterpHazard() &&
232 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
233 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
234 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
235 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
236 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
237 (ST.hasReadM0LdsDirectHazard() &&
238 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
239 checkReadM0Hazards(MI) > 0)
240 return HazardType;
241
242 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
243 return HazardType;
244
246 checkMAILdStHazards(MI) > 0)
247 return HazardType;
248
249 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
250 return HazardType;
251
252 return NoHazard;
253}
254
255 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
256 unsigned Quantity) {
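// Worked example (illustrative): Quantity == 10 emits "s_nop 7" followed by
// "s_nop 1", i.e. 8 + 2 wait states, since each S_NOP encodes (count - 1) in
// its immediate and a single S_NOP can cover at most 8 wait states.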
257 while (Quantity > 0) {
258 unsigned Arg = std::min(Quantity, 8u);
259 Quantity -= Arg;
260 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
261 .addImm(Arg - 1);
262 }
263}
264
265unsigned
266GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
267 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
268 assert(TSchedModel.getWriteProcResBegin(SC) !=
269 TSchedModel.getWriteProcResEnd(SC));
270 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
271}
272
273void GCNHazardRecognizer::processBundle() {
274 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
275 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
276 // Check bundled MachineInstr's for hazards.
277 for (; MI != E && MI->isInsideBundle(); ++MI) {
278 CurrCycleInstr = &*MI;
279 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
280
281 if (IsHazardRecognizerMode) {
282 fixHazards(CurrCycleInstr);
283
284 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
285 }
286
287 // It's unnecessary to track more than MaxLookAhead instructions. Since we
288 // include the bundled MI directly after, only add a maximum of
289 // (MaxLookAhead - 1) noops to EmittedInstrs.
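// Worked example (illustrative): with MaxLookAhead == 5 and WaitStates == 7,
// only 4 nullptr entries are recorded before the bundled MI itself is pushed;
// the resize below then drops anything older than the lookahead window.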
290 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
291 EmittedInstrs.push_front(nullptr);
292
293 EmittedInstrs.push_front(CurrCycleInstr);
294 EmittedInstrs.resize(MaxLookAhead);
295 }
296 CurrCycleInstr = nullptr;
297}
298
299void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
300 assert(IsHazardRecognizerMode);
301
302 unsigned NumPreNoops = PreEmitNoops(MI);
303 EmitNoops(NumPreNoops);
304 if (MI->isInsideBundle())
305 insertNoopsInBundle(MI, TII, NumPreNoops);
306 else
307 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
308 NumPreNoops);
309 EmitInstruction(MI);
310 AdvanceCycle();
311}
312
313 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
314 IsHazardRecognizerMode = true;
315 CurrCycleInstr = MI;
316 unsigned W = PreEmitNoopsCommon(MI);
317 fixHazards(MI);
318 CurrCycleInstr = nullptr;
319 return std::max(W, NopPadding.getValue());
320}
321
322 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
323 if (MI->isBundle())
324 return 0;
325
326 int WaitStates = 0;
327
328 if (SIInstrInfo::isSMRD(*MI))
329 return std::max(WaitStates, checkSMRDHazards(MI));
330
331 if (ST.hasNSAtoVMEMBug())
332 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
333
334 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
335
336 if (ST.hasNoDataDepHazard())
337 return WaitStates;
338
339 if (SIInstrInfo::isVMEM(*MI))
340 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
341
342 if (SIInstrInfo::isVALU(*MI))
343 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
344
345 if (SIInstrInfo::isDPP(*MI))
346 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
347
348 if (isDivFMas(MI->getOpcode()))
349 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
350
351 if (isRWLane(MI->getOpcode()))
352 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
353
354 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
355 SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
356 checkMAIVALUHazards(MI) > 0)
357 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
358
359 if (MI->isInlineAsm())
360 return std::max(WaitStates, checkInlineAsmHazards(MI));
361
362 if (isSGetReg(MI->getOpcode()))
363 return std::max(WaitStates, checkGetRegHazards(MI));
364
365 if (isSSetReg(MI->getOpcode()))
366 return std::max(WaitStates, checkSetRegHazards(MI));
367
368 if (isRFE(MI->getOpcode()))
369 return std::max(WaitStates, checkRFEHazards(MI));
370
371 if ((ST.hasReadM0MovRelInterpHazard() &&
372 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
373 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
374 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
375 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
376 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
377 (ST.hasReadM0LdsDirectHazard() &&
378 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
379 return std::max(WaitStates, checkReadM0Hazards(MI));
380
381 if (SIInstrInfo::isMAI(*MI))
382 return std::max(WaitStates, checkMAIHazards(MI));
383
385 return std::max(WaitStates, checkMAILdStHazards(MI));
386
387 if (ST.hasGFX950Insts() && isPermlane(*MI))
388 return std::max(WaitStates, checkPermlaneHazards(MI));
389
390 return WaitStates;
391}
392
393 void GCNHazardRecognizer::EmitNoop() {
394 EmittedInstrs.push_front(nullptr);
395}
396
397 void GCNHazardRecognizer::AdvanceCycle() {
398 // When the scheduler detects a stall, it will call AdvanceCycle() without
399 // emitting any instructions.
400 if (!CurrCycleInstr) {
401 EmittedInstrs.push_front(nullptr);
402 return;
403 }
404
405 if (CurrCycleInstr->isBundle()) {
406 processBundle();
407 return;
408 }
409
410 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
411 if (!NumWaitStates) {
412 CurrCycleInstr = nullptr;
413 return;
414 }
415
416 // Keep track of emitted instructions
417 EmittedInstrs.push_front(CurrCycleInstr);
418
419 // Add a nullptr for each additional wait state after the first. Make sure
420 // not to add more than getMaxLookAhead() items to the list, since we
421 // truncate the list to that size right after this loop.
422 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
423 i < e; ++i) {
424 EmittedInstrs.push_front(nullptr);
425 }
426
427 // getMaxLookahead() is the largest number of wait states we will ever need
428 // to insert, so there is no point in keeping track of more than that many
429 // wait states.
430 EmittedInstrs.resize(getMaxLookAhead());
431
432 CurrCycleInstr = nullptr;
433}
434
435 void GCNHazardRecognizer::RecedeCycle() {
436 assert(!IsHazardRecognizerMode &&
437 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
438}
439
440//===----------------------------------------------------------------------===//
441// Helper Functions
442//===----------------------------------------------------------------------===//
443
444 using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
445
446// Search for a hazard in a block and its predecessors.
447template <typename StateT>
448static bool
449hasHazard(StateT InitialState,
450 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
451 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
452 const MachineBasicBlock *InitialMBB,
454 struct StateMapKey {
456 unsigned Idx;
457 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
458 return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
459 }
460 };
461 struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
462 static inline StateMapKey getEmptyKey() {
463 return {static_cast<SmallVectorImpl<StateT> *>(
466 }
467 static inline StateMapKey getTombstoneKey() {
468 return {static_cast<SmallVectorImpl<StateT> *>(
471 }
472 static unsigned getHashValue(const StateMapKey &Key) {
473 return StateT::getHashValue((*Key.States)[Key.Idx]);
474 }
475 static unsigned getHashValue(const StateT &State) {
476 return StateT::getHashValue(State);
477 }
478 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
479 const auto EKey = getEmptyKey();
480 const auto TKey = getTombstoneKey();
481 if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) ||
482 StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey))
483 return StateMapKey::isEqual(LHS, RHS);
484 return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
485 }
486 static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
487 if (StateMapKey::isEqual(RHS, getEmptyKey()) ||
488 StateMapKey::isEqual(RHS, getTombstoneKey()))
489 return false;
490 return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
491 }
492 };
493
496
498 const MachineBasicBlock *MBB = InitialMBB;
499 StateT State = InitialState;
500
502 unsigned WorkIdx = 0;
503 for (;;) {
504 bool Expired = false;
505 for (auto E = MBB->instr_rend(); I != E; ++I) {
506 // No need to look at parent BUNDLE instructions.
507 if (I->isBundle())
508 continue;
509
510 auto Result = IsHazard(State, *I);
511 if (Result == HazardFound)
512 return true;
513 if (Result == HazardExpired) {
514 Expired = true;
515 break;
516 }
517
518 if (I->isInlineAsm() || I->isMetaInstruction())
519 continue;
520
521 UpdateState(State, *I);
522 }
523
524 if (!Expired) {
525 unsigned StateIdx = States.size();
526 StateMapKey Key = {&States, StateIdx};
527 auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
528 if (Insertion.second) {
529 States.emplace_back(State);
530 } else {
531 StateIdx = Insertion.first->second;
532 }
533 for (MachineBasicBlock *Pred : MBB->predecessors())
534 Worklist.insert(std::pair(Pred, StateIdx));
535 }
536
537 if (WorkIdx == Worklist.size())
538 break;
539
540 unsigned StateIdx;
541 std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
542 State = States[StateIdx];
543 I = MBB->instr_rbegin();
544 }
545
546 return false;
547}
548
549 // Returns the minimum number of wait states since \p I, walking all
550 // predecessors. Only scans until \p IsExpired returns true.
551 // Can only be run in hazard recognizer mode.
552 static int
553 getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
554 const MachineBasicBlock *MBB,
555 MachineBasicBlock::const_reverse_instr_iterator I,
556 int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
557 DenseSet<const MachineBasicBlock *> &Visited,
558 GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates) {
559
560 for (auto E = MBB->instr_rend(); I != E; ++I) {
561 // Don't add WaitStates for parent BUNDLE instructions.
562 if (I->isBundle())
563 continue;
564
565 if (IsHazard(*I))
566 return WaitStates;
567
568 if (I->isInlineAsm())
569 continue;
570
571 WaitStates += GetNumWaitStates(*I);
572
573 if (IsExpired(*I, WaitStates))
574 return std::numeric_limits<int>::max();
575 }
576
577 int MinWaitStates = std::numeric_limits<int>::max();
578 for (MachineBasicBlock *Pred : MBB->predecessors()) {
579 if (!Visited.insert(Pred).second)
580 continue;
581
582 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
583 IsExpired, Visited, GetNumWaitStates);
584
585 MinWaitStates = std::min(MinWaitStates, W);
586 }
587
588 return MinWaitStates;
589}
590
591 static int
592 getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
593 const MachineInstr *MI,
594 GCNHazardRecognizer::IsExpiredFn IsExpired,
595 GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
596 SIInstrInfo::getNumWaitStates) {
597 DenseSet<const MachineBasicBlock *> Visited;
598 return getWaitStatesSince(IsHazard, MI->getParent(),
599 std::next(MI->getReverseIterator()), 0, IsExpired,
600 Visited, GetNumWaitStates);
601}
602
603int GCNHazardRecognizer::getWaitStatesSince(
604 IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) {
605 if (IsHazardRecognizerMode) {
606 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
607 return WaitStates >= Limit;
608 };
609 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
610 GetNumWaitStates);
611 }
612
613 int WaitStates = 0;
614 for (MachineInstr *MI : EmittedInstrs) {
615 if (MI) {
616 if (IsHazard(*MI))
617 return WaitStates;
618
619 if (MI->isInlineAsm())
620 continue;
621 }
622 WaitStates += MI ? GetNumWaitStates(*MI) : 1;
623
624 if (WaitStates >= Limit)
625 break;
626 }
627 return std::numeric_limits<int>::max();
628}
629
630int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
631 return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates);
632}
633
634int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
635 IsHazardFn IsHazardDef,
636 int Limit) {
637 const SIRegisterInfo *TRI = ST.getRegisterInfo();
638
639 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
640 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
641 };
642
643 return getWaitStatesSince(IsHazardFn, Limit);
644}
645
646int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
647 int Limit) {
648 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
649 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
650 };
651
652 return getWaitStatesSince(IsHazardFn, Limit);
653}
654
655//===----------------------------------------------------------------------===//
656// No-op Hazard Detection
657//===----------------------------------------------------------------------===//
658
659static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
660 MCRegister Reg) {
661 for (MCRegUnit Unit : TRI.regunits(Reg))
662 BV.set(static_cast<unsigned>(Unit));
663}
664
665static void addRegsToSet(const SIRegisterInfo &TRI,
666 iterator_range<MachineInstr::const_mop_iterator> Ops,
667 BitVector &DefSet, BitVector &UseSet) {
668 for (const MachineOperand &Op : Ops) {
669 if (Op.isReg())
670 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
671 }
672}
673
674void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
675 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
676}
677
678 static bool breaksSMEMSoftClause(MachineInstr *MI) {
679 return !SIInstrInfo::isSMRD(*MI);
680}
681
682 static bool breaksVMEMSoftClause(MachineInstr *MI) {
683 return !SIInstrInfo::isVMEM(*MI);
684}
685
686int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
687 // SMEM soft clauses are only present on VI+, and only matter if xnack is
688 // enabled.
689 if (!ST.isXNACKEnabled())
690 return 0;
691
692 bool IsSMRD = TII.isSMRD(*MEM);
693
694 resetClause();
695
696 // A soft-clause is any group of consecutive SMEM instructions. The
697 // instructions in this group may return out of order and/or may be
698 // replayed (i.e. the same instruction issued more than once).
699 //
700 // In order to handle these situations correctly we need to make sure that
701 // when a clause has more than one instruction, no instruction in the clause
702 // writes to a register that is read by another instruction in the clause
703 // (including itself). If we encounter this situation, we need to break the
704 // clause by inserting a non SMEM instruction.
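// Illustrative clause break (assumed example, not from the original source):
//   s_load_dwordx2 s[0:1], s[4:5], 0x0
//   s_load_dword   s2, s[0:1], 0x10    ; uses s[0:1], defined by the previous
//                                      ; SMEM in the same soft clause
// The def/use overlap within the clause forces insertion of a non-SMEM
// instruction, reported as a one wait-state hazard below.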
705
706 for (MachineInstr *MI : EmittedInstrs) {
707 // When we hit a non-SMEM instruction then we have passed the start of the
708 // clause and we can stop.
709 if (!MI)
710 break;
711
712 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
713 break;
714
715 addClauseInst(*MI);
716 }
717
718 if (ClauseDefs.none())
719 return 0;
720
721 // We need to make sure not to put loads and stores in the same clause if they
722 // use the same address. For now, just start a new clause whenever we see a
723 // store.
724 if (MEM->mayStore())
725 return 1;
726
727 addClauseInst(*MEM);
728
729 // If the set of defs and uses intersect then we cannot add this instruction
730 // to the clause, so we have a hazard.
731 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
732}
733
734int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
735 int WaitStatesNeeded = 0;
736
737 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
738
739 // This SMRD hazard only affects SI.
740 if (!ST.hasSMRDReadVALUDefHazard())
741 return WaitStatesNeeded;
742
743 // A read of an SGPR by an SMRD instruction requires 4 wait states when the
744 // SGPR was written by a VALU instruction.
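// Illustrative hazard (assumed example):
//   v_readfirstlane_b32 s0, v0          ; VALU writes an SGPR
//   s_load_dword        s4, s[0:1], 0x0 ; SMRD reads that SGPR
// The s_load_dword must observe 4 wait states after the VALU write.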
745 int SmrdSgprWaitStates = 4;
746 auto IsHazardDefFn = [this](const MachineInstr &MI) {
747 return TII.isVALU(MI);
748 };
749 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
750 return TII.isSALU(MI);
751 };
752
753 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
754
755 for (const MachineOperand &Use : SMRD->uses()) {
756 if (!Use.isReg())
757 continue;
758 int WaitStatesNeededForUse =
759 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
760 SmrdSgprWaitStates);
761 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
762
763 // This fixes what appears to be undocumented hardware behavior in SI where
764 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
765 // needs some number of nops in between. We don't know how many we need, but
766 // let's use 4. This wasn't discovered before probably because the only
767 // case when this happens is when we expand a 64-bit pointer into a full
768 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
769 // probably never encountered in the closed-source land.
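// Illustrative sequence (assumed example):
//   s_mov_b32           s7, 0x27000       ; SALU writes part of the descriptor
//   s_buffer_load_dword s0, s[4:7], 0x0   ; buffer SMRD reads s[4:7]
// The conservative 4 wait states below cover this case as well.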
770 if (IsBufferSMRD) {
771 int WaitStatesNeededForUse =
772 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
773 IsBufferHazardDefFn,
774 SmrdSgprWaitStates);
775 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
776 }
777 }
778
779 return WaitStatesNeeded;
780}
781
782int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
783 if (!ST.hasVMEMReadSGPRVALUDefHazard())
784 return 0;
785
786 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
787
788 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
789 // SGPR was written by a VALU Instruction.
790 const int VmemSgprWaitStates = 5;
791 auto IsHazardDefFn = [this](const MachineInstr &MI) {
792 return TII.isVALU(MI);
793 };
794 for (const MachineOperand &Use : VMEM->uses()) {
795 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
796 continue;
797
798 int WaitStatesNeededForUse =
799 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
800 VmemSgprWaitStates);
801 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
802 }
803 return WaitStatesNeeded;
804}
805
806int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
807 const SIRegisterInfo *TRI = ST.getRegisterInfo();
808 const SIInstrInfo *TII = ST.getInstrInfo();
809
810 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
811 int DppVgprWaitStates = 2;
812 int DppExecWaitStates = 5;
813 int WaitStatesNeeded = 0;
814 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
815 return TII->isVALU(MI);
816 };
817
818 for (const MachineOperand &Use : DPP->uses()) {
819 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
820 continue;
821 int WaitStatesNeededForUse =
822 DppVgprWaitStates - getWaitStatesSinceDef(
823 Use.getReg(),
824 [](const MachineInstr &) { return true; },
825 DppVgprWaitStates);
826 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
827 }
828
829 WaitStatesNeeded = std::max(
830 WaitStatesNeeded,
831 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
832 DppExecWaitStates));
833
834 return WaitStatesNeeded;
835}
836
837int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
838 const SIInstrInfo *TII = ST.getInstrInfo();
839
840 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
841 // instruction.
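// Illustrative hazard (assumed example):
//   v_cmp_eq_f32_e32 vcc, v0, v1      ; VALU writes VCC
//   v_div_fmas_f32   v2, v4, v5, v6   ; implicitly reads VCC
// The v_div_fmas must observe 4 wait states after the VCC write.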
842 const int DivFMasWaitStates = 4;
843 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
844 return TII->isVALU(MI);
845 };
846 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
847 DivFMasWaitStates);
848
849 return DivFMasWaitStates - WaitStatesNeeded;
850}
851
852int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
853 const SIInstrInfo *TII = ST.getInstrInfo();
854 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
855
856 const int GetRegWaitStates = 2;
857 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
858 return GetRegHWReg == getHWReg(TII, MI);
859 };
860 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
861
862 return GetRegWaitStates - WaitStatesNeeded;
863}
864
865int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
866 const SIInstrInfo *TII = ST.getInstrInfo();
867 unsigned HWReg = getHWReg(TII, *SetRegInstr);
868
869 const int SetRegWaitStates = ST.getSetRegWaitStates();
870 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
871 return HWReg == getHWReg(TII, MI);
872 };
873 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
874 return SetRegWaitStates - WaitStatesNeeded;
875}
876
877int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
878 if (!MI.mayStore())
879 return -1;
880
881 const SIInstrInfo *TII = ST.getInstrInfo();
882 unsigned Opcode = MI.getOpcode();
883 const MCInstrDesc &Desc = MI.getDesc();
884
885 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
886 int VDataRCID = -1;
887 if (VDataIdx != -1)
888 VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);
889
890 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
891 // There is no hazard if the instruction does not use vector regs
892 // (like wbinvl1)
893 if (VDataIdx == -1)
894 return -1;
895 // For MUBUF/MTBUF instructions this hazard only exists if the
896 // instruction is not using a register in the soffset field.
897 const MachineOperand *SOffset =
898 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
899 // If we have no soffset operand, then assume this field has been
900 // hardcoded to zero.
901 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
902 (!SOffset || !SOffset->isReg()))
903 return VDataIdx;
904 }
905
906 // MIMG instructions create a hazard if they don't use a 256-bit T# and
907 // the store size is greater than 8 bytes and they have more than two bits
908 // of their dmask set.
909 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
910 if (TII->isMIMG(MI)) {
911 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
912 assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
913 Desc.operands()[SRsrcIdx])) == 256);
914 (void)SRsrcIdx;
915 }
916
917 if (TII->isFLAT(MI)) {
918 // There is no hazard if the instruction does not use vector regs
919 if (VDataIdx == -1)
920 return -1;
921
922 if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
923 return VDataIdx;
924 }
925
926 return -1;
927}
928
929int
930GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
931 const MachineRegisterInfo &MRI) {
932 // Helper to check for the hazard where VMEM instructions that store more than
933 // 8 bytes can have their store data overwritten by the next instruction.
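// Illustrative hazard (assumed example):
//   buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen  ; stores more than 8 bytes
//   v_mov_b32            v1, 0                        ; overwrites part of the
//                                                     ; outgoing store data
// One wait state (two on GFX940) must separate the two instructions.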
934 const SIRegisterInfo *TRI = ST.getRegisterInfo();
935
936 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
937 int WaitStatesNeeded = 0;
938
939 if (!TRI->isVectorRegister(MRI, Def.getReg()))
940 return WaitStatesNeeded;
941 Register Reg = Def.getReg();
942 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
943 int DataIdx = createsVALUHazard(MI);
944 return DataIdx >= 0 &&
945 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
946 };
947
948 int WaitStatesNeededForDef =
949 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
950 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
951
952 return WaitStatesNeeded;
953}
954
955/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
956/// pack the computed value into correct bit position of the dest register. This
957/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
958 /// dst_sel that is not aligned to the register. This function analyzes the \p
959/// MI and \returns an operand with dst forwarding issue, or nullptr if
960/// none exists.
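/// Illustrative producer (assumed example, not from the original source):
///   v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE
/// only updates the high 16 bits of v0, so the result goes through the
/// dest-forwarding (swizzle/pack) logic and this function returns the vdst
/// operand (v0).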
961 static const MachineOperand *
962 getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
963 if (!SIInstrInfo::isVALU(MI))
964 return nullptr;
965
966 const SIInstrInfo *TII = ST.getInstrInfo();
967
968 unsigned Opcode = MI.getOpcode();
969
970 // There are three different types of instructions
971 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
972 // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
973 // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
974 // op_sel[3:2]
975 // != 0
976 if (SIInstrInfo::isSDWA(MI)) {
977 // Type 1: SDWA with dst_sel != DWORD
978 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
979 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
980 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
981 }
982
983 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
984 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
985 // Type 2: VOP3 which write the hi bits
986 if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
988 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
989
990 // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
991 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
992 (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
994 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
995 }
996
997 // Special case: nop is required for all the opsel values for fp4 sr variant
998 // cvt scale instructions
999 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
1000 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1001
1002 return nullptr;
1003}
1004
1005/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
1006 /// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
1007/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
1008 static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
1009 const MachineOperand *Dst,
1010 const SIRegisterInfo *TRI) {
1011 // We must consider implicit reads of the VALU. SDWA with dst_sel and
1012 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
1013 // and we must account for that hazard.
1014 // We also must account for WAW hazards. In particular, WAW with dest
1015 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
1016 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
1017 // check for ECC. Without accounting for this hazard, the ECC will be
1018 // wrong.
1019 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
1020 // complete zeroesHigh16BitsOfDest)
1021 for (auto &Operand : VALU->operands()) {
1022 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1023 return true;
1024 }
1025 }
1026 return false;
1027}
1028
1029int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
1030 int WaitStatesNeeded = 0;
1031
1032 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
1033 const int TransDefWaitstates = 1;
1034
1035 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
1036 if (!SIInstrInfo::isTRANS(MI))
1037 return false;
1038 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1039 const SIInstrInfo *TII = ST.getInstrInfo();
1040 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
1041
1042 for (const MachineOperand &Use : VALU->explicit_uses()) {
1043 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
1044 return true;
1045 }
1046
1047 return false;
1048 };
1049
1050 int WaitStatesNeededForDef =
1051 TransDefWaitstates -
1052 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1053 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1054 }
1055
1056 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1057 const int Shift16DefWaitstates = 1;
1058
1059 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1060 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1061 const MachineOperand *ForwardedDst =
1062 getDstSelForwardingOperand(ProducerMI, ST);
1063 if (ForwardedDst) {
1064 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
1065 }
1066
1067 if (ProducerMI.isInlineAsm()) {
1068 // Assume inline asm has dst forwarding hazard
1069 for (auto &Def : ProducerMI.all_defs()) {
1070 if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1071 return true;
1072 }
1073 }
1074
1075 return false;
1076 };
1077
1078 int WaitStatesNeededForDef =
1079 Shift16DefWaitstates -
1080 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1081 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1082 }
1083
1084 if (ST.hasVDecCoExecHazard()) {
1085 const int VALUWriteSGPRVALUReadWaitstates = 2;
1086 const int VALUWriteEXECRWLane = 4;
1087 const int VALUWriteVGPRReadlaneRead = 1;
1088
1089 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1090 const MachineRegisterInfo &MRI = MF.getRegInfo();
1091 Register UseReg;
1092 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1093 if (!SIInstrInfo::isVALU(MI))
1094 return false;
1095 return MI.modifiesRegister(UseReg, TRI);
1096 };
1097
1098 for (const MachineOperand &Use : VALU->explicit_uses()) {
1099 if (!Use.isReg())
1100 continue;
1101
1102 UseReg = Use.getReg();
1103 if (TRI->isSGPRReg(MRI, UseReg)) {
1104 int WaitStatesNeededForDef =
1105 VALUWriteSGPRVALUReadWaitstates -
1106 getWaitStatesSince(IsVALUDefSGPRFn,
1107 VALUWriteSGPRVALUReadWaitstates);
1108 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1109 }
1110 }
1111
1112 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1113 UseReg = AMDGPU::VCC;
1114 int WaitStatesNeededForDef =
1115 VALUWriteSGPRVALUReadWaitstates -
1116 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1117 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1118 }
1119
1120 switch (VALU->getOpcode()) {
1121 case AMDGPU::V_READLANE_B32:
1122 case AMDGPU::V_READFIRSTLANE_B32: {
1123 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1124 UseReg = Src->getReg();
1125 int WaitStatesNeededForDef =
1126 VALUWriteVGPRReadlaneRead -
1127 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1128 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1129 }
1130 [[fallthrough]];
1131 case AMDGPU::V_WRITELANE_B32: {
1132 UseReg = AMDGPU::EXEC;
1133 int WaitStatesNeededForDef =
1134 VALUWriteEXECRWLane -
1135 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1136 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1137 break;
1138 }
1139 default:
1140 break;
1141 }
1142 }
1143
1144 // This checks for the hazard where VMEM instructions that store more than
1145 // 8 bytes can have their store data overwritten by the next instruction.
1146 if (!ST.has12DWordStoreHazard())
1147 return WaitStatesNeeded;
1148
1149 const MachineRegisterInfo &MRI = MF.getRegInfo();
1150
1151 for (const MachineOperand &Def : VALU->defs()) {
1152 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1153 }
1154
1155 return WaitStatesNeeded;
1156}
1157
1158int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1159 // This checks for hazards associated with inline asm statements.
1160 // Since inline asms can contain just about anything, we use this
1161 // to call/leverage other check*Hazard routines. Note that
1162 // this function doesn't attempt to address all possible inline asm
1163 // hazards (good luck), but is a collection of what has been
1164 // problematic thus far.
1165
1166 // see checkVALUHazards()
1167 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1168 !ST.hasCvtScaleForwardingHazard())
1169 return 0;
1170
1171 const MachineRegisterInfo &MRI = MF.getRegInfo();
1172 int WaitStatesNeeded = 0;
1173
1174 for (const MachineOperand &Op :
1176 if (Op.isReg() && Op.isDef()) {
1177 if (!TRI.isVectorRegister(MRI, Op.getReg()))
1178 continue;
1179
1180 if (ST.has12DWordStoreHazard()) {
1181 WaitStatesNeeded =
1182 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1183 }
1184 }
1185 }
1186
1187 if (ST.hasDstSelForwardingHazard()) {
1188 const int Shift16DefWaitstates = 1;
1189
1190 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1191 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1192 // Assume inline asm reads the dst
1193 if (Dst)
1194 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1195 IA->readsRegister(Dst->getReg(), &TRI);
1196
1197 if (ProducerMI.isInlineAsm()) {
1198 // If MI is inline asm, assume it has dst forwarding hazard
1199 for (auto &Def : ProducerMI.all_defs()) {
1200 if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1201 IA->readsRegister(Def.getReg(), &TRI)) {
1202 return true;
1203 }
1204 }
1205 }
1206
1207 return false;
1208 };
1209
1210 int WaitStatesNeededForDef =
1211 Shift16DefWaitstates -
1212 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1213 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1214 }
1215
1216 return WaitStatesNeeded;
1217}
1218
1219int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1220 const SIInstrInfo *TII = ST.getInstrInfo();
1221 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1222 const MachineRegisterInfo &MRI = MF.getRegInfo();
1223
1224 const MachineOperand *LaneSelectOp =
1225 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1226
1227 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1228 return 0;
1229
1230 Register LaneSelectReg = LaneSelectOp->getReg();
1231 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1232
1233 const int RWLaneWaitStates = 4;
1234 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1235 RWLaneWaitStates);
1236 return RWLaneWaitStates - WaitStatesSince;
1237}
1238
1239int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1240 if (!ST.hasRFEHazards())
1241 return 0;
1242
1243 const SIInstrInfo *TII = ST.getInstrInfo();
1244
1245 const int RFEWaitStates = 1;
1246
1247 auto IsHazardFn = [TII](const MachineInstr &MI) {
1248 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1249 };
1250 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1251 return RFEWaitStates - WaitStatesNeeded;
1252}
1253
1254int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1255 const SIInstrInfo *TII = ST.getInstrInfo();
1256 const int ReadM0WaitStates = 1;
1257 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1258 return ReadM0WaitStates -
1259 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1260}
1261
1262// emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we need
1263// to insert, negative means not needed.
1264bool GCNHazardRecognizer::emitVNops(MachineInstr *MI, int WaitStatesNeeded) {
1265 if (WaitStatesNeeded <= 0)
1266 return false;
1267
1268 const SIInstrInfo *TII = ST.getInstrInfo();
1269 for (int I = 0; I < WaitStatesNeeded; ++I)
1270 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1271 TII->get(AMDGPU::V_NOP_e32));
1272
1273 return true;
1274}
1275
1276void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1277 fixVMEMtoScalarWriteHazards(MI);
1278 fixVcmpxPermlaneHazards(MI);
1279 fixSMEMtoVectorWriteHazards(MI);
1280 fixVcmpxExecWARHazard(MI);
1281 fixLdsBranchVmemWARHazard(MI);
1282 if (ST.hasLdsDirect()) {
1283 fixLdsDirectVALUHazard(MI);
1284 fixLdsDirectVMEMHazard(MI);
1285 }
1286 fixVALUPartialForwardingHazard(MI);
1287 fixVALUTransUseHazard(MI);
1288 fixVALUTransCoexecutionHazards(MI);
1289 fixWMMAHazards(MI); // fall-through if co-execution is enabled.
1290 emitVNops(MI, checkWMMACoexecutionHazards(MI));
1291 fixShift64HighRegBug(MI);
1292 fixVALUMaskWriteHazard(MI);
1293 fixRequiredExportPriority(MI);
1294 if (ST.requiresWaitIdleBeforeGetReg())
1295 fixGetRegWaitIdle(MI);
1296 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1297 fixDsAtomicAsyncBarrierArriveB64(MI);
1298 if (ST.hasScratchBaseForwardingHazard())
1299 fixScratchBaseForwardingHazard(MI);
1300 if (ST.setRegModeNeedsVNOPs())
1301 fixSetRegMode(MI);
1302}
1303
1304 static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1305 const MachineInstr &MI) {
1306 return (TII.isVOPC(MI) ||
1307 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1308 MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1309}
1310
1311bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1312 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1313 return false;
1314
1315 const SIInstrInfo *TII = ST.getInstrInfo();
1316 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1317 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1318 return isVCmpXWritesExec(*TII, *TRI, MI);
1319 };
1320
1321 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1322 unsigned Opc = MI.getOpcode();
1323 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1324 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1325 };
1326
1327 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1328 std::numeric_limits<int>::max())
1329 return false;
1330
1331 // V_NOP will be discarded by SQ.
1332 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1333 // which is always a VGPR and available.
1334 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1335 Register Reg = Src0->getReg();
1336 bool IsUndef = Src0->isUndef();
1337 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1338 TII->get(AMDGPU::V_MOV_B32_e32))
1339 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1340 .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1341
1342 return true;
1343}
1344
1345bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1346 if (!ST.hasVMEMtoScalarWriteHazard())
1347 return false;
1348 assert(!ST.hasExtendedWaitCounts());
1349
1351 return false;
1352
1353 if (MI->getNumDefs() == 0)
1354 return false;
1355
1356 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1357
1358 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1359 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
1360 return false;
1361
1362 for (const MachineOperand &Def : MI->defs()) {
1363 const MachineOperand *Op =
1364 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1365 if (!Op)
1366 continue;
1367 return true;
1368 }
1369 return false;
1370 };
1371
1372 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1373 return SIInstrInfo::isVALU(MI) ||
1374 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1375 !MI.getOperand(0).getImm()) ||
1376 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1377 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1378 };
1379
1380 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1381 std::numeric_limits<int>::max())
1382 return false;
1383
1384 const SIInstrInfo *TII = ST.getInstrInfo();
1385 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1386 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1387 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1388 return true;
1389}
1390
1391bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1392 if (!ST.hasSMEMtoVectorWriteHazard())
1393 return false;
1394 assert(!ST.hasExtendedWaitCounts());
1395
1396 if (!SIInstrInfo::isVALU(*MI))
1397 return false;
1398
1399 AMDGPU::OpName SDSTName;
1400 switch (MI->getOpcode()) {
1401 case AMDGPU::V_READLANE_B32:
1402 case AMDGPU::V_READFIRSTLANE_B32:
1403 SDSTName = AMDGPU::OpName::vdst;
1404 break;
1405 default:
1406 SDSTName = AMDGPU::OpName::sdst;
1407 break;
1408 }
1409
1410 const SIInstrInfo *TII = ST.getInstrInfo();
1411 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1412 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1413 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1414 if (!SDST) {
1415 for (const auto &MO : MI->implicit_operands()) {
1416 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1417 SDST = &MO;
1418 break;
1419 }
1420 }
1421 }
1422
1423 if (!SDST)
1424 return false;
1425
1426 const Register SDSTReg = SDST->getReg();
1427 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1428 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1429 };
1430
1431 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1432 if (TII->isSALU(MI)) {
1433 switch (MI.getOpcode()) {
1434 case AMDGPU::S_SETVSKIP:
1435 case AMDGPU::S_VERSION:
1436 case AMDGPU::S_WAITCNT_VSCNT:
1437 case AMDGPU::S_WAITCNT_VMCNT:
1438 case AMDGPU::S_WAITCNT_EXPCNT:
1439 // These instructions cannot mitigate the hazard.
1440 return false;
1441 case AMDGPU::S_WAITCNT_LGKMCNT:
1442 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1443 return (MI.getOperand(1).getImm() == 0) &&
1444 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1445 case AMDGPU::S_WAITCNT: {
1446 const int64_t Imm = MI.getOperand(0).getImm();
1447 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1448 // DsCnt corresponds to LGKMCnt here.
1449 return (Decoded.DsCnt == 0);
1450 }
1451 default:
1452 assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
1453 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1454 "unexpected wait count instruction");
1455 // SOPP instructions cannot mitigate the hazard.
1456 if (TII->isSOPP(MI))
1457 return false;
1458 // At this point the SALU can be assumed to mitigate the hazard
1459 // because either:
1460 // (a) it is independent of the at risk SMEM (breaking chain),
1461 // or
1462 // (b) it is dependent on the SMEM, in which case an appropriate
1463 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1464 // SMEM instruction.
1465 return true;
1466 }
1467 }
1468 return false;
1469 };
1470
1471 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1472 std::numeric_limits<int>::max())
1473 return false;
1474
1475 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1476 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1477 .addImm(0);
1478 return true;
1479}
1480
1481bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1482 if (!ST.hasVcmpxExecWARHazard())
1483 return false;
1484 assert(!ST.hasExtendedWaitCounts());
1485
1486 if (!SIInstrInfo::isVALU(*MI))
1487 return false;
1488
1489 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1490 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1491 return false;
1492
1493 auto IsHazardFn = [TRI](const MachineInstr &I) {
1494 if (SIInstrInfo::isVALU(I))
1495 return false;
1496 return I.readsRegister(AMDGPU::EXEC, TRI);
1497 };
1498
1499 const SIInstrInfo *TII = ST.getInstrInfo();
1500 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1501 if (SIInstrInfo::isVALU(MI)) {
1502 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1503 return true;
1504 for (auto MO : MI.implicit_operands())
1505 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1506 return true;
1507 }
1508 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1509 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1510 return true;
1511 return false;
1512 };
1513
1514 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1515 std::numeric_limits<int>::max())
1516 return false;
1517
1518 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1519 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1520 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1521 return true;
1522}
1523
1524 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1525 const GCNSubtarget &ST) {
1526 if (!ST.hasLdsBranchVmemWARHazard())
1527 return false;
1528
1529 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1530 // instructions need to appear in the same function.
1531 bool HasLds = false;
1532 bool HasVmem = false;
1533 for (auto &MBB : MF) {
1534 for (auto &MI : MBB) {
1535 HasLds |= SIInstrInfo::isDS(MI);
1536 HasVmem |= SIInstrInfo::isVMEM(MI);
1537 if (HasLds && HasVmem)
1538 return true;
1539 }
1540 }
1541 return false;
1542}
1543
1544 static bool isStoreCountWaitZero(const MachineInstr &I) {
1545 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1546 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1547 !I.getOperand(1).getImm();
1548}
1549
1550bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1551 if (!RunLdsBranchVmemWARHazardFixup)
1552 return false;
1553
1554 assert(ST.hasLdsBranchVmemWARHazard());
1555 assert(!ST.hasExtendedWaitCounts());
1556
1557 auto IsHazardInst = [](const MachineInstr &MI) {
1558 if (SIInstrInfo::isDS(MI))
1559 return 1;
1560 if (SIInstrInfo::isVMEM(MI))
1561 return 2;
1562 return 0;
1563 };
1564
1565 auto InstType = IsHazardInst(*MI);
1566 if (!InstType)
1567 return false;
1568
1569 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1570 return IsHazardInst(I) || isStoreCountWaitZero(I);
1571 };
1572
1573 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1574 if (!I.isBranch())
1575 return false;
1576
1577 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1578 auto InstType2 = IsHazardInst(I);
1579 return InstType2 && InstType != InstType2;
1580 };
1581
1582 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1583 auto InstType2 = IsHazardInst(I);
1584 if (InstType == InstType2)
1585 return true;
1586
1587 return isStoreCountWaitZero(I);
1588 };
1589
1590 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1591 std::numeric_limits<int>::max();
1592 };
1593
1594 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1595 std::numeric_limits<int>::max())
1596 return false;
1597
1598 const SIInstrInfo *TII = ST.getInstrInfo();
1599 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1600 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1601 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1602 .addImm(0);
1603
1604 return true;
1605}
1606
1607bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1608 if (!SIInstrInfo::isLDSDIR(*MI))
1609 return false;
1610
1611 const int NoHazardWaitStates = 15;
1612 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1613 const Register VDSTReg = VDST->getReg();
1614
1615 bool VisitedTrans = false;
1616 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1617 if (!SIInstrInfo::isVALU(I))
1618 return false;
1619 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1620 // Cover both WAR and WAW
1621 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1622 };
1623 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1624 if (WaitStates >= NoHazardWaitStates)
1625 return true;
1626 // Instructions which cause va_vdst==0 expire hazard
1629 };
1630 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1631 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1632 };
1633
1634 DenseSet<const MachineBasicBlock *> Visited;
1635 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1636 std::next(MI->getReverseIterator()), 0,
1637 IsExpiredFn, Visited, GetWaitStatesFn);
1638
1639 // Transcendentals can execute in parallel to other VALUs.
1640 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1641 if (VisitedTrans)
1642 Count = 0;
1643
1644 MachineOperand *WaitVdstOp =
1645 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1646 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1647
1648 return true;
1649}
1650
1651bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1652 if (!SIInstrInfo::isLDSDIR(*MI))
1653 return false;
1654
1655 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1656 const Register VDSTReg = VDST->getReg();
1657
1658 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1659 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
1660 return false;
1661 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1662 };
1663 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1664 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1665 // according to the type of VMEM instruction.
1666 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1668 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1669 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1670 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1671 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1672 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1673 };
1674
1675 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1676 std::numeric_limits<int>::max())
1677 return false;
1678
1679 if (LdsdirCanWait) {
1680 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1681 } else {
1682 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1683 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1685 }
1686
1687 return true;
1688}
1689
1690bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1691 if (!ST.hasVALUPartialForwardingHazard())
1692 return false;
1693 assert(!ST.hasExtendedWaitCounts());
1694
1695 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1696 return false;
1697
1698 SmallSetVector<Register, 4> SrcVGPRs;
1699
1700 for (const MachineOperand &Use : MI->explicit_uses()) {
1701 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1702 SrcVGPRs.insert(Use.getReg());
1703 }
1704
1705 // Only applies with >= 2 unique VGPR sources
1706 if (SrcVGPRs.size() <= 1)
1707 return false;
1708
1709 // Look for the following pattern:
1710 // Va <- VALU [PreExecPos]
1711 // intv1
1712 // Exec <- SALU [ExecPos]
1713 // intv2
1714 // Vb <- VALU [PostExecPos]
1715 // intv3
1716 // MI Va, Vb (WaitState = 0)
1717 //
1718 // Where:
1719 // intv1 + intv2 <= 2 VALUs
1720 // intv3 <= 4 VALUs
1721 //
1722 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1723
1724 const int Intv1plus2MaxVALUs = 2;
1725 const int Intv3MaxVALUs = 4;
1726 const int IntvMaxVALUs = 6;
1727 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1728
1729 struct StateType {
1730 SmallDenseMap<Register, int, 4> DefPos;
1731 int ExecPos = std::numeric_limits<int>::max();
1732 int VALUs = 0;
1733
1734 static unsigned getHashValue(const StateType &State) {
1735 return hash_combine(State.ExecPos, State.VALUs,
1736 hash_combine_range(State.DefPos));
1737 }
1738 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1739 return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
1740 LHS.VALUs == RHS.VALUs;
1741 }
1742 };
1743
1744 StateType State;
1745
1746 // This overloads expiry testing with all the hazard detection
1747 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1748 // Too many VALU states have passed
1749 if (State.VALUs > NoHazardVALUWaitStates)
1750 return HazardExpired;
1751
1752 // Instructions which cause va_vdst==0 expire hazard
1755 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1756 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1757 return HazardExpired;
1758
1759 // Track registers writes
1760 bool Changed = false;
1761 if (SIInstrInfo::isVALU(I)) {
1762 for (Register Src : SrcVGPRs) {
1763 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1764 State.DefPos[Src] = State.VALUs;
1765 Changed = true;
1766 }
1767 }
1768 } else if (SIInstrInfo::isSALU(I)) {
1769 if (State.ExecPos == std::numeric_limits<int>::max()) {
1770 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1771 State.ExecPos = State.VALUs;
1772 Changed = true;
1773 }
1774 }
1775 }
1776
1777 // Early expiration: too many VALUs in intv3
1778 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1779 return HazardExpired;
1780
1781 // Only evaluate state if something changed
1782 if (!Changed)
1783 return NoHazardFound;
1784
1785 // Determine positions of VALUs pre/post exec change
1786 if (State.ExecPos == std::numeric_limits<int>::max())
1787 return NoHazardFound;
1788
1789 int PreExecPos = std::numeric_limits<int>::max();
1790 int PostExecPos = std::numeric_limits<int>::max();
1791
1792 for (auto Entry : State.DefPos) {
1793 int DefVALUs = Entry.second;
1794 if (DefVALUs != std::numeric_limits<int>::max()) {
1795 if (DefVALUs >= State.ExecPos)
1796 PreExecPos = std::min(PreExecPos, DefVALUs);
1797 else
1798 PostExecPos = std::min(PostExecPos, DefVALUs);
1799 }
1800 }
1801
 1802 // Need a VALU write after the exec change
1803 if (PostExecPos == std::numeric_limits<int>::max())
1804 return NoHazardFound;
1805
1806 // Too many VALUs in intv3?
1807 int Intv3VALUs = PostExecPos;
1808 if (Intv3VALUs > Intv3MaxVALUs)
1809 return HazardExpired;
1810
1811 // Too many VALUs in intv2?
1812 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1813 if (Intv2VALUs > Intv1plus2MaxVALUs)
1814 return HazardExpired;
1815
 1816 // Need a VALU write before the exec change
1817 if (PreExecPos == std::numeric_limits<int>::max())
1818 return NoHazardFound;
1819
1820 // Too many VALUs in intv1?
1821 int Intv1VALUs = PreExecPos - State.ExecPos;
1822 if (Intv1VALUs > Intv1plus2MaxVALUs)
1823 return HazardExpired;
1824
1825 // Too many VALUs in intv1 + intv2
1826 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1827 return HazardExpired;
1828
1829 return HazardFound;
1830 };
1831 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1833 State.VALUs += 1;
1834 };
1835
1836 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1837 std::next(MI->getReverseIterator())))
1838 return false;
1839
1840 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1841 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1843
1844 return true;
1845}
1846
1847bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1848 if (!ST.hasVALUTransUseHazard())
1849 return false;
1850 assert(!ST.hasExtendedWaitCounts());
1851
1852 if (!SIInstrInfo::isVALU(*MI))
1853 return false;
1854
1855 SmallSet<Register, 4> SrcVGPRs;
1856
1857 for (const MachineOperand &Use : MI->explicit_uses()) {
1858 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1859 SrcVGPRs.insert(Use.getReg());
1860 }
1861
1862 // Look for the following pattern:
1863 // Va <- TRANS VALU
1864 // intv
1865 // MI Va (WaitState = 0)
1866 //
1867 // Where:
1868 // intv <= 5 VALUs / 1 TRANS
1869 //
1870 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
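// For illustration, a minimal hypothetical sequence (intv = 0 VALUs / 0 TRANS):
//   v_exp_f32 v0, v1        ; Va <- TRANS VALU
//   v_add_f32 v2, v0, v3    ; MI reads Va, so a wait on va_vdst is required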
1871
1872 const int IntvMaxVALUs = 5;
1873 const int IntvMaxTRANS = 1;
1874
1875 struct StateType {
1876 int VALUs = 0;
1877 int TRANS = 0;
1878
1879 static unsigned getHashValue(const StateType &State) {
1880 return hash_combine(State.VALUs, State.TRANS);
1881 }
1882 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1883 return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
1884 }
1885 };
1886
1887 StateType State;
1888
 1889 // This lambda combines expiry testing with the full hazard detection.
1890 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1891 // Too many VALU states have passed
1892 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1893 return HazardExpired;
1894
1895 // Instructions which cause va_vdst==0 expire hazard
1898 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1899 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1900 return HazardExpired;
1901
 1902 // Track register writes
1903 if (SIInstrInfo::isTRANS(I)) {
1904 for (Register Src : SrcVGPRs) {
1905 if (I.modifiesRegister(Src, &TRI)) {
1906 return HazardFound;
1907 }
1908 }
1909 }
1910
1911 return NoHazardFound;
1912 };
1913 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1915 State.VALUs += 1;
1917 State.TRANS += 1;
1918 };
1919
1920 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1921 std::next(MI->getReverseIterator())))
1922 return false;
1923
 1924 // Hazard is observed - insert a wait on the va_vdst counter to ensure the
 1925 // hazard is avoided.
1926 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1927 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1929
1930 return true;
1931}
1932
1933bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
1934 if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled.
1936 return false;
1937
1938 const SIInstrInfo *TII = ST.getInstrInfo();
1939 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1940
1941 auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1942 if (!SIInstrInfo::isTRANS(I))
1943 return false;
1944
1945 // RAW: Trans(I) writes, VALU(MI) reads.
1946 Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1947 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
1948 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
1949 return true;
1950 }
1951
1952 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
1953 if (!ValuDst || !ValuDst->isReg())
1954 return false;
1955
1956 // WAR: Trans(I) reads, VALU(MI) writes.
1957 Register ValuDef = ValuDst->getReg();
1958 for (const MachineOperand &TransUse : I.explicit_uses()) {
1959 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
1960 return true;
1961 }
1962
1963 return false;
1964 };
1965
1966 auto IsExpiredFn = [](const MachineInstr &I, int) {
1967 return SIInstrInfo::isVALU(I);
1968 };
1969
1970 const int HasVALU = std::numeric_limits<int>::max();
1971 if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
1972 return false;
1973
1974 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1975 return true;
1976}
1977
1978bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1980 return false;
1981
1982 const SIInstrInfo *TII = ST.getInstrInfo();
1983 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1984
1985 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1987 return false;
1988
1989 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
1990 // with the dest(matrix D) of the previous wmma.
1991 const Register CurSrc0Reg =
1992 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1993 const Register CurSrc1Reg =
1994 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1995
1996 const Register PrevDstReg =
1997 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1998
1999 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2000 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2001 return true;
2002 }
2003
2004 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
2005 // but Index can't overlap with PrevDstReg.
2006 if (AMDGPU::isGFX12Plus(ST)) {
2007 if (SIInstrInfo::isSWMMAC(*MI)) {
2008 const Register CurIndex =
2009 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2010 if (TRI->regsOverlap(PrevDstReg, CurIndex))
2011 return true;
2012 }
2013 return false;
2014 }
2015
2016 return false;
2017 };
2018
2019 auto IsExpiredFn = [](const MachineInstr &I, int) {
2020 return SIInstrInfo::isVALU(I);
2021 };
2022
2023 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2024 std::numeric_limits<int>::max())
2025 return false;
2026
2027 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2028
2029 return true;
2030}
2031
2034 !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else?
2035}
2036
2038 const SIInstrInfo *TII, unsigned Latency,
2039 unsigned Category) {
2040 assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
2041 "Handle me if the xdl wmma instruction latency changes");
2042
2043 switch (Category) {
2044 case 0: // Dense WMMA Instructions:
2045 // WMMA_*F16, WMMA_*BF16
2046 // WMMA_*FP8FP8
2047 // WMMA_*FP8BF8
2048 // WMMA_*BF8FP8
2049 // WMMA_*BF8BF8
2050 // WMMA_*F8F6F4 if SRCA & SRCB != F8
2051 return Latency == 8 && SIInstrInfo::isWMMA(MI);
2052
2053 case 1: // Dense WMMA Instructions:
2054 // WMMA_IU8
2055 // WMMA_IU4
2056 // WMMA_*F8F6F4 if SRCA OR SRCB == F8
2057 return Latency == 16 && SIInstrInfo::isWMMA(MI);
2058
2059 case 2: // Dense SWMMAC Instructions
2060 // SWMMAC_*F16, SWMMAC_*BF16,
2061 // SWMMAC_*FP8FP8
2062 // SWMMAC_*BF8FP8
2063 // SWMMAC_*FP8BF8
2064 // SWMMAC_*BF8BF8
2065 return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
2066
2067 case 3: // Sparse WMMA Instructions:
2068 // SWMMAC_IU8
2069 // SWMMAC_IU4
2070 return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
2071 default:
2072 break;
2073 } // end switch.
2074
2075 return false;
2076}
2077
2078int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
2079 if (!AMDGPU::isGFX1250(ST))
2080 return 0;
2081
2082 const SIInstrInfo *TII = ST.getInstrInfo();
2083 if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
2084 return 0;
2085
2086 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2087
 2088 // WaitStates here is the number of V_NOPs or unrelated VALU instructions that
 2089 // must be in between the first WMMA and the second instruction to cover the
 2090 // hazard (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the
 2091 // second is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data
 2092 // hazards" for the numbers, which depend on the category of the first WMMA.
2093 const int WMMAWaitStates[] = {5, 9, 3, 5};
2094 const int VALUWaitStates[] = {4, 8, 2, 4};
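// For example (hypothetical): if the first WMMA is category 0 (a latency-8 XDL
// WMMA such as a WMMA_*F16 variant), a dependent WMMA that follows needs
// WMMAWaitStates[0] = 5 intervening VALU instructions (or V_NOPs), while a
// dependent co-executable VALU needs VALUWaitStates[0] = 4.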
2095 unsigned Category = 0;
2096
2097 auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
2098 if (!TII->isXDLWMMA(I))
2099 return false;
2100
2101 unsigned Latency = TSchedModel.computeInstrLatency(&I);
2102 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2103 return false;
2104
2105 Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2106 Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
2107 Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
2108
 2109 // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2110 if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
2111 return true;
2112
2113 if (SIInstrInfo::isSWMMAC(*MI)) {
2114 Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2115 if (TRI->regsOverlap(D0, Idx1))
2116 return true;
2117 }
2118
2119 return false;
2120 };
2121
2122 auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
2123 if (!TII->isXDLWMMA(I))
2124 return false;
2125
2126 unsigned Latency = TSchedModel.computeInstrLatency(&I);
2127 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2128 return false;
2129
2130 // WMMA writes, VALU reads.
2131 Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2132 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
2133 if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
2134 return true;
2135 }
2136
2137 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
2138 if (!ValuDst || !ValuDst->isReg())
2139 return false;
2140 Register D1 = ValuDst->getReg();
2141
2142 // WMMA writes, VALU writes.
2143 if (TRI->regsOverlap(D0, D1))
2144 return true;
2145
2146 // WMMA reads, VALU writes.
2147 Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
2148 Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
2149 if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
2150 return true;
2151
2152 if (SIInstrInfo::isSWMMAC(I)) {
2153 Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
2154 if (TRI->regsOverlap(D1, Idx0))
2155 return true;
2156 }
2157
2158 return false;
2159 };
2160
2161 int Limit = 0;
2162
2163 auto GetWaitStatesFn = [](const MachineInstr &I) {
2164 return SIInstrInfo::isVALU(I) ? 1 : 0;
2165 };
2166
2167 int WaitStatesNeeded = -1;
2168 if (TII->isXDLWMMA(*MI)) {
2169 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2170 Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
2171 // 'getWaitStatesSince' returns the number of VALUs in between if hazard
2172 // exists, and INT_MAX if there is no hazard. As a result, a negative
2173 // WaitStatesNeeded here means no hazard, and we will continue to search
2174 // for other categories.
2175 WaitStatesNeeded =
2176 Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);
2177 }
2178 } else { // Must be a co-executable VALU.
2179 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2180 Limit = VALUWaitStates[Category]; // for IsExpiredFn.
2181 // 'getWaitStatesSince' returns the number of VALUs in between if hazard
2182 // exists, and INT_MAX if there is no hazard. As a result, a negative
2183 // WaitStatesNeeded here means no hazard, and we will continue to search
2184 // for other categories.
2185 WaitStatesNeeded =
2186 Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);
2187 }
2188 }
2189
2190 return WaitStatesNeeded;
2191}
2192
2193bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
2194 if (!ST.hasShift64HighRegBug())
2195 return false;
2196 assert(!ST.hasExtendedWaitCounts());
2197
2198 switch (MI->getOpcode()) {
2199 default:
2200 return false;
2201 case AMDGPU::V_LSHLREV_B64_e64:
2202 case AMDGPU::V_LSHRREV_B64_e64:
2203 case AMDGPU::V_ASHRREV_I64_e64:
2204 break;
2205 }
2206
2207 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
2208 if (!Amt->isReg())
2209 return false;
2210
2211 Register AmtReg = Amt->getReg();
2212 const MachineRegisterInfo &MRI = MF.getRegInfo();
 2214 // Check if this is the last VGPR in the allocation block.
2214 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2215 return false;
2216
2217 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
2218 return false;
2219
2220 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
2221 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
2222 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
2223 bool Overlapped = OverlappedSrc || OverlappedDst;
2224
2225 assert(!OverlappedDst || !OverlappedSrc ||
2226 Src1->getReg() == MI->getOperand(0).getReg());
2227 assert(ST.needsAlignedVGPRs());
2228 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2229
2230 Register NewReg;
2231 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2232 : AMDGPU::VGPR_32RegClass) {
2233 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
2234 NewReg = Reg;
2235 break;
2236 }
2237 }
2238
2239 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2240 : NewReg;
2241 Register NewAmtLo;
2242
2243 if (Overlapped)
2244 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2245
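// Illustrative (hypothetical) rewrite for a v_lshlrev_b64 whose shift amount
// lives in v7 (the last VGPR of an allocation block), with vN standing for the
// scratch register selected above:
//   s_waitcnt 0
//   v_swap_b32 vN, v7
//   v_lshlrev_b64 v[0:1], vN, v[2:3]   ; amount operand retargeted to vN
//   v_swap_b32 v7, vN                  ; restore the original values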
2246 DebugLoc DL = MI->getDebugLoc();
2247 MachineBasicBlock *MBB = MI->getParent();
 2248 // Insert a full wait count because the found register might be pending a wait.
2249 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
2250 .addImm(0);
2251
2252 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
2253 if (Overlapped)
2254 runOnInstruction(
2255 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
2256 .addDef(AmtReg - 1)
2257 .addReg(AmtReg - 1, RegState::Undef)
2258 .addReg(NewAmtLo, RegState::Undef));
2259 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
2260 .addDef(AmtReg)
2261 .addReg(AmtReg, RegState::Undef)
2262 .addReg(NewAmt, RegState::Undef));
2263
2264 // Instructions emitted after the current instruction will be processed by the
2265 // parent loop of the hazard recognizer in a natural way.
2266 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2267 AmtReg)
2268 .addDef(NewAmt)
2269 .addReg(NewAmt)
2270 .addReg(AmtReg);
2271 if (Overlapped)
2272 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2273 AmtReg - 1)
2274 .addDef(NewAmtLo)
2275 .addReg(NewAmtLo)
2276 .addReg(AmtReg - 1);
2277
 2278 // Re-running the hazard recognizer on the modified instruction is not needed:
 2279 // the inserted V_SWAP_B32s have already both read and written the new
 2280 // registers, so hazards on these registers have already been handled.
2281 Amt->setReg(NewAmt);
2282 Amt->setIsKill(false);
2283 // We do not update liveness, so verifier may see it as undef.
2284 Amt->setIsUndef();
2285 if (OverlappedDst)
2286 MI->getOperand(0).setReg(NewReg);
2287 if (OverlappedSrc) {
2288 Src1->setReg(NewReg);
2289 Src1->setIsKill(false);
2290 Src1->setIsUndef();
2291 }
2292
2293 return true;
2294}
2295
2296int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
2297 int NSAtoVMEMWaitStates = 1;
2298
2299 if (!ST.hasNSAtoVMEMBug())
2300 return 0;
2301
2303 return 0;
2304
2305 const SIInstrInfo *TII = ST.getInstrInfo();
2306 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2307 if (!Offset || (Offset->getImm() & 6) == 0)
2308 return 0;
2309
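// For example (hypothetical): an image_sample using the GFX10 NSA encoding
// (instruction size >= 16 bytes) followed within one wait state by a buffer
// access whose immediate offset has bit 1 or bit 2 set requires one
// intervening wait state.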
2310 auto IsHazardFn = [TII](const MachineInstr &I) {
2311 if (!SIInstrInfo::isMIMG(I))
2312 return false;
2313 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2314 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2315 TII->getInstSizeInBytes(I) >= 16;
2316 };
2317
2318 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2319}
2320
2321int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
2322 int FPAtomicToDenormModeWaitStates = 3;
2323
2324 if (!ST.hasFPAtomicToDenormModeHazard())
2325 return 0;
2326 assert(!ST.hasExtendedWaitCounts());
2327
2328 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2329 return 0;
2330
2331 auto IsHazardFn = [](const MachineInstr &I) {
2332 if (!SIInstrInfo::isVMEM(I))
2333 return false;
2334 return SIInstrInfo::isFPAtomic(I);
2335 };
2336
2337 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2338 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2339 return true;
2340
2341 return SIInstrInfo::isWaitcnt(MI.getOpcode());
2342 };
2343
2344 return FPAtomicToDenormModeWaitStates -
2345 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2346}
2347
2348int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2350
2351 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2352}
2353
2354int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2355 // Early exit if no padding is requested.
2356 if (MFMAPaddingRatio == 0)
2357 return 0;
2358
2359 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2360 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2361 return 0;
2362
2363 int NeighborMFMALatency = 0;
2364 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2365 this](const MachineInstr &MI) {
2366 if (!SIInstrInfo::isMFMA(MI))
2367 return false;
2368
2369 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2370 return true;
2371 };
2372
2373 const int MaxMFMAPipelineWaitStates = 16;
2374 int WaitStatesSinceNeighborMFMA =
2375 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2376
2377 int NeighborMFMAPaddingNeeded =
2378 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2379 WaitStatesSinceNeighborMFMA;
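// For example (hypothetical): with amdgpu-mfma-padding-ratio=50 and a
// neighboring MFMA latency of 16, the target padding is 16 * 50 / 100 = 8 wait
// states, minus however many have already elapsed since that MFMA.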
2380
2381 return std::max(0, NeighborMFMAPaddingNeeded);
2382}
2383
2384int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2385 int WaitStatesNeeded = 0;
2386 unsigned Opc = MI->getOpcode();
2387
2388 auto IsVALUFn = [](const MachineInstr &MI) {
2389 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2390 };
2391
2392 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2393 const int LegacyVALUWritesVGPRWaitStates = 2;
2394 const int VALUWritesExecWaitStates = 4;
2395 const int MaxWaitStates = 4;
2396
2397 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2398 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2399 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2400
2401 if (WaitStatesNeeded < MaxWaitStates) {
2402 for (const MachineOperand &Use : MI->explicit_uses()) {
2403 const int MaxWaitStates = 2;
2404
2405 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2406 continue;
2407
2408 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2409 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2410 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2411
2412 if (WaitStatesNeeded == MaxWaitStates)
2413 break;
2414 }
2415 }
2416 }
2417
2418 for (const MachineOperand &Op : MI->explicit_operands()) {
2419 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2420 continue;
2421
2422 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2423 continue;
2424
2425 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2426 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2427 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2428 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2429 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2430 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2431 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2432 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2433 const int MaxWaitStates = 18;
2434 Register Reg = Op.getReg();
2435 unsigned HazardDefLatency = 0;
2436
2437 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2438 this](const MachineInstr &MI) {
2439 if (!SIInstrInfo::isMFMA(MI))
2440 return false;
2441 Register DstReg = MI.getOperand(0).getReg();
2442 if (DstReg == Reg)
2443 return false;
2444 HazardDefLatency =
2445 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2446 return TRI.regsOverlap(DstReg, Reg);
2447 };
2448
2449 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2450 MaxWaitStates);
2451 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2452 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2453 int OpNo = Op.getOperandNo();
2454 if (OpNo == SrcCIdx) {
2455 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2456 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2457 switch (HazardDefLatency) {
2458 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2459 break;
2460 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2461 break;
2462 case 16: [[fallthrough]];
2463 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2464 break;
2465 }
2466 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2467 switch (HazardDefLatency) {
2468 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2469 break;
2470 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2471 break;
2472 case 16: [[fallthrough]];
2473 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2474 break;
2475 }
2476 }
2477
2478 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2479 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2480
2481 if (WaitStatesNeeded == MaxWaitStates)
2482 return WaitStatesNeeded; // Early exit.
2483
2484 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2485 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2486 return false;
2487 Register DstReg = MI.getOperand(0).getReg();
2488 return TRI.regsOverlap(Reg, DstReg);
2489 };
2490
2491 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2492 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2493 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2494 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2495 if (OpNo == SrcCIdx)
2496 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2497 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2498 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2499
2500 WaitStatesNeededForUse = NeedWaitStates -
2501 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2502 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2503
2504 if (WaitStatesNeeded == MaxWaitStates)
2505 return WaitStatesNeeded; // Early exit.
2506 }
2507
2508 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2509 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2510 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2511 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2512 const int MaxWaitStates = 13;
2513 Register DstReg = MI->getOperand(0).getReg();
2514 unsigned HazardDefLatency = 0;
2515
2516 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2517 this](const MachineInstr &MI) {
2518 if (!SIInstrInfo::isMFMA(MI))
2519 return false;
2520 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2521 HazardDefLatency =
2522 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2523 return TRI.regsOverlap(Reg, DstReg);
2524 };
2525
2526 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2527 int NeedWaitStates;
2528 switch (HazardDefLatency) {
2529 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2530 break;
2531 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2532 break;
2533 case 16: [[fallthrough]];
2534 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2535 break;
2536 }
2537
2538 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2539 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2540 }
2541
2542 // Pad neighboring MFMA with noops for better inter-wave performance.
2543 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2544
2545 return WaitStatesNeeded;
2546}
2547
2548static int
2550 bool IsGFX950) {
2551 // xdl def cycles | gfx940 | gfx950
2552 // 2 pass | 3 4
2553 // 4 pass | 5 6
2554 // 8 pass | 9 10
2555 // 16 pass | 17 18
2556 return NumPasses + 1 + IsGFX950;
2557}
2558
2559static int
2561 bool IsGFX950) {
2562 // xdl def cycles | gfx940 | gfx950
2563 // 2 pass | 3 3
2564 // 4 pass | 5 6
2565 // 8 pass | 9 10
2566 // 16 pass | 17 18
2567 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2568}
2569
2570static int
2572 // 2 pass -> 2
2573 // 4 pass -> 4
2574 // 8 pass -> 8
2575 // 16 pass -> 16
2576 return NumPasses;
2577}
2578
2579static int
2581 // 2 pass -> 4
2582 // 4 pass -> 6
2583 // 8 pass -> 10
2584 // 16 pass -> 18
2585 return NumPasses + 2;
2586}
2587
2589 bool IsGFX950) {
2590 // xdl def cycles | gfx942 | gfx950
2591 // 2 pass | 5 5
2592 // 4 pass | 7 8
2593 // 8 pass | 11 12
2594 // 16 pass | 19 20
2595 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2596}
2597
2598int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2599 int WaitStatesNeeded = 0;
2600 unsigned Opc = MI->getOpcode();
2601
2602 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2604 };
2605
2606 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2609 };
2610
2611 if (!SIInstrInfo::isMFMA(*MI))
2612 return WaitStatesNeeded;
2613
2614 const int VALUWritesExecWaitStates = 4;
2615 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2616 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2617 VALUWritesExecWaitStates);
2618 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2619
2620 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2621
2622 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2623 for (const MachineOperand &Use : MI->explicit_uses()) {
2624 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2625 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2626 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2627 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2628 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2629 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2630 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2631 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2632 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2633 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2634 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2635 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2636 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2637 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2638 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2639 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2640 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2641 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2642 const int MaxWaitStates = 19;
2643
2644 if (!Use.isReg())
2645 continue;
2646 Register Reg = Use.getReg();
2647 bool FullReg;
2648 const MachineInstr *MI1;
2649
2650 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2651 this](const MachineInstr &MI) {
2652 if (!SIInstrInfo::isMFMA(MI))
2653 return false;
2654 Register DstReg = MI.getOperand(0).getReg();
2655 FullReg = (DstReg == Reg);
2656 MI1 = &MI;
2657 return TRI.regsOverlap(DstReg, Reg);
2658 };
2659
2660 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2661 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2662 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2663
2664 int NumWaitStates =
2665 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2666 if (NumWaitStates == std::numeric_limits<int>::max())
2667 continue;
2668
2669 int OpNo = Use.getOperandNo();
2670 unsigned Opc1 = MI1->getOpcode();
2671 int NeedWaitStates = 0;
2672 if (OpNo == SrcCIdx) {
2673 if (!SIInstrInfo::isDGEMM(Opc) &&
2674 (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opc1))) {
2675 NeedWaitStates = 0;
2676 } else if (FullReg) {
2677 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2678 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2679 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2680 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2681 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2682 else if (ST.hasGFX940Insts() &&
2683 TSchedModel.computeInstrLatency(MI1) == 2)
2684 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2685 } else {
2686 switch (Opc1) {
2687 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2688 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2689 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2690 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2691 if (!TII.isXDL(*MI))
2692 NeedWaitStates =
2693 ST.hasGFX950Insts()
2694 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2695 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2696 break;
2697 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2698 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2699 if (!TII.isXDL(*MI))
2700 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2701 break;
2702 default:
2703 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2704 if (ST.hasGFX940Insts()) {
2705 if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
2706 break;
2707
2708 NeedWaitStates =
2709 TII.isXDL(*MI1)
2710 ? (TII.isXDL(*MI)
2712 NumPasses, ST.hasGFX950Insts())
2714 NumPasses, ST.hasGFX950Insts()))
2716 NumPasses);
2717 break;
2718 }
2719
2720 switch (NumPasses) {
2721 case 2:
2722 NeedWaitStates =
2724 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2725 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2726 break;
2727 case 8:
2728 NeedWaitStates =
2730 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2731 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2732 break;
2733 case 16:
2734 NeedWaitStates =
2736 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2737 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2738 break;
2739 default:
2740 llvm_unreachable("unexpected number of passes");
2741 }
2742 }
2743 }
2744 } else {
2745 switch (Opc1) {
2746 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2747 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2748 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2749 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2750 NeedWaitStates =
2751 ST.hasGFX950Insts()
2752 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2753 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2754 break;
2755 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2756 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2757 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2758 break;
2759 default:
2760 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2761
2762 if (ST.hasGFX940Insts()) {
2763 NeedWaitStates =
2764 TII.isXDL(*MI1)
2766 NumPasses, ST.hasGFX950Insts())
2768 NumPasses);
2769 break;
2770 }
2771
2772 switch (NumPasses) {
2773 case 2:
2774 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2775 break;
2776 case 4:
2777 llvm_unreachable("unexpected number of passes for mfma");
2778 case 8:
2779 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2780 break;
2781 case 16:
2782 default:
2783 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2784 }
2785 }
2786 }
2787 if (WaitStatesNeeded >= NeedWaitStates)
2788 continue;
2789
2790 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2791 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2792
2793 if (WaitStatesNeeded == MaxWaitStates)
2794 break;
2795 }
2796
2797 // Pad neighboring MFMA with noops for better inter-wave performance.
2798 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2799
2800 return WaitStatesNeeded;
2801}
2802
2803int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2804 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2805 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2806 return 0;
2807
2808 int WaitStatesNeeded = 0;
2809
2810 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2811 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2812 };
2813
2814 for (const MachineOperand &Op : MI->explicit_uses()) {
2815 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2816 continue;
2817
2818 Register Reg = Op.getReg();
2819
2820 const int AccVgprReadLdStWaitStates = 2;
2821 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2822 const int MaxWaitStates = 2;
2823
2824 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2825 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2826 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2827
2828 if (WaitStatesNeeded == MaxWaitStates)
2829 return WaitStatesNeeded; // Early exit.
2830
2831 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2832 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2833 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2834 return false;
2835 auto IsVALUFn = [](const MachineInstr &MI) {
2837 };
2838 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2839 std::numeric_limits<int>::max();
2840 };
2841
2842 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2843 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2844 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2845 }
2846
2847 return WaitStatesNeeded;
2848}
2849
2850int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
2851 assert(!ST.hasVcmpxPermlaneHazard() &&
2852 "this is a different vcmpx+permlane hazard");
2853 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2854 const SIInstrInfo *TII = ST.getInstrInfo();
2855
2856 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2857 return isVCmpXWritesExec(*TII, *TRI, MI);
2858 };
2859
2860 auto IsVALUFn = [](const MachineInstr &MI) {
2861 return SIInstrInfo::isVALU(MI);
2862 };
2863
2864 const int VCmpXWritesExecWaitStates = 4;
2865 const int VALUWritesVDstWaitStates = 2;
2866 int WaitStatesNeeded = 0;
2867
2868 for (const MachineOperand &Op : MI->explicit_uses()) {
2869 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2870 continue;
2871 Register Reg = Op.getReg();
2872
2873 int WaitStatesSinceDef =
2874 VALUWritesVDstWaitStates -
2875 getWaitStatesSinceDef(Reg, IsVALUFn,
2876 /*MaxWaitStates=*/VALUWritesVDstWaitStates);
2877 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2878 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2879 break;
2880 }
2881
2882 int VCmpXHazardWaits =
2883 VCmpXWritesExecWaitStates -
2884 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2885
2886 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2887 return WaitStatesNeeded;
2888}
2889
2891 // 2 pass -> 4
2892 // 4 pass -> 6
2893 // 8 pass -> 10
2894 // 16 pass -> 18
2895 return NumPasses + 2;
2896}
2897
2899 bool IsGFX950) {
2900 // xdl def cycles | gfx942 | gfx950
2901 // 2 pass | 5 5
2902 // 4 pass | 7 8
2903 // 8 pass | 11 12
2904 // 16 pass | 19 20
2905 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2906}
2907
2909 bool IsGFX950) {
2910 // xdl def cycles | gfx942 | gfx950
2911 // 2 pass | 5 5
2912 // 4 pass | 7 8
2913 // 8 pass | 11 12
2914 // 16 pass | 19 20
2915 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2916}
2917
2919 // 2 pass -> 4
2920 // 4 pass -> 6
2921 // 8 pass -> 10
2922 // 16 pass -> 18
2923 return NumPasses + 2;
2924}
2925
2926int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2927 if (!ST.hasGFX90AInsts())
2928 return 0;
2929
2930 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2931 return SIInstrInfo::isDGEMM(MI.getOpcode());
2932 };
2933
2934 // This is checked in checkMAIHazards90A()
2935 if (SIInstrInfo::isMFMA(*MI))
2936 return 0;
2937
2938 const MachineRegisterInfo &MRI = MF.getRegInfo();
2939
2940 int WaitStatesNeeded = 0;
2941
2942 bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
2943 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2944 bool IsVALU = SIInstrInfo::isVALU(*MI);
2945
2946 const MachineInstr *MFMA = nullptr;
2947 unsigned Reg;
2948 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2949 if (!SIInstrInfo::isMFMA(MI) ||
2950 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2951 return false;
2952 MFMA = &MI;
2953 return true;
2954 };
2955
2956 const MachineInstr *DOT = nullptr;
2957 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2958 if (!SIInstrInfo::isDOT(MI) ||
2959 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2960 return false;
2961 DOT = &MI;
2962 return true;
2963 };
2964
2965 bool DGEMMAfterVALUWrite = false;
2966 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2967 // Found DGEMM on reverse traversal to def.
2968 if (SIInstrInfo::isDGEMM(MI.getOpcode()))
2969 DGEMMAfterVALUWrite = true;
2970
 2971 // Only a hazard if the register is defined by a VALU and a DGEMM is found
 2972 // after the def.
2973 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2974 return false;
2975
2976 return true;
2977 };
2978
2979 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2980 AMDGPU::OpName::src2);
2981
2982 if (IsMemOrExport || IsVALU) {
2983 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2984 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2985 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2986 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2987 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2988 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2989 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2990 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2991 const int DotWriteSameDotReadSrcAB = 3;
2992 const int DotWriteDifferentVALURead = 3;
2993 const int DMFMABetweenVALUWriteVMEMRead = 2;
2994 const int MaxWaitStates = 19;
2995
2996 for (const MachineOperand &Use : MI->explicit_uses()) {
2997 if (!Use.isReg())
2998 continue;
2999 Reg = Use.getReg();
3000
3001 DOT = nullptr;
3002 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3003 MaxWaitStates);
3004 if (DOT) {
3005 int NeedWaitStates = 0;
3006 if (DOT->getOpcode() == MI->getOpcode()) {
3007 if (&Use - &MI->getOperand(0) != SrcCIdx)
3008 NeedWaitStates = DotWriteSameDotReadSrcAB;
3009 } else {
3010 NeedWaitStates = DotWriteDifferentVALURead;
3011 }
3012
3013 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3014 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3015 }
3016
 3017 // Workaround for a HW data hazard bug observed only in GFX90A. When there
 3018 // is a DGEMM instruction in-between a VALU and a VMEM instruction, it
 3019 // causes the SQ to incorrectly omit the two wait states between the two
 3020 // instructions that are needed to avoid the data hazard.
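// Hypothetical GFX90A example of the affected sequence:
//   v_mov_b32 v0, v2                ; VALU writes v0
//   v_mfma_f64_16x16x4f64 ...       ; DGEMM in between
//   flat_load_dword v3, v[0:1]      ; VMEM reads v0, two wait states required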
3021 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3022 DGEMMAfterVALUWrite = false;
3023 if (TRI.isVectorRegister(MRI, Reg)) {
3024 int WaitStatesNeededForUse =
3025 DMFMABetweenVALUWriteVMEMRead -
3026 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
3027 DMFMABetweenVALUWriteVMEMRead);
3028
3029 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3030 }
3031 }
3032
3033 MFMA = nullptr;
3034 WaitStatesSinceDef =
3035 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3036 if (!MFMA)
3037 continue;
3038
3039 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3040 int NumPasses = HazardDefLatency;
3041 int NeedWaitStates = MaxWaitStates;
3042
3043 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3044 switch (HazardDefLatency) {
3045 case 4:
3046 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3047 : DMFMA4x4WriteVgprVALUReadWaitStates;
3048 break;
3049 case 8:
3050 case 16:
3051 NeedWaitStates =
3052 IsMemOrExport
3053 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3054 : (ST.hasGFX950Insts()
3055 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3056 : DMFMA16x16WriteVgprVALUReadWaitStates);
3057 break;
3058 default:
3059 llvm_unreachable("unexpected dgemm");
3060 }
3061 } else if (ST.hasGFX940Insts()) {
3062 NeedWaitStates =
3063 TII.isXDL(*MFMA)
3065 NumPasses, ST.hasGFX950Insts())
3067 NumPasses);
3068 } else {
3069 switch (HazardDefLatency) {
3070 case 2:
3071 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3072 break;
3073 case 8:
3074 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3075 break;
3076 case 16:
3077 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3078 break;
3079 default:
3080 llvm_unreachable("unexpected number of passes for mfma");
3081 }
3082 }
3083
3084 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3085 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3086
3087 if (WaitStatesNeeded == MaxWaitStates)
3088 break;
3089 }
3090 }
3091
3092 unsigned Opc = MI->getOpcode();
3093 const int DMFMAToFMA64WaitStates = 2;
3094 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
3095 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
3096 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3097 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3098 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3099 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3100 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3101 }
3102
3103 if (!IsVALU && !IsMemOrExport)
3104 return WaitStatesNeeded;
3105
3106 for (const MachineOperand &Def : MI->defs()) {
3107 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3108 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3109 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3110 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3111 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3112 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3113 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3114 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3115 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3116 const int DotWriteDifferentVALUWrite = 3;
3117 const int MaxWaitStates = 19;
3118 const int MaxWarWaitStates = 15;
3119
3120 Reg = Def.getReg();
3121
3122 DOT = nullptr;
3123 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3124 MaxWaitStates);
3125 if (DOT && DOT->getOpcode() != MI->getOpcode())
3126 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3127 WaitStatesSinceDef);
3128
3129 MFMA = nullptr;
3130 WaitStatesSinceDef =
3131 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3132 if (MFMA) {
3133 int NeedWaitStates = MaxWaitStates;
3134 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
3135
3136 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3137 switch (NumPasses) {
3138 case 4:
3139 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3140 break;
3141 case 8:
3142 case 16:
3143 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3144 break;
3145 default:
3146 llvm_unreachable("unexpected number of cycles for dgemm");
3147 }
3148 } else if (ST.hasGFX940Insts()) {
3149 NeedWaitStates =
3150 TII.isXDL(*MFMA)
3152 NumPasses, ST.hasGFX950Insts())
3154 } else {
3155 switch (NumPasses) {
3156 case 2:
3157 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3158 break;
3159 case 8:
3160 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3161 break;
3162 case 16:
3163 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3164 break;
3165 default:
3166 llvm_unreachable("Unexpected number of passes for mfma");
3167 }
3168 }
3169
3170 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3171 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3172
3173 if (WaitStatesNeeded == MaxWaitStates)
3174 break;
3175 }
3176
3177 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3178 if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
3179 !MI.readsRegister(Reg, &TRI))
3180 return false;
3181
3182 if (ST.hasGFX940Insts() && !TII.isXDL(MI))
3183 return false;
3184
3185 const MachineOperand *SrcC =
3186 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
3187 assert(SrcC);
3188 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
3189 return false;
3190
3191 MFMA = &MI;
3192 return true;
3193 };
3194
3195 MFMA = nullptr;
3196 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3197 MaxWarWaitStates);
3198 if (!MFMA)
3199 continue;
3200
3201 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3202 int NeedWaitStates = MaxWaitStates;
3203 switch (HazardDefLatency) {
3204 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3205 break;
3206 case 4: assert(ST.hasGFX940Insts());
3207 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3208 break;
3209 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3210 break;
3211 case 16: [[fallthrough]];
3212 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3213 break;
3214 }
3215
3216 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3217 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3218 }
3219
3220 return WaitStatesNeeded;
3221}
3222
3224 if (!SU->isInstr())
3225 return false;
3226
3227 const MachineInstr *MAI = nullptr;
3228
3229 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3230 MAI = nullptr;
3232 MAI = &MI;
3233 return MAI != nullptr;
3234 };
3235
3236 MachineInstr *MI = SU->getInstr();
3237 if (IsMFMAFn(*MI)) {
3238 int W = getWaitStatesSince(IsMFMAFn, 16);
3239 if (MAI)
3240 return W < (int)TSchedModel.computeInstrLatency(MAI);
3241 }
3242
3243 return false;
3244}
3245
3246// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3247// insertion of a new instruction.
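// For example (hypothetical symbol and offsets): inserting a 4-byte
// S_WAITCNT_DEPCTR right after S_GETPC_B64 shifts every later PC-relative
// reference in the bundle by 4 bytes:
//   s_getpc_b64 s[0:1]
//   s_waitcnt_depctr ...                  ; newly inserted, 4 bytes
//   s_add_u32  s0, s0, target@rel32@lo+8  ; was ...+4
//   s_addc_u32 s1, s1, target@rel32@hi+16 ; was ...+12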
3248static void updateGetPCBundle(MachineInstr *NewMI) {
3249 if (!NewMI->isBundled())
3250 return;
3251
3252 // Find start of bundle.
3253 auto I = NewMI->getIterator();
3254 while (I->isBundledWithPred())
3255 I--;
3256 if (I->isBundle())
3257 I++;
3258
3259 // Bail if this is not an S_GETPC bundle.
3260 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3261 return;
3262
3263 // Update offsets of any references in the bundle.
3264 const unsigned NewBytes = 4;
3265 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3266 "Unexpected instruction insertion in bundle");
3267 auto NextMI = std::next(NewMI->getIterator());
3268 auto End = NewMI->getParent()->end();
3269 while (NextMI != End && NextMI->isBundledWithPred()) {
3270 for (auto &Operand : NextMI->operands()) {
3271 if (Operand.isGlobal())
3272 Operand.setOffset(Operand.getOffset() + NewBytes);
3273 }
3274 NextMI++;
3275 }
3276}
3277
3278bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
3279 if (!ST.hasVALUMaskWriteHazard())
3280 return false;
3281 assert(!ST.hasExtendedWaitCounts());
3282
3283 if (!ST.isWave64())
3284 return false;
3285
3286 const bool IsSALU = SIInstrInfo::isSALU(*MI);
3287 const bool IsVALU = SIInstrInfo::isVALU(*MI);
3288 if (!IsSALU && !IsVALU)
3289 return false;
3290
3291 // The hazard sequence is three instructions:
3292 // 1. VALU reads SGPR as mask
3293 // 2. VALU/SALU writes SGPR
3294 // 3. VALU/SALU reads SGPR
3295 // The hazard can expire if the distance between 2 and 3 is sufficient,
3296 // or (2) is VALU and (3) is SALU.
3297 // In practice this happens <10% of the time, hence always assume the hazard
3298 // exists if (1) and (2) are present to avoid searching all SGPR reads.
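// A hypothetical wave64 instance of the sequence:
//   v_cndmask_b32_e64 v0, v1, v2, s[0:1]   ; (1) VALU reads s[0:1] as mask
//   s_mov_b64 s[0:1], exec                 ; (2) SALU writes s[0:1]  <- MI
//   s_and_b64 s[2:3], s[0:1], s[4:5]       ; (3) reads s[0:1]
// The fix below inserts an S_WAITCNT_DEPCTR with sa_sdst set to 0 after (2).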
3299
3300 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3301 const MachineRegisterInfo &MRI = MF.getRegInfo();
3302
3303 auto IgnoreableSGPR = [](const Register Reg) {
3304 switch (Reg) {
3305 case AMDGPU::EXEC:
3306 case AMDGPU::EXEC_LO:
3307 case AMDGPU::EXEC_HI:
3308 case AMDGPU::M0:
3309 case AMDGPU::SGPR_NULL:
3310 case AMDGPU::SGPR_NULL64:
3311 case AMDGPU::SCC:
3312 return true;
3313 default:
3314 return false;
3315 }
3316 };
3317 auto IsVCC = [](const Register Reg) {
3318 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
3319 };
3320
3321 struct StateType {
3322 SmallSet<Register, 2> HazardSGPRs;
3323
3324 static unsigned getHashValue(const StateType &State) {
3325 return hash_combine_range(State.HazardSGPRs);
3326 }
3327 static bool isEqual(const StateType &LHS, const StateType &RHS) {
3328 return LHS.HazardSGPRs == RHS.HazardSGPRs;
3329 }
3330 };
3331
3332 SmallVector<const MachineInstr *> WaitInstrs;
3333 bool HasSGPRRead = false;
3334 StateType InitialState;
3335
3336 // Look for SGPR write.
3337 MachineOperand *HazardDef = nullptr;
3338 for (MachineOperand &Op : MI->operands()) {
3339 if (!Op.isReg())
3340 continue;
3341 if (Op.isDef() && HazardDef)
3342 continue;
3343
3344 Register Reg = Op.getReg();
3345 if (IgnoreableSGPR(Reg))
3346 continue;
3347 if (!IsVCC(Reg)) {
3348 if (Op.isImplicit())
3349 continue;
3350 if (!TRI->isSGPRReg(MRI, Reg))
3351 continue;
3352 }
3353 // Also check for SGPR reads.
3354 if (Op.isUse()) {
3355 HasSGPRRead = true;
3356 continue;
3357 }
3358
3359 assert(!HazardDef);
3360 HazardDef = &Op;
3361 }
3362
3363 if (!HazardDef)
3364 return false;
3365
3366 // Setup to track writes to individual SGPRs
3367 const Register HazardReg = HazardDef->getReg();
3368 if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
3369 InitialState.HazardSGPRs.insert(HazardReg);
3370 } else {
3371 assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
3372 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
3373 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
3374 }
3375
3376 auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
3377 if (State.HazardSGPRs.empty())
3378 return HazardExpired;
3379
3380 switch (I.getOpcode()) {
3381 case AMDGPU::V_ADDC_U32_e32:
3382 case AMDGPU::V_ADDC_U32_dpp:
3383 case AMDGPU::V_CNDMASK_B16_t16_e32:
3384 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3385 case AMDGPU::V_CNDMASK_B16_t16_dpp:
3386 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3387 case AMDGPU::V_CNDMASK_B32_e32:
3388 case AMDGPU::V_CNDMASK_B32_dpp:
3389 case AMDGPU::V_DIV_FMAS_F32_e64:
3390 case AMDGPU::V_DIV_FMAS_F64_e64:
3391 case AMDGPU::V_SUBB_U32_e32:
3392 case AMDGPU::V_SUBB_U32_dpp:
3393 case AMDGPU::V_SUBBREV_U32_e32:
3394 case AMDGPU::V_SUBBREV_U32_dpp: {
3395 // These implicitly read VCC as mask source.
3396 return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
3397 }
3398 case AMDGPU::V_ADDC_U32_e64:
3399 case AMDGPU::V_ADDC_U32_e64_dpp:
3400 case AMDGPU::V_CNDMASK_B16_t16_e64:
3401 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3402 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3403 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3404 case AMDGPU::V_CNDMASK_B32_e64:
3405 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3406 case AMDGPU::V_SUBB_U32_e64:
3407 case AMDGPU::V_SUBB_U32_e64_dpp:
3408 case AMDGPU::V_SUBBREV_U32_e64:
3409 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3410 // Only check mask register overlaps.
3411 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
3412 assert(SSRCOp);
3413 bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
3414 return Result ? HazardFound : NoHazardFound;
3415 }
3416 default:
3417 return NoHazardFound;
3418 }
3419 };
3420
3421 const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
3423 0),
3424 0);
3425 auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
3426 switch (I.getOpcode()) {
3427 case AMDGPU::S_WAITCNT_DEPCTR:
 3428 // Record mergeable waits within a region of instructions free of SGPR reads.
3429 if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
3430 (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
3431 WaitInstrs.push_back(&I);
3432 break;
3433 default:
3434 // Update tracking of SGPR reads and writes.
3435 for (auto &Op : I.operands()) {
3436 if (!Op.isReg())
3437 continue;
3438
3439 Register Reg = Op.getReg();
3440 if (IgnoreableSGPR(Reg))
3441 continue;
3442 if (!IsVCC(Reg)) {
3443 if (Op.isImplicit())
3444 continue;
3445 if (!TRI->isSGPRReg(MRI, Reg))
3446 continue;
3447 }
3448 if (Op.isUse()) {
3449 HasSGPRRead = true;
3450 continue;
3451 }
3452
3453 // Stop tracking any SGPRs with writes on the basis that they will
3454 // already have an appropriate wait inserted afterwards.
3456 for (Register SGPR : State.HazardSGPRs) {
3457 if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
3458 Found.push_back(SGPR);
3459 }
3460 for (Register SGPR : Found)
3461 State.HazardSGPRs.erase(SGPR);
3462 }
3463 break;
3464 }
3465 };
3466
3467 // Check for hazard
3468 if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
3469 MI->getParent(),
3470 std::next(MI->getReverseIterator())))
3471 return false;
3472
3473 // Compute counter mask
3474 unsigned DepCtr =
3475 IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
3476 : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
3477 : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);
3478
3479 // Try to merge previous waits into this one for regions with no SGPR reads.
3480 if (!WaitInstrs.empty()) {
3481 // Note: WaitInstrs contains const pointers, so walk backward from MI to
3482 // obtain a mutable pointer to each instruction to be merged.
3483 // This is expected to be a very short walk within the same block.
3484 SmallVector<MachineInstr *> ToErase;
3485 unsigned Found = 0;
3486 for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
3487 End = MI->getParent()->rend();
3488 Found < WaitInstrs.size() && It != End; ++It) {
3489 MachineInstr *WaitMI = &*It;
3490 // Find next wait instruction.
3491 if (std::as_const(WaitMI) != WaitInstrs[Found])
3492 continue;
3493 Found++;
3494 unsigned WaitMask = WaitMI->getOperand(0).getImm();
3495 assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
3496 DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
3497 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
3498 AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
3499 DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
3500 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
3501 AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
3502 DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
3503 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
3504 AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
3505 ToErase.push_back(WaitMI);
3506 }
3507 assert(Found == WaitInstrs.size());
3508 for (MachineInstr *WaitMI : ToErase)
3509 WaitMI->eraseFromParent();
3510 }
3511
3512 // Add s_waitcnt_depctr after SGPR write.
3513 auto NextMI = std::next(MI->getIterator());
3514 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3515 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3516 .addImm(DepCtr);
3517
3518 // SALU write may be s_getpc in a bundle.
3519 updateGetPCBundle(NewMI);
3520
3521 return true;
3522}
3523
3524static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3525 const SIInstrInfo &TII) {
3526 MachineBasicBlock &EntryMBB = MF->front();
3527 if (EntryMBB.begin() != EntryMBB.end()) {
3528 auto &EntryMI = *EntryMBB.begin();
3529 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3530 EntryMI.getOperand(0).getImm() >= Priority)
3531 return false;
3532 }
3533
3534 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3535 .addImm(Priority);
3536 return true;
3537}
3538
3539bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3540 if (!ST.hasRequiredExportPriority())
3541 return false;
3542
3543 // Assume the following shader types will never have exports,
3544 // and avoid adding or adjusting S_SETPRIO.
3545 MachineBasicBlock *MBB = MI->getParent();
3546 MachineFunction *MF = MBB->getParent();
3547 auto CC = MF->getFunction().getCallingConv();
3548 switch (CC) {
3553 return false;
3554 default:
3555 break;
3556 }
3557
3558 const int MaxPriority = 3;
3559 const int NormalPriority = 2;
3560 const int PostExportPriority = 0;
3561
3562 auto It = MI->getIterator();
3563 switch (MI->getOpcode()) {
3564 case AMDGPU::S_ENDPGM:
3565 case AMDGPU::S_ENDPGM_SAVED:
3566 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3567 case AMDGPU::SI_RETURN_TO_EPILOG:
3568 // Ensure shader with calls raises priority at entry.
3569 // This ensures correct priority if exports exist in callee.
3570 if (MF->getFrameInfo().hasCalls())
3571 return ensureEntrySetPrio(MF, NormalPriority, TII);
3572 return false;
3573 case AMDGPU::S_SETPRIO: {
3574 // Raise minimum priority unless in workaround.
3575 auto &PrioOp = MI->getOperand(0);
3576 int Prio = PrioOp.getImm();
3577 bool InWA = (Prio == PostExportPriority) &&
3578 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3579 if (InWA || Prio >= NormalPriority)
3580 return false;
3581 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3582 return true;
3583 }
3584 default:
3585 if (!TII.isEXP(*MI))
3586 return false;
3587 break;
3588 }
3589
3590 // Check entry priority at each export (as there will only be a few).
3591 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3592 bool Changed = false;
3593  if (CC != CallingConv::AMDGPU_Gfx)
3594    Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3595
3596 auto NextMI = std::next(It);
3597 bool EndOfShader = false;
3598 if (NextMI != MBB->end()) {
3599 // Only need WA at end of sequence of exports.
3600 if (TII.isEXP(*NextMI))
3601 return Changed;
3602 // Assume appropriate S_SETPRIO after export means WA already applied.
3603 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3604 NextMI->getOperand(0).getImm() == PostExportPriority)
3605 return Changed;
3606 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3607 }
3608
3609 const DebugLoc &DL = MI->getDebugLoc();
3610
3611 // Lower priority.
3612 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3613 .addImm(PostExportPriority);
3614
3615 if (!EndOfShader) {
3616 // Wait for exports to complete.
3617 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3618 .addReg(AMDGPU::SGPR_NULL)
3619 .addImm(0);
3620 }
3621
3622 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3623 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3624
3625 if (!EndOfShader) {
3626 // Return to normal (higher) priority.
3627 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3628 .addImm(NormalPriority);
3629 }
3630
3631 return true;
3632}
3633
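// For s_getreg reads of a small set of hardware registers, insert
// s_waitcnt_depctr with an all-zero immediate immediately before the read
// so that every dependency counter is drained first.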
3634bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
3635 if (!isSGetReg(MI->getOpcode()))
3636 return false;
3637
3638 const SIInstrInfo *TII = ST.getInstrInfo();
3639 switch (getHWReg(TII, *MI)) {
3640 default:
3641 return false;
3646 break;
3647 }
3648
3649 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3650 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3651 .addImm(0);
3652 return true;
3653}
3654
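// Bracket DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 with identical
// s_waitcnt_depctr instructions (immediate 0xFFE3) directly before and
// after the barrier.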
3655bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
3656 if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3657 return false;
3658
3659 const SIInstrInfo *TII = ST.getInstrInfo();
3660 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3661 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3662 .addImm(0xFFE3);
3663 BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
3664 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3665 .addImm(0xFFE3);
3666
3667 return true;
3668}
3669
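// Reads of SRC_FLAT_SCRATCH_BASE_LO/HI (directly or via s_getreg) are
// subject to a forwarding hazard if SGPR102/SGPR103 were written by SALU
// or VALU within the preceding window of SGPR writes; insert an
// s_waitcnt_depctr before the read when such a write is found.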
3670bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
3671 // No reason to check this in pre-RA scheduling, SGPRs have to be allocated
3672 // for hazard to trigger.
3673 if (!IsHazardRecognizerMode)
3674 return false;
3675
3676 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3677 const SIInstrInfo *TII = ST.getInstrInfo();
3678 // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
3679 const int FlatScrBaseWaitStates = 10;
3680
3681 bool ReadsFlatScrLo =
3682 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3683 bool ReadsFlatScrHi =
3684 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3685 if (isSGetReg(MI->getOpcode())) {
3686 switch (getHWReg(TII, *MI)) {
3687 default:
3688 break;
3689    case AMDGPU::Hwreg::ID_FLAT_SCR_LO:
3690      ReadsFlatScrLo = true;
3691 break;
3692    case AMDGPU::Hwreg::ID_FLAT_SCR_HI:
3693      ReadsFlatScrHi = true;
3694 break;
3695 }
3696 }
3697
3698 const MachineRegisterInfo &MRI = MF.getRegInfo();
3699
3700 auto IsRegDefHazard = [&](Register Reg) -> bool {
3701 DenseSet<const MachineBasicBlock *> Visited;
3702 auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
3703 return MI.modifiesRegister(Reg, TRI);
3704 };
3705
3706 // This literally abuses the idea of waitstates. Instead of waitstates it
3707 // returns 1 for SGPR written and 0 otherwise.
3708 auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
3709 if (!TII->isSALU(MI) && !TII->isVALU(MI))
3710 return 0;
3711 for (const MachineOperand &MO : MI.all_defs()) {
3712 if (TRI->isSGPRReg(MRI, MO.getReg()))
3713 return 1;
3714 }
3715 return 0;
3716 };
3717
3718 auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
3719 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3720 unsigned Wait = MI.getOperand(0).getImm();
3721        if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0 &&
3722            AMDGPU::DepCtr::decodeFieldVaSdst(Wait) == 0)
3723          return true;
3724 }
3725 return SgprWrites >= FlatScrBaseWaitStates;
3726 };
3727
3728 return ::getWaitStatesSince(
3729 IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
3730 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
3731 };
3732
3733 if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
3734 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3735 (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
3736 !IsRegDefHazard(AMDGPU::SGPR103)))
3737 return false;
3738
3739 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3740 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3743 return true;
3744}
3745
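// Precede any s_setreg that targets the MODE register with two v_nop
// instructions.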
3746bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3747 if (!isSSetReg(MI->getOpcode()) ||
3748 MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
3749 return false;
3750
3751 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3752 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3753 return true;
3754}