#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<bool> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(false), cl::Hidden);

#define CNT_MASK(t) (1u << (t))
auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); }

using RegInterval = std::pair<int, int>;

struct HardwareLimits {
  unsigned VmcntMax, ExpcntMax, LgkmcntMax, VscntMax;
};

struct RegisterEncoding {
  unsigned VGPR0, VGPRL, SGPR0, SGPRL;
};
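
// For each instruction counter (indexed in InstCounterType order: VM_CNT,
// LGKM_CNT, EXP_CNT, VS_CNT), the mask of wait events that increment it.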
static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
    (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
    (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
        (1 << SQ_MESSAGE),
    (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
        (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
        (1 << EXP_LDS_ACCESS),
    (1 << VMEM_WRITE_ACCESS)};
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
  AGPR_OFFSET = 256,      // Offset of the AGPR half of the VGPR range.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for LDS tracking.
  EXTRA_VGPR_LDS = 0,     // An artificial register slot to track LDS writes.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPRs start.
};
VmemType getVmemType(const MachineInstr &Inst) {
  if (!SIInstrInfo::isMIMG(Inst))
    return VMEM_NOSAMPLER;
  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
      AMDGPU::getMIMGBaseOpcodeInfo(
          AMDGPU::getMIMGInfo(Inst.getOpcode())->BaseOpcode);
  return BaseInfo->BVH ? VMEM_BVH
         : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
}

void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  // ...
}
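
// This class abstracts the score bookkeeping away from the pass logic. The
// "brackets" are, per counter, the range [ScoreLB, ScoreUB) of operations
// that may still be outstanding; every register touched by an outstanding
// operation is tagged with the score (the upper bound at the time it was
// issued) that must drain before the register is safe to use.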
class WaitcntBrackets {
public:
  WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits,
                  RegisterEncoding Encoding)
      : ST(SubTarget), Limits(Limits), Encoding(Encoding) {}

  unsigned getWaitCountMax(InstCounterType T) const {
    switch (T) {
    case VM_CNT:
      return Limits.VmcntMax;
    case LGKM_CNT:
      return Limits.LgkmcntMax;
    case EXP_CNT:
      return Limits.ExpcntMax;
    case VS_CNT:
      return Limits.VscntMax;
    default:
      break;
    }
    return 0;
  }
  unsigned getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreLBs[T];
  }

  unsigned getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreUBs[T];
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    if (WaitEventMaskForInst[VM_CNT] & (1 << E))
      return VM_CNT;
    if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
      return LGKM_CNT;
    if (WaitEventMaskForInst[VS_CNT] & (1 << E))
      return VS_CNT;
    assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
    return EXP_CNT;
  }

  unsigned getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    assert(T == LGKM_CNT);
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }
  bool merge(const WaitcntBrackets &Other);

  bool counterOutOfOrder(InstCounterType T) const;
  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  void determineWait(InstCounterType T, unsigned ScoreToWait,
                     AMDGPU::Waitcnt &Wait) const;
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);

  bool hasPending() const { return PendingEvents != 0; }
  bool hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }

  bool hasMixedPendingEvents(InstCounterType T) const {
    unsigned Events = PendingEvents & WaitEventMaskForInst[T];
    // Return true if more than one bit is set in Events.
    return Events & (Events - 1);
  }

  bool hasPendingFlat() const {
    // ... compares LastFlat[] against the current score brackets ...
  }

  void setPendingFlat() {
    // ... records the current upper bounds in LastFlat[] ...
  }

  // Return true if there might be pending writes to the given vgpr by VMEM
  // instructions with types different from V.
  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
    assert(GprNo < NUM_ALL_VGPRS);
    return VgprVmemTypes[GprNo] & ~(1 << V);
  }

  void clearVgprVmemTypes(int GprNo) {
    assert(GprNo < NUM_ALL_VGPRS);
    VgprVmemTypes[GprNo] = 0;
  }
private:
  struct MergeInfo {
    unsigned OldLB;
    unsigned OtherLB;
    unsigned MyShift;
    unsigned OtherShift;
  };
  static bool mergeScore(const MergeInfo &M, unsigned &Score,
                         unsigned OtherScore);

  void setScoreLB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
      unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
      if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
        ScoreLBs[T] = UB;
    }
  }

  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      VgprUB = std::max(VgprUB, GprNo);
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, unsigned Val);
  const GCNSubtarget *ST = nullptr;
  HardwareLimits Limits = {};
  RegisterEncoding Encoding = {};
  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
  unsigned PendingEvents = 0;
  // Remember the last flat memory operation.
  unsigned LastFlat[NUM_INST_CNTS] = {0};
  // Upper bounds used to keep merges at block joins cheap.
  int VgprUB = -1;
  int SgprUB = -1;
  // Wait-count scores for every vgpr (plus the artificial LDS slot).
  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
  // Wait-count scores for every sgpr; only lgkmcnt is relevant.
  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
  // Bitmask of VmemTypes of VMEM instructions that might have a pending write
  // to each vgpr.
  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
};
class SIInsertWaitcnts : public MachineFunctionPass {
private:
  // ...
  AMDGPU::IsaVersion IV;

  // Per-basic-block state used by the iterative walk in
  // runOnMachineFunction().
  struct BlockInfo {
    MachineBasicBlock *MBB;
    std::unique_ptr<WaitcntBrackets> Incoming;
    bool Dirty = true;
  };

  // ...
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
  bool isPreheaderToFlush(MachineBasicBlock &MBB,
                          WaitcntBrackets &ScoreBrackets);

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }
  bool isForceEmitWaitcnt() const {
    for (auto T : inst_counter_types())
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, consult the debug counters and adjust as needed.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }
    // ... (ForceLgkmCounter handled the same way for LGKM_CNT) ...
    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }

  bool generateWaitcntInstBefore(MachineInstr &MI,
                                 WaitcntBrackets &ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr,
                                 bool FlushVmCnt);
  bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
                               WaitcntBrackets &ScoreBrackets,
                               MachineInstr *OldWaitcntInstr);
  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
                       MachineBasicBlock::instr_iterator It,
                       MachineBasicBlock &Block,
                       WaitcntBrackets &ScoreBrackets,
                       MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               WaitcntBrackets *ScoreBrackets);
  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                            WaitcntBrackets &ScoreBrackets);
  bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                               MachineInstr &OldWaitcntInstr,
                               AMDGPU::Waitcnt &Wait,
                               MachineBasicBlock::instr_iterator It) const;
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                            const SIInstrInfo *TII,
                                            const MachineRegisterInfo *MRI,
                                            const SIRegisterInfo *TRI,
                                            unsigned OpNo) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  // ...
  RegInterval Result;
  unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));

  if (TRI->isVectorRegister(*MRI, Op.getReg())) {
    assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
    Result.first = Reg - Encoding.VGPR0;
    if (TRI->isAGPR(*MRI, Op.getReg()))
      Result.first += AGPR_OFFSET;
    // ...
  } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
    assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // ...
  return Result;
}

void WaitcntBrackets::setExpScore(const MachineInstr *MI,
                                  const SIInstrInfo *TII,
                                  const SIRegisterInfo *TRI,
                                  const MachineRegisterInfo *MRI,
                                  unsigned OpNo, unsigned Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
  assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
    setRegScore(RegNo, EXP_CNT, Val);
}
  // ...
         MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
}

void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                    const SIRegisterInfo *TRI,
                                    const MachineRegisterInfo *MRI,
                                    WaitEventType E, MachineInstr &Inst) {
  InstCounterType T = eventCounter(E);
  unsigned CurrScore = getScoreUB(T) + 1;
  // ...
  PendingEvents |= 1 << E;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      int AddrOpIdx =
          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
      // All GDS operations must protect their address register (same as
      // export).
      if (AddrOpIdx != -1) {
        setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
      }

      if (Inst.mayStore()) {
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data0) != -1) {
          // ...
        }
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 // ...
                 Inst.getOpcode() != AMDGPU::DS_CONSUME /* ... */) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() &&
              TRI->isVectorRegister(*MRI, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      // ...
    } else if (TII->isMIMG(Inst)) {
      setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      // ...
    } else if (TII->isMTBUF(Inst)) {
      setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
    } else if (TII->isMUBUF(Inst)) {
      setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      // ...
    } else if (TII->isLDSDIR(Inst)) {
      // ...
    } else {
      if (TII->isEXP(Inst)) {
        // Export "destinations" are really temporaries that may be used as
        // sources after export patching, so treat them like sources here.
        // ...
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        const MachineOperand &Op = Inst.getOperand(I);
        if (Op.isReg() && !Op.isDef() &&
            TRI->isVectorRegister(*MRI, Op.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    // ...
    setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    // ...
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      const MachineOperand &Op = Inst.getOperand(I);
      if (!Op.isReg() || !Op.isDef())
        continue;
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
      if (T == VM_CNT) {
        if (Interval.first >= NUM_ALL_VGPRS)
          continue;
        // ...
        VmemType V = getVmemType(Inst);
        for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
          VgprVmemTypes[RegNo] |= 1 << V;
      }
      for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
        setRegScore(RegNo, T, CurrScore);
    }
    // LDS writes through DS (or DMA) are tracked on an artificial register.
    if (Inst.mayStore() && TII->isDS(Inst) /* ... */)
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
  }
}
void WaitcntBrackets::print(raw_ostream &OS) {
  for (auto T : inst_counter_types()) {
    unsigned LB = getScoreLB(T);
    unsigned UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "  VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "  LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "  EXP_CNT(" << UB - LB << "): ";
      break;
    case VS_CNT:
      OS << "  VS_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "  UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= VgprUB; J++) {
        unsigned RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        unsigned RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= SgprUB; J++) {
          unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          unsigned RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
}
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  simplifyWaitcnt(VM_CNT, Wait.VmCnt);
  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
  simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
  simplifyWaitcnt(VS_CNT, Wait.VsCnt);
}

void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                      unsigned &Count) const {
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);

  // The number of outstanding events for this counter is (UB - LB). If the
  // requested Count is at least that large, the wait is redundant.
  if (Count >= UB - LB)
    Count = ~0u;
}
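
// determineWait: if the score an operand must wait for (ScoreToWait) still
// lies inside the bracket (LB, UB], the needed wait is UB - ScoreToWait,
// i.e. the number of younger operations that may still be outstanding. For
// example, with UB = 10, LB = 4 and ScoreToWait = 7, three younger
// operations may still be in flight, so s_waitcnt <cnt>(3) suffices (clamped
// to getWaitCountMax(T) - 1, and forced to 0 when the counter can complete
// out of order).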
void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
                                    AMDGPU::Waitcnt &Wait) const {
  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == VM_CNT || T == LGKM_CNT) && hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      // The counter can decrement out of order when there are multiple event
      // types in the bracket, so be conservative and wait for 0.
      addWait(Wait, T, 0);
    } else {
      // If a counter has been maxed out avoid overflow by waiting for
      // MAX(CounterType) - 1 instead.
      unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
      addWait(Wait, T, NeededWait);
    }
  }
}
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(VM_CNT, Wait.VmCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
  applyWaitcnt(VS_CNT, Wait.VsCnt);
}

void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const unsigned UB = getScoreUB(T);
  if (Count >= UB)
    return;
  if (Count != 0) {
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
  } else {
    setScoreLB(T, UB);
    PendingEvents &= ~WaitEventMaskForInst[T];
  }
}
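
// Where there are multiple types of event pending in the bracket of a
// counter, the hardware may decrement it out of order, so a precise
// (non-zero) wait count cannot be trusted for that counter.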
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory reads always complete out of order.
  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
    return true;
  return hasMixedPendingEvents(T);
}
char SIInsertWaitcnts::ID = 0;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}
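
/// Combine consecutive S_WAITCNT / S_WAITCNT_VSCNT instructions that already
/// sit at the insertion point (e.g. added by earlier passes or a previous
/// iteration) with the wait about to be emitted: duplicates are erased and
/// the surviving instruction's immediate is tightened to cover both.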
bool SIInsertWaitcnts::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  bool Modified = false;
  MachineInstr *WaitcntInstr = nullptr;
  MachineInstr *WaitcntVsCntInstr = nullptr;

  // Walk the waitcnt instructions from OldWaitcntInstr up to It.
  for (auto &II : make_range(OldWaitcntInstr.getIterator(), It)) {
    if (II.isMetaInstruction())
      continue;

    if (II.getOpcode() == AMDGPU::S_WAITCNT) {
      // Conservatively fold in the required wait if this waitcnt was added by
      // an earlier pass (it will not be in the tracked waitcnt set).
      if (!TrackedWaitcntSet.count(&II)) {
        // ...
      }
      // ... keep the first S_WAITCNT, erase the rest ...
    } else {
      assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      if (!TrackedWaitcntSet.count(&II)) {
        unsigned OldVSCnt =
            TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
        // ...
      }
      if (!WaitcntVsCntInstr) {
        WaitcntVsCntInstr = &II;
      }
      // ...
    }
  }

  // Update the encoding of a merged S_WAITCNT with the required wait.
  if (WaitcntInstr) {
    if (Wait.hasWaitExceptVsCnt()) {
      // ...
      if (OldEnc != NewEnc) {
        // ...
      }
      ScoreBrackets.applyWaitcnt(Wait);
      // ...
      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->instr_end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: " << *WaitcntInstr
                              << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It
                              << "New Instr: " << *WaitcntInstr << '\n');
    }
    // ...
  }

  if (WaitcntVsCntInstr) {
    if (Wait.hasWaitVsCnt()) {
      unsigned OldVSCnt =
          TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
              ->getImm();
      if (Wait.VsCnt != OldVSCnt) {
        TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
            ->setImm(Wait.VsCnt);
        Modified = true;
      }
      ScoreBrackets.applyWaitcnt(Wait);
      // ...
      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->instr_end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: "
                              << *WaitcntVsCntInstr << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It
                              << "New Instr: " << *WaitcntVsCntInstr << '\n');
    }
    // ...
  }

  return Modified;
}

static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}
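
/// Generate the s_waitcnt needed before \p MI: scan the instruction's memory
/// operands and register uses/defs against the score brackets and request a
/// wait for every counter that still has a pending operation the instruction
/// depends on.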
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                                                 WaitcntBrackets &ScoreBrackets,
                                                 MachineInstr *OldWaitcntInstr,
                                                 bool FlushVmCnt) {
  setForceEmitWaitcnt();

  if (MI.isMetaInstruction())
    return false;

  AMDGPU::Waitcnt Wait;

  // See if this instruction has a forced S_WAITCNT VM.
  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
      MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
      MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
    Wait.VmCnt = 0;
  }

  // All waits must be resolved at call return.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::SI_RETURN ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return /* || ... tail calls */) {
    // ... wait on all outstanding memory counters ...
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ST->hasLegacyGeometry() /* && the message is GS_DONE */) {
    Wait.VmCnt = 0;
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means the thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      // LDS tokens:
      if (group_is_multi_wave ||
          context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
        EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                           ScoreBrackets->getScoreUB(LGKM_CNT));
        // LDS may have to wait for VM_CNT after a buffer load to LDS.
        if (target_info->HasBufferLoadToLDS()) {
          EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                             ScoreBrackets->getScoreUB(VM_CNT));
        }
      }
      // GDS tokens:
      if (group_is_multi_wave || fence_is_global) {
        EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                           ScoreBrackets->getScoreUB(EXP_CNT));
        EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                           ScoreBrackets->getScoreUB(LGKM_CNT));
      }
      // Global/buffer tokens:
      if (group_is_multi_wave || fence_is_global) {
        EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                           ScoreBrackets->getScoreUB(EXP_CNT));
        EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                           ScoreBrackets->getScoreUB(VM_CNT));
      }
      // ...
    }
  }
#endif
  else {
    // Export and GDS instructions lock the GPRs they read until the export
    // is granted, so EXEC must not change while they are outstanding.
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
          ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
          ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
          ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
        Wait.ExpCnt = 0;
      }
    }

    if (MI.isCall() /* && the callee waits on everything in its prolog */) {
      // Only the call address (and return address) need to be waited on here.
      Wait = AMDGPU::Waitcnt();

      int CallAddrOpIdx =
          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

      if (MI.getOperand(CallAddrOpIdx).isReg()) {
        RegInterval CallAddrOpInterval =
            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);

        for (int RegNo = CallAddrOpInterval.first;
             RegNo < CallAddrOpInterval.second; ++RegNo)
          ScoreBrackets.determineWait(
              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);

        int RtnAddrOpIdx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
        if (RtnAddrOpIdx != -1) {
          RegInterval RtnAddrOpInterval =
              ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);

          for (int RegNo = RtnAddrOpInterval.first;
               RegNo < RtnAddrOpInterval.second; ++RegNo)
            ScoreBrackets.determineWait(
                LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
        }
      }
    } else {
      // Look at the source operands of every instruction to see if any of
      // them results from a previous memory operation that affects its
      // current usage. If so, an s_waitcnt instruction needs to be emitted.

      for (const MachineMemOperand *Memop : MI.memoperands()) {
        const Value *Ptr = Memop->getValue();
        if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
          // A store to an address a pending scalar load read from: wait for
          // the load before the store.
          addWait(Wait, LGKM_CNT, 0);
          // ...
          SLoadAddresses.erase(Ptr);
        }
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
          continue;
        // ...
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        // VM_CNT is only relevant to VGPRs and the artificial LDS slot.
        ScoreBrackets.determineWait(
            VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
        if (Memop->isStore()) {
          ScoreBrackets.determineWait(
              EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
        }
      }

      // Loop over use and def operands.
      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg())
          continue;
        RegInterval Interval =
            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);

        const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
        for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
          if (IsVGPR) {
            // A def also needs a vmcnt wait if there is a pending VMEM access
            // of a different type to the same VGPR.
            if (Op.isUse() || /* ... */
                ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
                                                       getVmemType(MI))) {
              ScoreBrackets.determineWait(
                  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
              ScoreBrackets.clearVgprVmemTypes(RegNo);
            }
            if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
              ScoreBrackets.determineWait(
                  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
            }
          }
          ScoreBrackets.determineWait(
              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
        }
      }
    }
  }

  // The subtarget may have an implicit S_WAITCNT 0 before barriers; if it does
  // not, wait on everything here.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    // ...
  }

  // Work around the hardware bug where reading vccz may return a stale value
  // while SMEM operations are outstanding.
  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
    if (ScoreBrackets.getScoreLB(LGKM_CNT) <
            ScoreBrackets.getScoreUB(LGKM_CNT) &&
        ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
      Wait.LgkmCnt = 0;
    }
  }

  // Verify that the wait is actually needed.
  ScoreBrackets.simplifyWaitcnt(Wait);

  if (ForceEmitZeroWaitcnts)
    Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt();

  if (ForceEmitWaitcnt[VM_CNT])
    Wait.VmCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[LGKM_CNT])
    Wait.LgkmCnt = 0;
  if (ForceEmitWaitcnt[VS_CNT])
    Wait.VsCnt = 0;

  if (FlushVmCnt) {
    unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
    unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
    if (UB - LB != 0)
      Wait.VmCnt = 0;
  }

  return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(),
                         ScoreBrackets, OldWaitcntInstr);
}
// Add a waitcnt to flush the vmcnt counter at the end of the given block if
// needed.
bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
                                               WaitcntBrackets &ScoreBrackets,
                                               MachineInstr *OldWaitcntInstr) {
  AMDGPU::Waitcnt Wait;

  unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
  unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
  if (UB - LB == 0)
    return false;

  Wait.VmCnt = 0;

  return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
                         OldWaitcntInstr);
}

bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock::instr_iterator It,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) {
  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(It);

  if (OldWaitcntInstr)
    // Try to merge the required wait with preexisting waitcnt instructions.
    Modified =
        applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
  else
    ScoreBrackets.applyWaitcnt(Wait);

  // An EXP_CNT wait can be folded into a following VINTERP's waitexp operand
  // instead of emitting a separate s_waitcnt.
  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
      SIInstrInfo::isVINTERP(*It)) {
    MachineOperand *WaitExp =
        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
    if (Wait.ExpCnt < WaitExp->getImm()) {
      WaitExp->setImm(Wait.ExpCnt);
      Modified = true;
    }
    Wait.ExpCnt = ~0u;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
                      << "Update Instr: " << *It);
  }

  // Build a new waitcnt unless no wait is needed or an existing waitcnt was
  // already updated to handle it.
  if (Wait.hasWaitExceptVsCnt()) {
    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
    auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
    TrackedWaitcntSet.insert(SWaitInst);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  if (Wait.hasWaitVsCnt()) {
    assert(ST->hasVscnt());

    auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
                         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
                         .addImm(Wait.VsCnt);
    TrackedWaitcntSet.insert(SWaitInst);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
// This is a flat memory operation. Check to see if it has memory tokens other
// than LDS; the other address spaces supported by flat operations involve
// global memory.
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // With no memory operands, conservatively assume the flat operation may
  // access VMEM.
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS != AMDGPUAS::LOCAL_ADDRESS)
      return true;
  }
  return false;
}

// This is a flat memory operation. Check to see if it has memory tokens for
// either LDS or FLAT.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
  if (!TII->usesLGKM_CNT(MI))
    return false;

  // In tgsplit mode there can be no use of LDS.
  if (ST->isTgSplitEnabled())
    return false;

  // With no memory operands, conservatively assume the flat operation may
  // access LDS.
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }
  return false;
}
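
/// After a wait (if any) has been emitted for \p Inst, record the events that
/// Inst itself generates: bump the relevant counter's upper bound and tag the
/// registers it defines (or, for exports and GDS, the registers it reads)
/// with the new score.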
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access instruction,
  // update the upper-bound of the appropriate counter's bracket and the
  // destination operand scores.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    int FlatASCount = 0;

    if (mayAccessVMEMThroughFlat(Inst)) {
      ++FlatASCount;
      if (!ST->hasVscnt())
        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
      else if (Inst.mayLoad() /* && not a no-return atomic */)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
    }

    if (mayAccessLDSThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }

    // A flat operation that accesses both VMEM and LDS requires both VM and
    // LGKM to be flushed to zero if it is still pending when a dependency on
    // either counter occurs.
    if (FlatASCount > 1)
      ScoreBrackets->setPendingFlat();
  } else if (SIInstrInfo::isVMEM(Inst) /* ... */) {
    if (!ST->hasVscnt())
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    else if (Inst.mayLoad() /* && not a no-return atomic */)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
    else
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);

    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() /* || returning atomic */)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
    // ...
    ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
    // ...
  } else if (SIInstrInfo::isLDSDIR(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
  } else if (TII->isVINTERP(Inst)) {
    int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
    ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
  } else if (SIInstrInfo::isEXP(Inst)) {
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    if (/* Imm targets a parameter export */)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
    else if (/* Imm targets a position export */)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
    else
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    }
  }
}
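
// Merging scores from two predecessors: each side's scores are first shifted
// so both brackets end at the common new upper bound, then the maximum is
// kept. As a small example, if this block has (LB = 2, UB = 5) and the other
// has (LB = 0, UB = 4), the merged pending count is max(3, 4) = 4, NewUB is
// 2 + 4 = 6, and a register score of 4 from the other side is shifted by
// OtherShift = 6 - 4 = 2 to become 6.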
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
                                 unsigned OtherScore) {
  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
  unsigned OtherShifted =
      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
  Score = std::max(MyShifted, OtherShifted);
  return OtherShifted > MyShifted;
}

/// Merge the pending events and score brackets of \p Other into this bracket
/// status. Returns whether anything changed (i.e. whether the successor must
/// be re-processed).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  VgprUB = std::max(VgprUB, Other.VgprUB);
  SgprUB = std::max(SgprUB, Other.SgprUB);

  for (auto T : inst_counter_types()) {
    // Merge event flags for this counter.
    const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
    const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
    if (OtherEvents & ~OldEvents)
      StrictDom = true;
    PendingEvents |= OtherEvents;

    // Merge scores for this counter.
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
    if (NewUB < ScoreLBs[T])
      report_fatal_error("waitcnt score overflow");

    MergeInfo M;
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);

    bool RegStrictDom = false;
    for (int J = 0; J <= VgprUB; J++) {
      RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
    }

    if (T == VM_CNT) {
      for (int J = 0; J <= VgprUB; J++) {
        unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
        RegStrictDom |= NewVmemTypes != VgprVmemTypes[J];
        VgprVmemTypes[J] = NewVmemTypes;
      }
    }

    if (T == LGKM_CNT) {
      for (int J = 0; J <= SgprUB; J++) {
        RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
      }
    }

    if (RegStrictDom)
      StrictDom = true;
  }

  return StrictDom;
}
// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets.dump();
  });

  // Track the correctness of vccz through this basic block.
  bool VCCZCorrect = true;
  if (ST->hasReadVCCZBug()) {
    // vccz could be incorrect at a basic block boundary if a predecessor
    // wrote to vcc and then issued an smem load.
    VCCZCorrect = false;
  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
    // vccz could be incorrect at a basic block boundary if a predecessor
    // wrote to vcc_lo or vcc_hi.
    VCCZCorrect = false;
  }

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;

    // Track pre-existing waitcnts from earlier iterations or other passes.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
        (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         /* ... uses SGPR_NULL ... */)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
                      isPreheaderToFlush(Block, ScoreBrackets);

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
                                          FlushVmCnt);
    OldWaitcntInstr = nullptr;

    // Restore vccz if it's not known to be correct already.
    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);

    // Don't examine operands unless we need to track vccz correctness.
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
      // ...
      if (!ST->partialVCCWritesUpdateVCCZ())
        VCCZCorrect = false;
      // ...
      if (ST->hasReadVCCZBug() &&
          ScoreBrackets.getScoreLB(LGKM_CNT) <
              ScoreBrackets.getScoreUB(LGKM_CNT) &&
          ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
        // Writes to vcc while there's an outstanding smem read may get
        // clobbered as soon as any read completes.
        VCCZCorrect = false;
      }
      // ...
    }

    if (TII->isSMRD(Inst)) {
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        // No need to handle invariant loads when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
          SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
        }
      }
      if (ST->hasReadVCCZBug()) {
        // This smem read could complete and clobber vccz at any time.
        VCCZCorrect = false;
      }
    }

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
                                ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    if (RestoreVCCZ) {
      // Restore vccz by copying vcc onto itself; any write to vcc also
      // updates the vccz bit.
      BuildMI(Block, Inst, Inst.getDebugLoc(),
              TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI->getVCC())
          .addReg(TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    }

    ++Iter;
  }

  // Flush vmcnt at the end of a loop preheader that falls through into the
  // loop header.
  if (Block.getFirstTerminator() == Block.end() &&
      isPreheaderToFlush(Block, ScoreBrackets))
    Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);

  return Modified;
}

// Return true if the given machine basic block is a preheader of a loop in
// which we want to flush the vmcnt counter.
bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
                                          WaitcntBrackets &ScoreBrackets) {
  if (PreheadersToFlush.count(&MBB))
    return PreheadersToFlush[&MBB];

  auto UpdateCache = [&](bool val) {
    PreheadersToFlush[&MBB] = val;
    return val;
  };

  MachineBasicBlock *Succ = MBB.getSingleSuccessor();
  if (!Succ)
    return UpdateCache(false);

  MachineLoop *Loop = MLI->getLoopFor(Succ);
  if (!Loop)
    return UpdateCache(false);

  if (Loop->getLoopPreheader() == &MBB &&
      shouldFlushVmCnt(Loop, ScoreBrackets))
    return UpdateCache(true);

  return UpdateCache(false);
}
bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
                                        WaitcntBrackets &Brackets) {
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  bool UsesVgprLoadedOutside = false;
  // ...
  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (SIInstrInfo::isVMEM(MI) ||
          (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI))) {
        if (MI.mayLoad())
          HasVMemLoad = true;
        if (MI.mayStore())
          HasVMemStore = true;
      }
      for (unsigned I = 0; I < MI.getNumOperands(); I++) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
          continue;
        RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
        for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
          // ...
          // If the register still carries a VM_CNT score, its value was
          // likely loaded outside of the loop.
          if (Brackets.getRegScore(RegNo, VM_CNT) > 0) {
            UsesVgprLoadedOutside = true;
            break;
          }
        }
        // ...
      }
    }
  }

  // On targets without vscnt, stores also count against vmcnt, so flushing is
  // only useful if the loop contains no VMEM loads.
  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  return HasVMemLoad && UsesVgprLoadedOutside;
}
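
// Pass entry point. Waitcnt brackets are propagated block to block: blocks
// are processed while dirty, the resulting brackets are merged into each
// successor's incoming state, and any successor whose incoming state changed
// is marked dirty again until a fixed point is reached.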
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  // ...
  MLI = &getAnalysis<MachineLoopInfo>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  // ...
  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  HardwareLimits Limits = {};
  // ... per-target vmcnt/expcnt/lgkmcnt maxima ...
  Limits.VscntMax = ST->hasVscnt() ? 63 : 0;

  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  RegisterEncoding Encoding = {};
  Encoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
  Encoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;

  TrackedWaitcntSet.clear();
  BlockInfos.clear();
  bool Modified = false;

  if (!MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on before the first real instruction.
    MachineBasicBlock &EntryBB = MF.front();
    MachineBasicBlock::iterator I = EntryBB.begin();
    for (MachineBasicBlock::iterator E = EntryBB.end();
         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
      ;
    // ... insert s_waitcnt 0 (and s_waitcnt_vscnt 0 if available) at I ...
    Modified = true;
  }

  // Seed BlockInfos with one entry per basic block, in reverse post order.
  // ...

  // Keep iterating over the blocks, inserting and updating s_waitcnt where
  // needed, until a fix point is reached.
  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
        else
          *Brackets = WaitcntBrackets(ST, Limits, Encoding);
      }

      Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPending()) {
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : BI.MBB->successors()) {
          auto SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else if (SuccBI.Incoming->merge(*Brackets)) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);
  if (ST->hasScalarStores()) {
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII->isScalarStore(MI))
          HaveScalarStores = true;

        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(&MBB);
      }
    }

    if (HaveScalarStores) {
      // If scalar writes are used, the cache must be flushed or else the next
      // wave to reuse the same scratch memory can be clobbered.
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;

        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII->isScalarStore(*I))
            SeenDCacheWB = false;

          // FIXME: It would be better to insert this before a waitcnt if any.
          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            Modified = true;
            BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
          }
        }
      }
    }
  }

  return Modified;
}