#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<bool> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc(
        "Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(false), cl::Hidden);

#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };

auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); }

using RegInterval = std::pair<int, int>;
struct HardwareLimits {
  unsigned VmcntMax;
  unsigned ExpcntMax;
  unsigned LgkmcntMax;
  unsigned VscntMax;
};

struct RegisterEncoding {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
};

// Mask of wait events for each instruction counter, indexed by
// InstCounterType.
static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
    (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
    (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
        (1 << SQ_MESSAGE),
    (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
        (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
    (1 << VMEM_WRITE_ACCESS)};
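// Each wait event contributes to exactly one counter via this table:
// VM_CNT covers VMEM reads, LGKM_CNT covers SMEM/LDS/GDS accesses and
// messages, EXP_CNT covers export and GPR-lock events, and VS_CNT (gfx10+)
// covers VMEM writes. eventCounter() below performs the reverse lookup.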
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
  AGPR_OFFSET = 256,      // Maximum programmable ArchVGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // An artificial register to track LDS writes.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPRs start.
};

VmemType getVmemType(const MachineInstr &Inst) {
  assert(updateVMCntOnly(Inst));
  if (!SIInstrInfo::isMIMG(Inst))
    return VMEM_NOSAMPLER;
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
      AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
  return BaseInfo->BVH ? VMEM_BVH
         : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
}
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  switch (T) {
  case VM_CNT:
    Wait.VmCnt = std::min(Wait.VmCnt, Count);
    break;
  case EXP_CNT:
    Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
    break;
  case LGKM_CNT:
    Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
    break;
  case VS_CNT:
    Wait.VsCnt = std::min(Wait.VsCnt, Count);
    break;
  default:
    llvm_unreachable("bad InstCounterType");
  }
}
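// Waits are accumulated with std::min because a smaller immediate is the
// stricter wait: s_waitcnt vmcnt(0) also satisfies any requirement for
// vmcnt(2). Merging requirements therefore keeps the tightest bound per
// counter.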
// This class maintains the current score bracket of each wait counter and a
// per-register scoreboard for each wait counter.
class WaitcntBrackets {
public:
  WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits,
                  RegisterEncoding Encoding)
      : ST(SubTarget), Limits(Limits), Encoding(Encoding) {}

  unsigned getWaitCountMax(InstCounterType T) const {
    switch (T) {
    case VM_CNT:
      return Limits.VmcntMax;
    case LGKM_CNT:
      return Limits.LgkmcntMax;
    case EXP_CNT:
      return Limits.ExpcntMax;
    case VS_CNT:
      return Limits.VscntMax;
    default:
      break;
    }
    return 0;
  }

  unsigned getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreLBs[T];
  }

  unsigned getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreUBs[T];
  }

  unsigned getScoreRange(InstCounterType T) const {
    return getScoreUB(T) - getScoreLB(T);
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) const {
    for (auto T : inst_counter_types()) {
      if (WaitEventMaskForInst[T] & (1 << E))
        return T;
    }
    llvm_unreachable("event type has no associated counter");
  }

  unsigned getRegScore(int GprNo, InstCounterType T) const {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    assert(T == LGKM_CNT);
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  bool counterOutOfOrder(InstCounterType T) const;
  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  unsigned hasPendingEvent() const { return PendingEvents; }
  unsigned hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }
  unsigned hasPendingEvent(InstCounterType T) const {
    unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
    assert((HasPending != 0) == (getScoreRange(T) != 0));
    return HasPending;
  }

  bool hasMixedPendingEvents(InstCounterType T) const {
    unsigned Events = hasPendingEvent(T);
    // Return true if more than one bit is set in Events.
    return Events & (Events - 1);
  }

  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }
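  // FLAT instructions can access both VMEM and LDS, so they increment both
  // VM_CNT and LGKM_CNT. LastFlat records the score of the most recent FLAT
  // access per counter; while that score is still inside the bracket, the
  // completion order is unknown and a conservative wait may be required
  // (see determineWait).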
  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  // Return true if there might be pending writes to the specified vgpr by VMEM
  // instructions with types different from V.
  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
    assert(GprNo < NUM_ALL_VGPRS);
    return VgprVmemTypes[GprNo] & ~(1 << V);
  }

  void clearVgprVmemTypes(int GprNo) {
    assert(GprNo < NUM_ALL_VGPRS);
    VgprVmemTypes[GprNo] = 0;
  }

  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo) const;

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

  bool merge(const WaitcntBrackets &Other);

private:
  struct MergeInfo {
    unsigned OldLB;
    unsigned OtherLB;
    unsigned MyShift;
    unsigned OtherShift;
  };
  static bool mergeScore(const MergeInfo &M, unsigned &Score,
                         unsigned OtherScore);

  void setScoreLB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreUBs[T] = Val;
    if (T != EXP_CNT)
      return;
    if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
  }

  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      VgprUB = std::max(VgprUB, GprNo);
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, unsigned Val);

  const GCNSubtarget *ST = nullptr;
  HardwareLimits Limits = {};
  RegisterEncoding Encoding = {};
  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
  unsigned PendingEvents = 0;
  // Remember the last flat memory operation.
  unsigned LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of VgprUB and SgprUB to make the merge at block joins efficient.
  int VgprUB = -1;
  int SgprUB = -1;
  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
  // Wait cnt scores for every sgpr; only lgkmcnt is relevant.
  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
  // write to each vgpr.
  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
};
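// The bracket model: for each counter, [ScoreLB, ScoreUB] spans the events
// that may still be outstanding. Every memory/export event advances ScoreUB
// and stamps the affected registers with that score; a wait raises ScoreLB.
// A register needs a wait only while its recorded score lies inside the
// bracket, i.e. above the current lower bound.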
class SIInsertWaitcnts : public MachineFunctionPass {
private:
  struct BlockInfo {
    MachineBasicBlock *MBB;
    std::unique_ptr<WaitcntBrackets> Incoming;
    bool Dirty = true;

    explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
  };

  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;

  // ForceEmitZeroWaitcnts: force all waitcnts to be emitted as s_waitcnt 0
  // because of the amdgpu-waitcnt-forcezero flag.
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
  bool isPreheaderToFlush(MachineBasicBlock &MBB,
                          WaitcntBrackets &ScoreBrackets);

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  bool isForceEmitWaitcnt() const {
    for (auto T : inst_counter_types())
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
#ifndef NDEBUG
    // For debug builds, honour the -debug-counter force flags; for non-debug
    // builds ForceEmitWaitcnt stays initialized to false.
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }

  // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
  // or FLAT instruction.
  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
    assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
    if (!ST->hasVscnt())
      return VMEM_ACCESS;
    if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst))
      return VMEM_WRITE_ACCESS;
    return VMEM_READ_ACCESS;
  }

  bool generateWaitcntInstBefore(MachineInstr &MI,
                                 WaitcntBrackets &ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr,
                                 bool FlushVmCnt);
  bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
                               WaitcntBrackets &ScoreBrackets,
                               MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               WaitcntBrackets *ScoreBrackets);
  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                            WaitcntBrackets &ScoreBrackets);
  bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                               MachineInstr &OldWaitcntInstr,
                               AMDGPU::Waitcnt &Wait,
                               MachineBasicBlock::instr_iterator It) const;
};
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                            const SIInstrInfo *TII,
                                            const MachineRegisterInfo *MRI,
                                            const SIRegisterInfo *TRI,
                                            unsigned OpNo) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!TRI->isInAllocatableClass(Op.getReg()))
    return {-1, -1};

  RegInterval Result;

  unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));

  if (TRI->isVectorRegister(*MRI, Op.getReg())) {
    assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
    Result.first = Reg - Encoding.VGPR0;
    if (TRI->isAGPR(*MRI, Op.getReg()))
      Result.first += AGPR_OFFSET;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
    assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  } else {
    return {-1, -1};
  }

  const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + ((Size + 16) / 32);

  return Result;
}

void WaitcntBrackets::setExpScore(const MachineInstr *MI,
                                  const SIInstrInfo *TII,
                                  const SIRegisterInfo *TRI,
                                  const MachineRegisterInfo *MRI, unsigned OpNo,
                                  unsigned Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
  assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}
static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
  // BUFFER_STORE_LDS_DWORD reads LDS and stores to memory, so it does not
  // write LDS through DMA.
  return SIInstrInfo::isLDSDMA(MI) &&
         MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
}
void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                    const SIRegisterInfo *TRI,
                                    const MachineRegisterInfo *MRI,
                                    WaitEventType E, MachineInstr &Inst) {
  InstCounterType T = eventCounter(E);
  unsigned CurrScore = getScoreUB(T) + 1;
  if (CurrScore == 0)
    report_fatal_error("InsertWaitcnt score wraparound");
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not, e.g. vm_cnt for a
  // buffer-store or lgkm_cnt for a send-message.
  PendingEvents |= 1 << E;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      int AddrOpIdx =
          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
      // All GDS operations must protect their address register (same as
      // export).
      if (AddrOpIdx != -1) {
        setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
      }

      if (Inst.mayStore()) {
        if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
          setExpScore(
              &Inst, TII, TRI, MRI,
              AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                         AMDGPU::OpName::data0),
              CurrScore);
        }
        if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (SIInstrInfo::isAtomicRet(Inst) &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() &&
              TRI->isVectorRegister(*MRI, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isLDSDIR(Inst)) {
      // LDSDIR instructions attach the score to the destination.
      setExpScore(
          &Inst, TII, TRI, MRI,
          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
          CurrScore);
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(*MRI, DefMO.getReg())) {
            setRegScore(
                TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
                EXP_CNT, CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() &&
            TRI->isVectorRegister(*MRI, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo; //TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
    for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      auto &Op = Inst.getOperand(I);
      if (!Op.isReg() || !Op.isDef())
        continue;
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
      if (T == VM_CNT) {
        if (Interval.first >= NUM_ALL_VGPRS)
          continue;
        if (updateVMCntOnly(Inst)) {
          VmemType V = getVmemType(Inst);
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
            VgprVmemTypes[RegNo] |= 1 << V;
        }
      }
      for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}
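// Summary of the update above: EXP_CNT events stamp the *source* VGPRs of
// exports and stores (those registers cannot be reused until the data has
// been read out), while VM_CNT/LGKM_CNT/VS_CNT events stamp the *destination*
// registers of loads. The artificial EXTRA_VGPR_LDS slot tracks stores that
// write LDS so later LDS reads can wait on them.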
void WaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (auto T : inst_counter_types()) {
    unsigned SR = getScoreRange(T);

    switch (T) {
    case VM_CNT:
      OS << "    VM_CNT(" << SR << "): ";
      break;
    case LGKM_CNT:
      OS << "    LGKM_CNT(" << SR << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << SR << "): ";
      break;
    case VS_CNT:
      OS << "    VS_CNT(" << SR << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << SR << "): ";
      break;
    }

    if (SR != 0) {
      // Print vgpr scores.
      unsigned LB = getScoreLB(T);

      for (int J = 0; J <= VgprUB; J++) {
        unsigned RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        unsigned RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= SgprUB; J++) {
          unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          unsigned RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  simplifyWaitcnt(VM_CNT, Wait.VmCnt);
  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
  simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
  simplifyWaitcnt(VS_CNT, Wait.VsCnt);
}

void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                      unsigned &Count) const {
  // The number of outstanding events of type T is (UB - LB). If the requested
  // Count is greater than or equal to that, waiting on this counter is
  // redundant.
  if (Count >= getScoreRange(T))
    Count = ~0u;
}
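// Example: if three VMEM loads are outstanding (score range == 3), a
// requested vmcnt(4) can never be the limiting wait and is dropped (reset to
// ~0u, i.e. "no wait"). A requested vmcnt(2) is kept because it still forces
// at least one load to complete.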
void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
                                    AMDGPU::Waitcnt &Wait) const {
  unsigned ScoreToWait = getRegScore(RegNo, T);

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == VM_CNT || T == LGKM_CNT) &&
        hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      // Counter can get decremented out-of-order when there are multiple
      // types of event in the bracket. Also emit an s_wait counter with a
      // conservative value of 0 for the counter.
      addWait(Wait, T, 0);
    } else {
      // If a counter has been maxed out avoid overflow by waiting for
      // MAX(CounterType) - 1 instead.
      unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
      addWait(Wait, T, NeededWait);
    }
  }
}
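// The emitted count is the number of events allowed to remain outstanding:
// UB - ScoreToWait is how many events were issued *after* the one that
// produced RegNo, so waiting for that many leaves the producing event
// complete. The value is clamped to the counter's encodable maximum minus
// one.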
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(VM_CNT, Wait.VmCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
  applyWaitcnt(VS_CNT, Wait.VsCnt);
}

void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const unsigned UB = getScoreUB(T);
  if (Count >= UB)
    return;
  if (Count != 0) {
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
  } else {
    setScoreLB(T, UB);
    PendingEvents &= ~WaitEventMaskForInst[T];
  }
}
// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory reads can always complete out of order.
  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
    return true;
  return hasMixedPendingEvents(T);
}
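// When a counter can retire out of order (SMEM reads, or a mix of event
// types), a nonzero wait count no longer identifies which event has
// completed, so callers fall back to waiting for the counter to reach zero.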
char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}

static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
                                     unsigned NewEnc) {
  int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
  assert(OpIdx >= 0);
  MachineOperand &MO = MI.getOperand(OpIdx);
  if (NewEnc == MO.getImm())
    return false;
  MO.setImm(NewEnc);
  return true;
}
/// Combine consecutive waitcnt instructions that precede \p It and follow
/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
/// by previous passes. Currently this pass conservatively assumes that these
/// preexisting waitcnt are required for correctness.
bool SIInsertWaitcnts::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  bool Modified = false;
  MachineInstr *WaitcntInstr = nullptr;
  MachineInstr *WaitcntVsCntInstr = nullptr;

  for (auto &II :
       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
    if (II.isMetaInstruction())
      continue;

    if (II.getOpcode() == AMDGPU::S_WAITCNT) {
      // Conservatively update required wait if this waitcnt was added in an
      // earlier pass. In this case it will not exist in the tracked waitcnt
      // set.
      if (!TrackedWaitcntSet.count(&II)) {
        unsigned IEnc = II.getOperand(0).getImm();
        AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
        Wait = Wait.combined(OldWait);
      }

      // Merge consecutive waitcnt of the same type by erasing multiples.
      if (!WaitcntInstr) {
        WaitcntInstr = &II;
      } else {
        II.eraseFromParent();
        Modified = true;
      }
    } else {
      assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      if (!TrackedWaitcntSet.count(&II)) {
        unsigned OldVSCnt =
            TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
        Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
      }

      if (!WaitcntVsCntInstr) {
        WaitcntVsCntInstr = &II;
      } else {
        II.eraseFromParent();
        Modified = true;
      }
    }
  }

  // Update the encoding of the merged waitcnt with the required wait.
  if (WaitcntInstr) {
    if (Wait.hasWaitExceptVsCnt()) {
      Modified |= updateOperandIfDifferent(*WaitcntInstr,
                                           AMDGPU::OpName::simm16,
                                           AMDGPU::encodeWaitcnt(IV, Wait));
      ScoreBrackets.applyWaitcnt(Wait);
      Wait.VmCnt = ~0u;
      Wait.LgkmCnt = ~0u;
      Wait.ExpCnt = ~0u;

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->instr_end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: " << *WaitcntInstr
                              << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It
                              << "New Instr: " << *WaitcntInstr << '\n');
    } else {
      WaitcntInstr->eraseFromParent();
      Modified = true;
    }
  }

  if (WaitcntVsCntInstr) {
    if (Wait.hasWaitVsCnt()) {
      assert(ST->hasVscnt());
      Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
                                           AMDGPU::OpName::simm16, Wait.VsCnt);
      ScoreBrackets.applyWaitcnt(Wait);
      Wait.VsCnt = ~0u;

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->instr_end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: "
                              << *WaitcntVsCntInstr << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It
                              << "New Instr: " << *WaitcntVsCntInstr << '\n');
    } else {
      WaitcntVsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  return Modified;
}
static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}
///  Generate s_waitcnt instruction to be placed before cur_Inst.
///  Instructions of a given type are returned in order, but instructions of
///  different types can complete out of order. We rely on this in-order
///  completion and simply assign a score to the memory access instructions.
///  If FlushVmCnt is true, generate an s_waitcnt that flushes the vmcnt
///  counter here.
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                                                 WaitcntBrackets &ScoreBrackets,
                                                 MachineInstr *OldWaitcntInstr,
                                                 bool FlushVmCnt) {
  setForceEmitWaitcnt();

  if (MI.isMetaInstruction())
    return false;

  AMDGPU::Waitcnt Wait;

  // FIXME: This should have already been handled by the memory legalizer.
  // Removing this currently doesn't affect any lit tests, but we need to
  // verify that nothing was relying on this. The number of buffer invalidates
  // being handled here should not be expanded.
  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
      MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
      MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
    Wait.VmCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  //   with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::SI_RETURN ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
      (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
    Wait = Wait.combined(allZeroWaitcnt());
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ST->hasLegacyGeometry() &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
            AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
    Wait.VmCnt = 0;
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
              context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS.
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                               ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif
  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued). The
  // shader program must flush all EXP operations on the export-count before
  // overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
          ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
          ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
          ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
        Wait.ExpCnt = 0;
      }
    }

    if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
      // The function is going to insert a wait on everything in its prolog.
      // This still needs to be careful if the call target is a load (e.g. a
      // GOT load). We still may need to flush the waitcnt for the arguments.
      int CallAddrOpIdx =
          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

      if (MI.getOperand(CallAddrOpIdx).isReg()) {
        RegInterval CallAddrOpInterval =
            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);

        for (int RegNo = CallAddrOpInterval.first;
             RegNo < CallAddrOpInterval.second; ++RegNo)
          ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);

        int RtnAddrOpIdx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
        if (RtnAddrOpIdx != -1) {
          RegInterval RtnAddrOpInterval =
              ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);

          for (int RegNo = RtnAddrOpInterval.first;
               RegNo < RtnAddrOpInterval.second; ++RegNo)
            ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
        }
      }
    } else {
      // FIXME: Should not be relying on memoperands.
      // Look at the source operands of every instruction to see if any of
      // them results from a previous memory operation that affects its
      // current usage. If so, an s_waitcnt instruction needs to be emitted.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        const Value *Ptr = Memop->getValue();
        if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
          addWait(Wait, LGKM_CNT, 0);
          if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
            SLoadAddresses.erase(Ptr);
        }
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
          continue;
        // No need to wait before load from VMEM to LDS.
        if (mayWriteLDSThroughDMA(MI))
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        // VM_CNT is only relevant to vgpr or LDS.
        ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
        if (Memop->isStore()) {
          ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
        }
      }
      // Loop over use and def operands.
      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg())
          continue;

        // If the instruction does not read tied source, skip the operand.
        if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
          continue;

        RegInterval Interval =
            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);

        const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
        for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
          if (IsVGPR) {
            // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
            // previous write and this write are the same type of VMEM
            // instruction, in which case they are guaranteed to write their
            // results in order anyway.
            if (Op.isUse() || !updateVMCntOnly(MI) ||
                ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
                                                       getVmemType(MI))) {
              ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
              ScoreBrackets.clearVgprVmemTypes(RegNo);
            }
            if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
              ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
            }
          }
          ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
        }
      }
    }
  }

  // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
  // not, we need to ensure the subtarget is capable of backing off barrier
  // instructions in case there are any outstanding memory operations that may
  // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
    Wait = Wait.combined(allZeroWaitcnt());
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939 after
  // fixing the scheduler.
  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
    if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
      Wait.LgkmCnt = 0;
    }
  }

  // Verify that the wait is actually needed.
  ScoreBrackets.simplifyWaitcnt(Wait);

  if (ForceEmitZeroWaitcnts)
    Wait = allZeroWaitcnt();

  if (ForceEmitWaitcnt[VM_CNT])
    Wait.VmCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[LGKM_CNT])
    Wait.LgkmCnt = 0;
  if (ForceEmitWaitcnt[VS_CNT])
    Wait.VsCnt = 0;

  if (FlushVmCnt) {
    if (ScoreBrackets.hasPendingEvent(VM_CNT))
      Wait.VmCnt = 0;
  }

  return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
                         OldWaitcntInstr);
}
bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
                                               WaitcntBrackets &ScoreBrackets,
                                               MachineInstr *OldWaitcntInstr) {
  AMDGPU::Waitcnt Wait;
  if (!ScoreBrackets.hasPendingEvent(VM_CNT))
    return false;
  Wait.VmCnt = 0;
  return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
                         OldWaitcntInstr);
}
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock::instr_iterator It,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) {
  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(It);

  if (OldWaitcntInstr)
    // Try to merge the required wait with preexisting waitcnt instructions.
    // Also erase redundant waitcnt.
    Modified =
        applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
  else
    ScoreBrackets.applyWaitcnt(Wait);

  // ExpCnt can be merged into VINTERP.
  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
      SIInstrInfo::isVINTERP(*It)) {
    MachineOperand *WaitExp =
        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
    if (Wait.ExpCnt < WaitExp->getImm()) {
      WaitExp->setImm(Wait.ExpCnt);
      Modified = true;
    }
    Wait.ExpCnt = ~0u;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
                      << "Update Instr: " << *It);
  }

  // Build new waitcnt instructions unless no wait is needed or the old waitcnt
  // instruction was modified to handle the required wait.
  if (Wait.hasWaitExceptVsCnt()) {
    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
    auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
    TrackedWaitcntSet.insert(SWaitInst);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  if (Wait.hasWaitVsCnt()) {
    assert(ST->hasVscnt());

    auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
                         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
                         .addImm(Wait.VsCnt);
    TrackedWaitcntSet.insert(SWaitInst);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
// This is a flat memory operation. Check to see if it has memory tokens other
// than LDS. Other address spaces supported by flat memory operations involve
// global memory.
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // If there are no memory operands then conservatively assume the flat
  // operation may access VMEM.
  if (MI.memoperands_empty())
    return true;

  // Return true unless only the LDS address space is found.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    assert(AS != AMDGPUAS::REGION_ADDRESS);
    if (AS != AMDGPUAS::LOCAL_ADDRESS)
      return true;
  }

  return false;
}

// This is a flat memory operation. Check to see if it has memory tokens for
// either LDS or FLAT.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
  if (!TII->usesLGKM_CNT(MI))
    return false;

  // If in tgsplit mode then there can be no use of LDS.
  if (ST->isTgSplitEnabled())
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access LDS.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves LDS.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }

  return false;
}
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access instruction,
  // update the upper-bound of the appropriate counter's bracket and the
  // destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoadOrStore());

    int FlatASCount = 0;

    if (mayAccessVMEMThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                   Inst);
    }

    if (mayAccessLDSThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }

    // A Flat memory operation must access at least one address space.
    assert(FlatASCount);

    // This is a flat memory operation that accesses both VMEM and LDS, so note
    // it - it will require that both the VM and LGKM be flushed to zero if it
    // is pending when a VM or LGKM dependency occurs.
    if (FlatASCount > 1)
      ScoreBrackets->setPendingFlat();
  } else if (SIInstrInfo::isVMEM(Inst) &&
             !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                 Inst);

    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
    if (callWaitsOnFunctionReturn(Inst)) {
      // Act as a wait on everything.
      ScoreBrackets->applyWaitcnt(allZeroWaitcnt());
    } else {
      // May need to wait for anything.
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
    }
  } else if (SIInstrInfo::isLDSDIR(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
  } else if (TII->isVINTERP(Inst)) {
    int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
    ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
  } else if (SIInstrInfo::isEXP(Inst)) {
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
    else
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    }
  }
}
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
                                 unsigned OtherScore) {
  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
  unsigned OtherShifted =
      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
  Score = std::max(MyShifted, OtherShifted);
  return OtherShifted > MyShifted;
}
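// Both operands are rebased into the merged bracket: a score at or below the
// old lower bound means "no longer pending" and maps to 0, anything else is
// shifted so that the two upper bounds line up. For example, merging
// LB=2,UB=5 with LB=0,UB=4 gives NewUB = 2 + max(3, 4) = 6, so scores from
// the first bracket shift by +1 and scores from the second by +2.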
/// Merge the pending events and associated scores from \p Other into this
/// brackets status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  VgprUB = std::max(VgprUB, Other.VgprUB);
  SgprUB = std::max(SgprUB, Other.SgprUB);

  for (auto T : inst_counter_types()) {
    // Merge event flags for this counter.
    const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
    const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
    if (OtherEvents & ~OldEvents)
      StrictDom = true;
    PendingEvents |= OtherEvents;

    // Merge scores for this counter.
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
    if (NewUB < ScoreLBs[T])
      report_fatal_error("waitcnt score overflow");

    MergeInfo M;
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);

    for (int J = 0; J <= VgprUB; J++)
      StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);

    if (T == LGKM_CNT) {
      for (int J = 0; J <= SgprUB; J++)
        StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
    }
  }

  for (int J = 0; J <= VgprUB; J++) {
    unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
    StrictDom |= NewVmemTypes != VgprVmemTypes[J];
    VgprVmemTypes[J] = NewVmemTypes;
  }

  return StrictDom;
}
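// merge() is the join of the forward dataflow over basic blocks: an incoming
// bracket state is widened with the state from each additional predecessor,
// and a 'true' result marks the successor dirty so it is re-processed until a
// fixed point is reached (see runOnMachineFunction below).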
static bool isWaitInstr(MachineInstr &Inst) {
  return Inst.getOpcode() == AMDGPU::S_WAITCNT ||
         (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
          Inst.getOperand(0).isReg() &&
          Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
}
// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets.dump();
  });

  // Track the correctness of vccz through this basic block. There are two
  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
  // ST->partialVCCWritesUpdateVCCZ().
  bool VCCZCorrect = true;
  if (ST->hasReadVCCZBug()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc and then issued an smem load.
    VCCZCorrect = false;
  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc_lo or vcc_hi.
    VCCZCorrect = false;
  }

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;

    // Track pre-existing waitcnts that were added in earlier iterations or by
    // the memory legalizer.
    if (isWaitInstr(Inst)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
                      isPreheaderToFlush(Block, ScoreBrackets);

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
                                          FlushVmCnt);
    OldWaitcntInstr = nullptr;

    // Restore vccz if it's not known to be correct already.
    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);

    // Don't examine operands unless we need to track vccz correctness.
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
      if (Inst.definesRegister(AMDGPU::VCC_LO) ||
          Inst.definesRegister(AMDGPU::VCC_HI)) {
        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
        if (!ST->partialVCCWritesUpdateVCCZ())
          VCCZCorrect = false;
      } else if (Inst.definesRegister(AMDGPU::VCC)) {
        if (ST->hasReadVCCZBug() &&
            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
          // Writes to vcc while there's an outstanding smem read may get
          // clobbered as soon as any read completes.
          VCCZCorrect = false;
        } else {
          // Writes to vcc will fix any incorrect value in vccz.
          VCCZCorrect = true;
        }
      }
    }

    if (TII->isSMRD(Inst)) {
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        // No need to handle invariant loads when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
          SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
        }
      }
      if (ST->hasReadVCCZBug()) {
        // This smem read could complete and clobber vccz at any time.
        VCCZCorrect = false;
      }
    }

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates a S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0).
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
                                ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    if (RestoreVCCZ) {
      // Restore the vccz bit. Any time a value is written to vcc, the vcc bit
      // is updated, so we can restore it by copying vcc back onto itself.
      BuildMI(Block, Inst, Inst.getDebugLoc(),
              TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI->getVCC())
          .addReg(TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    }

    ++Iter;
  }

  if (Block.getFirstTerminator() == Block.end() &&
      isPreheaderToFlush(Block, ScoreBrackets))
    Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);

  return Modified;
}
// Return true if the given machine basic block is a preheader of a loop in
// which we want to flush the vmcnt counter, and false otherwise.
bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
                                          WaitcntBrackets &ScoreBrackets) {
  if (PreheadersToFlush.count(&MBB))
    return PreheadersToFlush[&MBB];

  auto UpdateCache = [&](bool val) {
    PreheadersToFlush[&MBB] = val;
    return val;
  };

  MachineBasicBlock *Succ = MBB.getSingleSuccessor();
  if (!Succ)
    return UpdateCache(false);

  MachineLoop *Loop = MLI->getLoopFor(Succ);
  if (!Loop)
    return UpdateCache(false);

  if (Loop->getLoopPreheader() == &MBB &&
      shouldFlushVmCnt(Loop, ScoreBrackets))
    return UpdateCache(true);

  return UpdateCache(false);
}
// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
//    vgpr containing a value that is loaded outside of the loop. (Only on
//    targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the
//    loop, and at least one use of a vgpr containing a value that is loaded
//    outside of the loop.
bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
                                        WaitcntBrackets &Brackets) {
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  bool UsesVgprLoadedOutside = false;
  DenseSet<Register> VgprUse;
  DenseSet<Register> VgprDef;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (SIInstrInfo::isVMEM(MI)) {
        if (MI.mayLoad())
          HasVMemLoad = true;
        if (MI.mayStore())
          HasVMemStore = true;
      }
      for (unsigned I = 0; I < MI.getNumOperands(); I++) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
          continue;
        RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
        // Vgpr use.
        if (Op.isUse()) {
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprDef.contains(RegNo))
              return false;
            VgprUse.insert(RegNo);
            // If at least one of Op's registers is in the score brackets, the
            // value is likely loaded outside of the loop.
            if (Brackets.getRegScore(RegNo, VM_CNT) >
                Brackets.getScoreLB(VM_CNT)) {
              UsesVgprLoadedOutside = true;
              break;
            }
          }
        } else if (MI.mayLoad()) {
          // VMem load vgpr def.
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprUse.contains(RegNo))
              return false;
            VgprDef.insert(RegNo);
          }
        }
      }
    }
  }
  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  return HasVMemLoad && UsesVgprLoadedOutside;
}
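// Flushing vmcnt in the preheader trades one extra wait outside the loop for
// removing per-iteration waits inside it: if the only VMEM values the loop
// consumes were loaded before the loop (cases 1 and 2 above), waiting once up
// front lets the loop body run without re-checking those loads.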
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  IV = AMDGPU::getIsaVersion(ST->getCPU());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MLI = &getAnalysis<MachineLoopInfo>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  HardwareLimits Limits = {};
  Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
  Limits.VscntMax = ST->hasVscnt() ? 63 : 0;

  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  RegisterEncoding Encoding = {};
  Encoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
  Encoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;

  TrackedWaitcntSet.clear();
  BlockInfos.clear();
  bool Modified = false;

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.
    MachineBasicBlock &EntryBB = MF.front();
    MachineBasicBlock::iterator I = EntryBB.begin();
    for (MachineBasicBlock::iterator E = EntryBB.end();
         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
      ;
    BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
    if (ST->hasVscnt())
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
          .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
          .addImm(0);

    Modified = true;
  }

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fix point is reached.
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.insert({MBB, BlockInfo(MBB)});
  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
        else
          *Brackets = WaitcntBrackets(ST, Limits, Encoding);
      }

      Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPendingEvent()) {
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : BI.MBB->successors()) {
          auto SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else if (SuccBI.Incoming->merge(*Brackets)) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);
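  // The loop above is a worklist-style fixed point: a block is re-processed
  // whenever a predecessor hands it a strictly "larger" incoming bracket state
  // (merge() returned true), so waits are only ever strengthened and the
  // iteration terminates.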
  if (ST->hasScalarStores()) {
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII->isScalarStore(MI))
          HaveScalarStores = true;

        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(&MBB);
      }
    }

    if (HaveScalarStores) {
      // If scalar writes are used, the cache must be flushed or else the next
      // wave to reuse the same scratch memory can be clobbered.
      //
      // Insert s_dcache_wb at wave termination points if there were any scalar
      // stores, and only if the cache hasn't already been flushed. This could
      // be improved by looking across blocks for flushes in postdominating
      // blocks from the stores, but an explicitly requested flush is probably
      // very rare.
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;

        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII->isScalarStore(*I))
            SeenDCacheWB = false;

          // FIXME: It would be better to insert this before a waitcnt if any.
          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            Modified = true;
            BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
          }
        }
      }
    }
  }

  return Modified;
}