#define DEBUG_TYPE "si-fold-operands"

                bool Commuted_ = false,
      UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();

  bool needsShrink() const { return ShrinkOpcode != -1; }
  getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE, "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

          TRI.getSubRegisterClass(RC, MO.getSubReg()))
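// macToMad(): maps a MAC/FMAC accumulate opcode to its non-accumulating MAD/FMA
// equivalent (or INSTRUCTION_LIST_END if there is none), so a constant can be
// folded into what was the tied accumulator operand.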
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_F16_t16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
  if (!OpToFold.isFI())

  const unsigned Opc = UseMI.getOpcode();

  return OpNo == VIdx && SIdx == -1;

  return new SIFoldOperands();
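// updateOperand(): rewrites the use operand recorded in a FoldCandidate with the
// folded immediate, frame index, global address, or register; when Fold.ShrinkOpcode
// is set it also shrinks the carry-out add/sub to its 32-bit encoding before folding.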
bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {

                               ST->hasInv2PiInlineImm())) {
      unsigned Opcode = MI->getOpcode();
      int OpNo = MI->getOperandNo(&Old);
        ModIdx = AMDGPU::OpName::src0_modifiers;
        ModIdx = AMDGPU::OpName::src1_modifiers;
        ModIdx = AMDGPU::OpName::src2_modifiers;
      unsigned Val = Mod.getImm();

      switch (TII->get(Opcode).operands()[OpNo].OperandType) {
          if (!isUInt<16>(Fold.ImmToFold)) {
            if (!(Fold.ImmToFold & 0xffff)) {

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    int Op32 = Fold.ShrinkOpcode;

      bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
      Register NewReg0 = MRI->createVirtualRegister(Dst0RC);

      if (HaveNonDbgCarryUse) {

    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->removeOperand(I);
    MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));

      TII->commuteInstruction(*Inst32, false);

  assert(!Fold.needsShrink() && "not handled");

    if (NewMFMAOpc == -1)
    MI->setDesc(TII->get(NewMFMAOpc));
    MI->untieRegOperand(0);

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
                   Fold.OpToFold->getTargetFlags());
  return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });

  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)

  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
                    << " operand " << OpNo << "\n " << *MI);
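// tryAddToFoldList(): checks whether folding OpToFold into operand OpNo of MI is
// legal; if not, it tries alternatives such as converting MAC/FMAC to MAD/FMA,
// rewriting S_FMAC_F32 into S_FMAAK/S_FMAMK_F32, switching S_SETREG to its
// immediate form, or commuting the instruction, before appending a FoldCandidate.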
  const unsigned Opc = MI->getOpcode();

  auto tryToFoldAsFMAAKorMK = [&]() {
    if (!OpToFold->isImm())

    const bool TryAK = OpNo == 3;
    const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
    MI->setDesc(TII->get(NewOpc));

    bool FoldAsFMAAKorMK =
        tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
    if (FoldAsFMAAKorMK) {
        MI->untieRegOperand(3);

    MI->setDesc(TII->get(Opc));

  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
      MI->setDesc(TII->get(NewOpc));

      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
        MI->untieRegOperand(OpNo);
      MI->removeOperand(MI->getNumExplicitOperands() - 1);
      MI->setDesc(TII->get(Opc));

    if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
      if (tryToFoldAsFMAAKorMK())

    if (OpToFold->isImm()) {
      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
        MI->setDesc(TII->get(ImmOpc));
    unsigned CommuteOpNo = OpNo;

    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

      if (CommuteIdx0 == OpNo)
        CommuteOpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        CommuteOpNo = CommuteIdx0;

    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))

        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))

    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc == AMDGPU::V_ADD_CO_U32_e64 ||
           Opc == AMDGPU::V_SUB_CO_U32_e64 ||
           Opc == AMDGPU::V_SUBREV_CO_U32_e64) &&

        unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
        if (!OtherOp.isReg() ||

        unsigned MaybeCommutedOpc = MI->getOpcode();

      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);

  if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
      !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
    unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
    if (!OpImm.isReg() &&
        TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
      return tryToFoldAsFMAAKorMK();
  if (Opc == AMDGPU::S_FMAC_F32 &&
      (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
    if (tryToFoldAsFMAAKorMK())

  if (TII->isSALU(MI->getOpcode())) {
    if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
      for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
        auto &Op = MI->getOperand(i);
        if (OpNo != i && !Op.isReg() &&

  return !TII->isSDWA(MI);
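// getRegSeqInit(): walks the REG_SEQUENCE that defines a register and collects each
// source operand together with its subregister index, looking through copies that
// materialize inline-constant immediates.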
bool SIFoldOperands::getRegSeqInit(

  if (!Def || !Def->isRegSequence())

  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
         SubDef = MRI->getVRegDef(Sub->getReg())) {
      if (TII->isInlineConstant(*Op, OpTy))
      if (!Op->isReg() || Op->getReg().isPhysical())

    Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
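// tryToFoldACImm(): tries to fold an inline-constant immediate (including one
// reconstructed from a REG_SEQUENCE of identical inline constants) into an
// accumulator/AGPR operand of UseMI when the target operand type allows it.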
bool SIFoldOperands::tryToFoldACImm(

  if (UseOpIdx >= Desc.getNumOperands())

  uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;

  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
      TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {

  if (!OpToFold.isReg())

  if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
    if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
        TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {

  if (!getRegSeqInit(Defs, UseReg, OpTy))

  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    auto SubImm = Op->getImm();
    if (!TII->isInlineConstant(*Op, OpTy) ||
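// foldOperand(): records folds of OpToFold into a single use. It looks through
// REG_SEQUENCE users, folds frame indices into scratch accesses, turns copies into
// AGPRs into V_ACCVGPR_WRITE where profitable, and handles special cases such as
// V_READFIRSTLANE/V_READLANE of an immediate-like operand.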
void SIFoldOperands::foldOperand(

  if (!isUseSafeToFold(*UseMI, UseOp))

      if (RSUse.getSubReg() != RegSeqDstSubReg)

      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,

  if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))

  if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {

    if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
        MFI->getScratchRSrcReg())

        *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);

  bool FoldingImmLike =

    if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
      for (auto &Use : MRI->use_nodbg_operands(DestReg)) {
        if (Use.isImplicit())
                               Use.getParent()->getOperandNo(&Use),

      for (auto &F : CopyUses) {
        foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList,

    if (DestRC == &AMDGPU::AGPR_32RegClass &&

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)

    while (ImpOpI != ImpOpE) {

      for (unsigned I = 0; I < Size / 4; ++I) {
          int64_t Imm = Def->getImm();
          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
                   TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
        } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
          Def->setIsKill(false);
          if (!SeenAGPRs.insert(Src)) {

          Def->setIsKill(false);

          if (TRI->isSGPRReg(*MRI, Src.Reg)) {

          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);

        if (CopyToVGPR.Reg) {
          if (VGPRCopies.count(CopyToVGPR)) {
            Vgpr = VGPRCopies[CopyToVGPR];
            Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
            VGPRCopies[CopyToVGPR] = Vgpr;
          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
                   TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);

        B.addImm(Defs[I].second);

    else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
    else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&

    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&

      if (FoldingImmLike) {
        if (OpToFold.isImm())
      UseDesc.operands()[UseOpIdx].RegClass == -1)

  if (!FoldingImmLike) {
    if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
      if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
        if (!RC || !TRI->isProperlyAlignedRC(*RC))

    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);

    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
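// evalBinaryInstruction(): constant-folds a 32-bit bitwise or shift operation once
// both sources are known immediates, writing the folded value to Result. The *REV
// variants swap LHS and RHS, and shift amounts are masked to 5 bits as the hardware does.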
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:

  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:

  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:

  case AMDGPU::S_XNOR_B32:

  case AMDGPU::S_NAND_B32:

  case AMDGPU::S_NOR_B32:

  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;

  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;

  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    Result = LHS << (RHS & 31);

  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);

  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);

  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);

  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);

  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

  MI.setDesc(NewDesc);

  unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
                    Desc.implicit_defs().size();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);
  if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
      !Op.getReg().isVirtual())

  if (Def && Def->isMoveImmediate()) {
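// tryConstantFoldOp(): folds instructions whose sources are now known immediates,
// e.g. NOT of an immediate, fully-constant binary ops via evalBinaryInstruction, and
// identities such as x | 0, x | -1, x & 0, x & -1 and x ^ 0, rewriting MI into a MOV or COPY.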
  if (!MI->allImplicitDefsAreDead())

  unsigned Opc = MI->getOpcode();

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
    MI->getOperand(1).ChangeToImmediate(~Src0->getImm());

    bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());

    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);

  if (!MI->isCommutable())

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
      MI->removeOperand(Src1Idx);
    } else if (Src1Val == -1) {
      MI->removeOperand(Src1Idx);

  if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
      Opc == AMDGPU::S_AND_B32) {
      MI->removeOperand(Src0Idx);
    } else if (Src1Val == -1) {
      MI->removeOperand(Src1Idx);

  if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
      Opc == AMDGPU::S_XOR_B32) {
      MI->removeOperand(Src1Idx);
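// tryFoldCndMask(): if both data sources of a V_CNDMASK select are the same immediate
// and no source modifiers are set, the select is independent of the condition and is
// rewritten into a plain move of that value.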
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)

  auto *Src0Imm = getImmOrMaterializedImm(*Src0);
  auto *Src1Imm = getImmOrMaterializedImm(*Src1);
  if (!Src1Imm->isIdenticalTo(*Src0Imm))

  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))

  MI.removeOperand(Src2Idx);
  if (Src1ModIdx != -1)
    MI.removeOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.removeOperand(Src0ModIdx);
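// tryFoldZeroHighBits(): removes an AND with 0xffff when the instruction producing
// the other source already zeroes the high 16 bits of its destination.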
bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)

  if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))

  MI.eraseFromParent();
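// foldInstOperand(): collects fold candidates for every non-debug use of the defined
// register, applies them through updateOperand(), constant-folds users that became
// fully immediate, and undoes any speculative commute that did not lead to a fold.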
  bool Changed = false;

  if (OpToFold.isImm()) {

      if (tryConstantFoldOp(&UseMI)) {

  for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
  for (auto *U : UsesToProcess) {

  if (CopiesToReplace.empty() && FoldList.empty())

    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.OpToFold);
    if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {

      assert(Fold.OpToFold && Fold.OpToFold->isReg());

      MRI->clearKillFlags(Fold.OpToFold->getReg());
                        << static_cast<int>(Fold.UseOpNo) << " of "
    } else if (Fold.Commuted) {
      TII->commuteInstruction(*Fold.UseMI, false);
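// tryFoldFoldableCopy(): entry point for a foldable copy or materialized immediate.
// It drops redundant rewrites of the same value into M0, folds the source into all
// uses, and then erases the now-dead chain of copies/REG_SEQUENCEs that fed it.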
bool SIFoldOperands::tryFoldFoldableCopy(

  if (MI.getOperand(0).getReg() == AMDGPU::M0) {
    if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
      MI.eraseFromParent();

  if (!FoldingImm && !OpToFold.isReg())

  if (!MI.getOperand(0).getReg().isVirtual())

  bool Changed = foldInstOperand(MI, OpToFold);

  auto *InstToErase = &MI;
  while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    auto &SrcOp = InstToErase->getOperand(1);
    InstToErase->eraseFromParent();
      InstToErase = nullptr;
    InstToErase = MRI->getVRegDef(SrcReg);
    if (!InstToErase || !TII->isFoldableCopy(*InstToErase))

  if (InstToErase && InstToErase->isRegSequence() &&
      MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    InstToErase->eraseFromParent();
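// tryFoldClamp(): a V_MAX (or V_PK_MAX_F16) of a register with itself that has the
// clamp bit set implements clamp-to-[0,1]; when the clamped value has a single use,
// the clamp bit is set on the defining instruction instead and the max is erased.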
  unsigned Op = MI.getOpcode();

  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_t16_e64:
  case AMDGPU::V_MAX_F16_fake16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_PK_MAX_F16: {
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())

        Src0->getSubReg() != AMDGPU::NoSubRegister)

    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))

        = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
        = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)

  if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))

  if (TII->getClampMask(*Def) != TII->getClampMask(MI))

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);

  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();

    if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
      Def->eraseFromParent();
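// getOModValue(): maps a multiply-by-constant to an output-modifier encoding. The
// F64 bit patterns below are 0.5 (0x3fe0...), 2.0 (0x4000...) and 4.0 (0x4010...),
// corresponding to the DIV2, MUL2 and MUL4 output modifiers.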
  case AMDGPU::V_MUL_F64_e64: {
    case 0x3fe0000000000000:
    case 0x4000000000000000:
    case 0x4010000000000000:

  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {

  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64: {
    switch (static_cast<uint16_t>(Val)) {
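// isOMod(): recognizes a multiply by 0.5/2.0/4.0 (or an add of a value to itself,
// which is a multiply by 2.0) with no other modifiers set, returning the register
// operand and the output-modifier value it can be folded into.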
std::pair<const MachineOperand *, int>

  unsigned Op = MI.getOpcode();

  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64:
  case AMDGPU::V_MUL_F16_e64: {

    if ((Op == AMDGPU::V_MUL_F32_e64 &&
        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 ||
          Op == AMDGPU::V_MUL_F16_t16_e64 ||
          Op == AMDGPU::V_MUL_F16_fake16_e64) &&

    if (Src0->isImm()) {
    } else if (Src1->isImm()) {

        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))

    return std::pair(RegOp, OMod);

  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_t16_e64:
  case AMDGPU::V_ADD_F16_fake16_e64: {

    if ((Op == AMDGPU::V_ADD_F32_e64 &&
        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 ||
          Op == AMDGPU::V_ADD_F16_t16_e64 ||
          Op == AMDGPU::V_ADD_F16_fake16_e64) &&

        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
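// tryFoldOMod(): folds the multiply recognized by isOMod() into the omod field of
// the single defining instruction, provided that instruction does not already use clamp.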
  std::tie(RegOp, OMod) = isOMod(MI);
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !MRI->hasOneNonDBGUser(RegOp->getReg()))

  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))

  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();

    if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
      Def->eraseFromParent();
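// tryFoldRegSequence(): on gfx90a+, a VGPR REG_SEQUENCE built from AGPR inputs whose
// single user accepts an AV (AGPR-or-VGPR) register class is rewritten to produce an
// AGPR directly, avoiding an AGPR->VGPR->AGPR round trip.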
  auto Reg = MI.getOperand(0).getReg();

  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
      !MRI->hasOneNonDBGUse(Reg))

  for (auto &Def : Defs) {
    const auto *Op = Def.first;
    if (TRI->isAGPR(*MRI, Op->getReg()))

  if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
  Op = &*MRI->use_nodbg_begin(Reg);

  if (Op->getSubReg())

      TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
  if (!OpRC || !TRI->isVectorSuperClass(OpRC))

  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
  auto Dst = MRI->createVirtualRegister(NewDstRC);
               TII->get(AMDGPU::REG_SEQUENCE), Dst);

  for (unsigned I = 0; I < Defs.size(); ++I) {
      Def->setIsKill(false);
    RS.addImm(Defs[I].second);

  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
    RS->eraseFromParent();

  if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
    MI.eraseFromParent();
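// isAGPRCopy(): checks whether Copy is an AGPR -> VGPR copy, possibly through an
// intermediate VGPR copy, and if so returns the AGPR source register and subregister.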
                       Register &OutReg, unsigned &OutSubReg) {

  if (TRI.isAGPR(MRI, CopySrcReg)) {
    OutReg = CopySrcReg;

  if (!CopySrcDef || !CopySrcDef->isCopy())

      OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
      !TRI.isAGPR(MRI, OtherCopySrcReg))

  OutReg = OtherCopySrcReg;
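// tryFoldPhiAGPR(): if every incoming value of a VGPR PHI is (or can cheaply become)
// an AGPR, the PHI is rewritten to produce an AGPR, with a single copy back to the
// original VGPR inserted after it.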
  if (!TRI->isVGPR(*MRI, PhiOut))

  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    if (!Copy || !Copy->isCopy())

    unsigned AGPRRegMask = AMDGPU::NoSubRegister;
      if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))

  bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);

  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    unsigned CopyOpc = AMDGPU::COPY;
    if (Def->isCopy()) {
      unsigned AGPRSubReg = AMDGPU::NoSubRegister;

    if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
      CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;

      InsertMBB = Def->getParent();

    Register NewReg = MRI->createVirtualRegister(ARC);
                 TII->get(CopyOpc), NewReg)

  Register NewReg = MRI->createVirtualRegister(ARC);
  PHI.getOperand(0).setReg(NewReg);
               TII->get(AMDGPU::COPY), PhiOut)
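// tryFoldLoad(): on gfx90a+, if a load result only reaches AGPRs through copies and
// REG_SEQUENCEs, the load's destination (and the intermediate registers) are reclassed
// to AGPR so the data is loaded straight into accumulator registers.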
  if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)

  while (!Users.empty()) {
    if (!I->isCopy() && !I->isRegSequence())
    Register DstReg = I->getOperand(0).getReg();
    if (TRI->isAGPR(*MRI, DstReg))
      Users.push_back(&U);

  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
  if (!TII->isOperandLegal(MI, 0, &Def)) {
    MRI->setRegClass(DefReg, RC);

  while (!MoveRegs.empty()) {
    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
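// tryOptimizeAGPRPhis(): on targets without gfx90a's full AGPR support, an AGPR that
// feeds many PHI operands would otherwise get a separate VGPR round trip per use at
// register allocation; this caches the value once through a V_ACCVGPR_READ / COPY
// pair and redirects the PHI operands to the cached copy.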
  if (ST->hasGFX90AInsts())

  for (auto &MI : MBB) {
    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))

    for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {

  bool Changed = false;
  for (const auto &[Entry, MOs] : RegToMO) {
    if (MOs.size() == 1)

        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
                 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)

    Register TempAGPR = MRI->createVirtualRegister(ARC);
                 TII->get(AMDGPU::COPY), TempAGPR)
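// runOnMachineFunction(): walks every instruction once, dispatching to the folds
// above (cndmask, zero-high-bits AND, REG_SEQUENCE, PHI, load, foldable copy,
// clamp/omod) and tracking the last value written to M0 to drop redundant M0 writes.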
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  bool Changed = false;

      Changed |= tryFoldCndMask(MI);

      if (tryFoldZeroHighBits(MI)) {

      if (MI.isRegSequence() && tryFoldRegSequence(MI)) {

      if (MI.isPHI() && tryFoldPhiAGPR(MI)) {

      if (MI.mayLoad() && tryFoldLoad(MI)) {

      if (TII->isFoldableCopy(MI)) {
        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);

      if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
        CurrentKnownM0Val = nullptr;

        Changed |= tryFoldClamp(MI);

    Changed |= tryOptimizeAGPRPhis(*MBB);