#define DEBUG_TYPE "si-fold-operands"

  unsigned DefSubReg = AMDGPU::NoSubRegister;

  FoldableDef() = delete;

              unsigned DefSubReg = AMDGPU::NoSubRegister)
      : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
      ImmToFold = FoldOp.getImm();
    } else if (FoldOp.isFI()) {
      FrameIndexToFold = FoldOp.getIndex();

              unsigned DefSubReg = AMDGPU::NoSubRegister)
      : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),

    FoldableDef Copy(*this);
    Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);

    return OpToFold->getReg();

  unsigned getSubReg() const {
    return OpToFold->getSubReg();

    return FrameIndexToFold;

  std::optional<int64_t> getEffectiveImmVal() const {

                      unsigned OpIdx) const {
    std::optional<int64_t> ImmToFold = getEffectiveImmVal();

    if (DefSubReg != AMDGPU::NoSubRegister)

    if (DefSubReg != AMDGPU::NoSubRegister)

    return TII.isOperandLegal(MI, OpIdx, OpToFold);
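// ---------------------------------------------------------------------------
// Illustrative sketch (not part of SIFoldOperands.cpp): FoldableDef bundles
// the value being folded (immediate, frame index, global or register def)
// with the register class and subregister index of its definition, and
// getWithSubReg() returns a copy viewed through a composed subregister index.
// A minimal standalone version of that pattern, using hypothetical types that
// are not the LLVM API:
#include <cstdint>
#include <variant>

namespace foldable_sketch {

struct ImmValue { int64_t Val; };
struct FrameIndex { int Idx; };

struct FoldableValue {
  std::variant<ImmValue, FrameIndex> Value;
  unsigned DefSubReg = 0; // plays the role of AMDGPU::NoSubRegister

  // Stand-in for TRI.composeSubRegIndices(); the real pass delegates this to
  // TargetRegisterInfo.
  static unsigned composeIndices(unsigned A, unsigned B) {
    return A == 0 ? B : A;
  }

  // Mirror of FoldableDef::getWithSubReg(): produce a copy of the descriptor
  // as seen through one additional subregister index.
  FoldableValue withSubReg(unsigned SubReg) const {
    FoldableValue Copy(*this);
    Copy.DefSubReg = composeIndices(DefSubReg, SubReg);
    return Copy;
  }
};

} // namespace foldable_sketch
// ---------------------------------------------------------------------------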
struct FoldCandidate {

                bool Commuted = false, int ShrinkOp = -1)
      : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
        Commuted(Commuted) {}

  bool isFI() const { return Def.isFI(); }

    return Def.FrameIndexToFold;

  bool isImm() const { return Def.isImm(); }

  bool isReg() const { return Def.isReg(); }

  bool isGlobal() const { return Def.isGlobal(); }

  bool needsShrink() const { return ShrinkOpcode != -1; }

class SIFoldOperandsImpl {

                          const FoldableDef &OpToFold) const;

  unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
    case AMDGPU::S_ADD_I32: {
      if (ST->hasAddNoCarry())
        return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
      return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
    case AMDGPU::S_OR_B32:
      return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
    case AMDGPU::S_AND_B32:
      return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
    case AMDGPU::S_MUL_I32:
      return AMDGPU::V_MUL_LO_U32_e64;
    return AMDGPU::INSTRUCTION_LIST_END;

  bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,

                           int64_t ImmVal) const;

                           int64_t ImmVal) const;

                          const FoldableDef &OpToFold) const;

  getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,

  std::pair<int64_t, const TargetRegisterClass *>

  bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;

  bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;

  SIFoldOperandsImpl() = default;

    return SIFoldOperandsImpl().run(MF);

  StringRef getPassName() const override { return "SI Fold Operands"; }

char SIFoldOperandsLegacy::ID = 0;

          TRI.getSubRegisterClass(RC, MO.getSubReg()))

  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_F16_t16_e64:
    return AMDGPU::V_FMA_F16_gfx9_t16_e64;
  case AMDGPU::V_FMAC_F16_fake16_e64:
    return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  return AMDGPU::INSTRUCTION_LIST_END;

                                            const FoldableDef &OpToFold) const {
  if (!OpToFold.isFI())

  const unsigned Opc = UseMI.getOpcode();
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U32:
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e32:
    return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e64:
    return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&

    return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);

  int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);

  int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  return OpNo == VIdx && SIdx == -1;

bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
  if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
      MRI->hasOneNonDBGUse(SrcReg)) {
    MachineInstr *Def = MRI->getVRegDef(SrcReg);
    if (!Def || Def->getNumOperands() != 4)

    MachineOperand *Src0 = &Def->getOperand(1);
    MachineOperand *Src1 = &Def->getOperand(2);

    const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
    unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
    if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
        !Def->getOperand(3).isDead())

    MachineBasicBlock *MBB = Def->getParent();
    if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
      MachineInstrBuilder Add =
      if (Add->getDesc().getNumDefs() == 2) {
        Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
        MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
      Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());

      Def->eraseFromParent();
      MI.eraseFromParent();

    assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);

    Def->eraseFromParent();
    MI.eraseFromParent();

  return new SIFoldOperandsLegacy();

bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
                                            int64_t ImmVal) const {
  const uint64_t TSFlags = MI->getDesc().TSFlags;

  int OpNo = MI->getOperandNo(&Old);

  unsigned Opcode = MI->getOpcode();
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;

bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
                                             int64_t ImmVal) const {
  MachineOperand &Old = MI->getOperand(UseOpNo);
  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;

  AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
  unsigned SrcIdx = ~0;
  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
    ModName = AMDGPU::OpName::src0_modifiers;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
    ModName = AMDGPU::OpName::src1_modifiers;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
    ModName = AMDGPU::OpName::src2_modifiers;

  assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
  int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
  MachineOperand &Mod = MI->getOperand(ModIdx);
  unsigned ModVal = Mod.getImm();

  uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;

  auto tryFoldToInline = [&](uint32_t Imm) -> bool {
    uint16_t Lo = static_cast<uint16_t>(Imm);
    uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
      Mod.setImm(NewModVal);

      if (static_cast<int16_t>(Lo) < 0) {
        int32_t SExt = static_cast<int16_t>(Lo);
          Mod.setImm(NewModVal);

      uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;

  if (tryFoldToInline(Imm))

  bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
  bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
  if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
        AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
    bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;

      uint16_t NegLo = -static_cast<uint16_t>(Imm);
      uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
      uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;

      if (tryFoldToInline(NegImm)) {
            IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
        MI->setDesc(TII->get(NegOpcode));
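// ---------------------------------------------------------------------------
// Illustrative sketch (standalone; not part of the pass).  tryFoldImmWithOpSel
// above treats the immediate as two 16-bit lanes packed into 32 bits.  The two
// rewrites it attempts are (a) swapping the halves and compensating through
// the op_sel modifier bits it writes with Mod.setImm(), and (b) negating both
// halves and flipping V_PK_ADD_U16 <-> V_PK_SUB_U16.  The lane arithmetic
// behind (b) is x + k == x - (-k) modulo 2^16 in each lane.
#include <cstdint>

static constexpr uint32_t packHalves(uint16_t Lo, uint16_t Hi) {
  return (static_cast<uint32_t>(Hi) << 16) | Lo;
}
static constexpr uint32_t swapHalves(uint32_t Imm) {
  return packHalves(static_cast<uint16_t>(Imm >> 16), // old Hi becomes Lo
                    static_cast<uint16_t>(Imm));      // old Lo becomes Hi
}
static constexpr uint32_t negateHalves(uint32_t Imm) {
  return packHalves(static_cast<uint16_t>(0u - (Imm & 0xFFFFu)),
                    static_cast<uint16_t>(0u - (Imm >> 16)));
}

static_assert(swapHalves(packHalves(0x0040, 0xFFC0)) ==
                  packHalves(0xFFC0, 0x0040),
              "halves really are exchanged; op_sel on the use undoes it");
static_assert(static_cast<uint16_t>(0x1234 + 0xFFC0) ==
                  static_cast<uint16_t>(0x1234 - 0x0040),
              "per lane, adding K equals subtracting -K (mod 2^16)");
static_assert(negateHalves(packHalves(0xFFC0, 0xFFC0)) ==
                  packHalves(0x0040, 0x0040),
              "negating both halves gives the immediate for the flipped op");
// ---------------------------------------------------------------------------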
bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);

  std::optional<int64_t> ImmVal;
    ImmVal = Fold.Def.getEffectiveImmVal();

  if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
    if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))

    int OpNo = MI->getOperandNo(&Old);
    if (!TII->isOperandLegal(*MI, OpNo, &New))

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();

    int Op32 = Fold.ShrinkOpcode;
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);

    bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
    Register NewReg0 = MRI->createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {

    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->removeOperand(I);
    MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));

      TII->commuteInstruction(*Inst32, false);

  assert(!Fold.needsShrink() && "not handled");

    if (NewMFMAOpc == -1)
    MI->setDesc(TII->get(NewMFMAOpc));
    MI->untieRegOperand(0);

    int OpNo = MI->getOperandNo(&Old);
    if (!TII->isOperandLegal(*MI, OpNo, &New))

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
                   Fold.Def.OpToFold->getOffset(),
                   Fold.Def.OpToFold->getTargetFlags());

  MachineOperand *New = Fold.Def.OpToFold;

  if (const TargetRegisterClass *OpRC =
          TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) {
    const TargetRegisterClass *NewRC =
        TRI->getRegClassForReg(*MRI, New->getReg());

    const TargetRegisterClass *ConstrainRC = OpRC;
    if (New->getSubReg()) {
          TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg());

    if (New->getReg().isVirtual() &&
        !MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
                        << TRI->getRegClassName(ConstrainRC) << '\n');

  if (New->getReg().isPhysical()) {

                                FoldCandidate &&Entry) {
  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
  LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
                    << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);

                                const FoldableDef &FoldOp,
                                bool Commuted = false, int ShrinkOp = -1) {
      FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));

bool SIFoldOperandsImpl::tryAddToFoldList(
    SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
    const FoldableDef &OpToFold) const {
  const unsigned Opc = MI->getOpcode();

  auto tryToFoldAsFMAAKorMK = [&]() {
    if (!OpToFold.isImm())

    const bool TryAK = OpNo == 3;
    const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
    MI->setDesc(TII->get(NewOpc));

    bool FoldAsFMAAKorMK =
        tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
    if (FoldAsFMAAKorMK) {
      MI->untieRegOperand(3);

      MachineOperand &Op1 = MI->getOperand(1);
      MachineOperand &Op2 = MI->getOperand(2);

  bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
  if (!IsLegal && OpToFold.isImm()) {
    if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
      IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);

    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
      MI->setDesc(TII->get(NewOpc));

      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
        MI->untieRegOperand(OpNo);
      MI->removeOperand(MI->getNumExplicitOperands() - 1);

    if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
      if (tryToFoldAsFMAAKorMK())

    if (OpToFold.isImm()) {
      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
        MI->setDesc(TII->get(ImmOpc));

    bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);

      MachineOperand &Op = MI->getOperand(OpNo);
      MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);

      if (!Op.isReg() || !CommutedOp.isReg())

      if (Op.isReg() && CommutedOp.isReg() &&
          (Op.getReg() == CommutedOp.getReg() &&

    if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))

    if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
      if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
           Opc != AMDGPU::V_SUBREV_CO_U32_e64) ||
          (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
        TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);

      MachineOperand &OtherOp = MI->getOperand(OpNo);
      if (!OtherOp.isReg() ||

    unsigned MaybeCommutedOpc = MI->getOpcode();

  if (Opc == AMDGPU::S_FMAC_F32 &&
      (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
    if (tryToFoldAsFMAAKorMK())

bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
                                         const MachineOperand &UseMO) const {
  return !TII->isSDWA(MI);

       SubDef && TII.isFoldableCopy(*SubDef);
       SubDef = MRI.getVRegDef(Sub->getReg())) {
    unsigned SrcIdx = TII.getFoldableCopySrcIdx(*SubDef);

    if (SrcOp.getSubReg())

const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
    MachineInstr &RegSeq,
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {

  const TargetRegisterClass *RC = nullptr;

    else if (!TRI->getCommonSubClass(RC, OpRC))

      Defs.emplace_back(&SrcOp, SubRegIdx);

    if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
      Defs.emplace_back(DefSrc, SubRegIdx);

    Defs.emplace_back(&SrcOp, SubRegIdx);

const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
  if (!Def || !Def->isRegSequence())

  return getRegSeqInit(*Def, Defs);

std::pair<int64_t, const TargetRegisterClass *>
SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
  const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);

  bool TryToMatchSplat64 = false;

  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    const MachineOperand *Op = Defs[I].first;

    int64_t SubImm = Op->getImm();
    if (Imm != SubImm) {
      if (I == 1 && (E & 1) == 0) {
        TryToMatchSplat64 = true;

  if (!TryToMatchSplat64)
    return {Defs[0].first->getImm(), SrcRC};

  for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {
    const MachineOperand *Op0 = Defs[I].first;
    const MachineOperand *Op1 = Defs[I + 1].first;

    unsigned SubReg0 = Defs[I].second;
    unsigned SubReg1 = Defs[I + 1].second;

    if (TRI->getChannelFromSubReg(SubReg0) + 1 !=
        TRI->getChannelFromSubReg(SubReg1))

      SplatVal64 = MergedVal;
    else if (SplatVal64 != MergedVal)

  const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(

  return {SplatVal64, RC64};
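// ---------------------------------------------------------------------------
// Illustrative sketch (standalone; simplified from isRegSeqSplat above).  When
// the per-channel 32-bit immediates of a REG_SEQUENCE are not all identical,
// the pass pairs adjacent channels (the subregister channel-adjacency check
// above) into 64-bit values, presumably with the lower channel in the low
// half, and looks for a common 64-bit splat.  A minimal standalone version of
// that pairing:
#include <cstdint>

static constexpr uint64_t make64(uint32_t Hi, uint32_t Lo) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

// Returns true and sets SplatVal if every (even, odd) channel pair merges to
// the same 64-bit value.
static bool findSplat64(const uint32_t *Chans, unsigned NumChans,
                        uint64_t &SplatVal) {
  if (NumChans == 0 || (NumChans & 1) != 0)
    return false;
  SplatVal = make64(Chans[1], Chans[0]);
  for (unsigned I = 2; I != NumChans; I += 2)
    if (make64(Chans[I + 1], Chans[I]) != SplatVal)
      return false;
  return true;
}

// Example: channels {0x0, 0x40100000, 0x0, 0x40100000} form a 64-bit splat of
// 0x4010000000000000 (the double 4.0).
static_assert(make64(0x40100000u, 0u) == 0x4010000000000000ull,
              "low channel occupies the low half of the merged value");
// ---------------------------------------------------------------------------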
bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
    MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,
    const TargetRegisterClass *SplatRC) const {
  if (UseOpIdx >= Desc.getNumOperands())

  int16_t RCID = TII->getOpRegClassID(Desc.operands()[UseOpIdx]);

  const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);

  if (SplatVal != 0 && SplatVal != -1) {
    uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);

    if (!TRI->getCommonSubClass(OpRC, SplatRC))

  if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))

bool SIFoldOperandsImpl::tryToFoldACImm(
    const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList) const {
  if (UseOpIdx >= Desc.getNumOperands())

  if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {

  if (!OpToFold.isReg())

  if (Def && TII->isFoldableCopy(*Def)) {
    MachineOperand &DefOp = Def->getOperand(1);
    if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
      FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC,
                              OpToFold.DefSubReg);

void SIFoldOperandsImpl::foldOperand(
    FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList,
    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {

  if (!isUseSafeToFold(*UseMI, *UseOp))

  if (UseOp->isReg() && OpToFold.isReg()) {
    if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
         !TRI->isSGPRReg(*MRI, OpToFold.getReg())))

    const TargetRegisterClass *SplatRC;
    std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);

    for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
      MachineOperand *RSUse = UsesToProcess[I];
      MachineInstr *RSUseMI = RSUse->getParent();

        if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
          FoldableDef SplatDef(SplatVal, SplatRC);

      if (RSUse->getSubReg() != RegSeqDstSubReg)

      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,

  if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))

  if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {

      if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=

      MachineOperand &SOff =
          *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);

          TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);

    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);

      for (unsigned MovOp :
           {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
            AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
            AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
            AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
        const MCInstrDesc &MovDesc = TII->get(MovOp);
        const TargetRegisterClass *MovDstRC =

        const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
        const TargetRegisterClass *MovSrcRC =
            TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[SrcIdx]));

          MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);

        if (MovOp == AMDGPU::AV_MOV_B32_IMM_PSEUDO &&
            (!OpToFold.isImm() ||
             !TII->isImmOperandLegal(MovDesc, SrcIdx,
                                     *OpToFold.getEffectiveImmVal())))

        if (!MRI->constrainRegClass(SrcReg, MovSrcRC))

        if (!OpToFold.isImm() ||
            !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))

      while (ImpOpI != ImpOpE) {

      if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
        MachineOperand NewSrcOp(SrcOp);

    LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "

    unsigned SubRegIdx = OpToFold.getSubReg();

    static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");

    if (SubRegIdx > AMDGPU::sub1) {
      LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
      M |= M.getLane(M.getHighestLane() - 1);
      SmallVector<unsigned, 4> Indexes;
      TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
      assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
      SubRegIdx = Indexes[0];
    } else if (TII->getOpSize(*UseMI, 1) == 4)
      SubRegIdx = AMDGPU::sub0;

    OpToFold.OpToFold->setIsKill(false);

      if (foldCopyToAGPRRegSequence(UseMI))

    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&
             AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {

      if (FoldingImmLike) {
                                          *OpToFold.DefMI, *UseMI))

        if (OpToFold.isImm()) {
                                  *OpToFold.getEffectiveImmVal());
        } else if (OpToFold.isFI())
          assert(OpToFold.isGlobal());
                          OpToFold.OpToFold->getOffset(),
                          OpToFold.OpToFold->getTargetFlags());

      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
                                          *OpToFold.DefMI, *UseMI))

      UseDesc.operands()[UseOpIdx].RegClass == -1)

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);

  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_XNOR_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    Result = LHS << (RHS & 31);
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
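// ---------------------------------------------------------------------------
// Illustrative sketch (standalone; mirrors the shift cases directly above).
// evalBinaryInstruction masks the shift amount to 5 bits, matching the 32-bit
// shift instructions, and the *REV opcodes simply take the shift amount from
// the other source operand.
#include <cstdint>

static constexpr uint32_t foldLshl(uint32_t LHS, uint32_t RHS) {
  return LHS << (RHS & 31);
}
static constexpr uint32_t foldAshr(uint32_t LHS, uint32_t RHS) {
  // Two's-complement arithmetic shift, as on the target.
  return static_cast<uint32_t>(static_cast<int32_t>(LHS) >> (RHS & 31));
}

static_assert(foldLshl(1, 33) == 2, "shift amount is masked to 5 bits");
static_assert(foldAshr(0x80000000u, 31) == 0xFFFFFFFFu,
              "ASHR replicates the sign bit");
// ---------------------------------------------------------------------------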
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

  MI.setDesc(NewDesc);

  unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
                    Desc.implicit_defs().size();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);

std::optional<int64_t>
SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {

  if (!Op.isReg() || !Op.getReg().isVirtual())
    return std::nullopt;

  const MachineInstr *Def = MRI->getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    const MachineOperand &ImmSrc = Def->getOperand(1);
      return TII->extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());

  return std::nullopt;

bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
  if (!MI->allImplicitDefsAreDead())

  unsigned Opc = MI->getOpcode();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);

  MachineOperand *Src0 = &MI->getOperand(Src0Idx);
  std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
    MI->getOperand(1).ChangeToImmediate(~*Src0Imm);

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);

  MachineOperand *Src1 = &MI->getOperand(Src1Idx);
  std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);

  if (!Src0Imm && !Src1Imm)

  if (Src0Imm && Src1Imm) {
    bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());

    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);

  if (!MI->isCommutable())

  if (Src0Imm && !Src1Imm) {

  int32_t Src1Val = static_cast<int32_t>(*Src1Imm);
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
      MI->removeOperand(Src1Idx);
    } else if (Src1Val == -1) {
      MI->removeOperand(Src1Idx);

  if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
      Opc == AMDGPU::S_AND_B32) {
      MI->removeOperand(Src0Idx);
    } else if (Src1Val == -1) {
      MI->removeOperand(Src1Idx);

  if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
      Opc == AMDGPU::S_XOR_B32) {
      MI->removeOperand(Src1Idx);
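// ---------------------------------------------------------------------------
// Illustrative sketch (standalone).  When only one source of the AND/OR/XOR is
// a known constant, tryConstantFoldOp above does not evaluate the operation;
// it simplifies the instruction using these bitwise identities and drops the
// now-redundant operand:
static_assert((0xDEADBEEFu | 0u) == 0xDEADBEEFu, "x | 0 == x");
static_assert((0xDEADBEEFu | ~0u) == ~0u, "x | -1 == -1");
static_assert((0xDEADBEEFu & ~0u) == 0xDEADBEEFu, "x & -1 == x");
static_assert((0xDEADBEEFu & 0u) == 0u, "x & 0 == 0");
static_assert((0xDEADBEEFu ^ 0u) == 0xDEADBEEFu, "x ^ 0 == x");
// ---------------------------------------------------------------------------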
bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)

  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

  std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);

  std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
  if (!Src0Imm || *Src0Imm != *Src1Imm)

      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))

  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    MI.removeOperand(Src2Idx);
  MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
  if (Src1ModIdx != -1)
    MI.removeOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.removeOperand(Src0ModIdx);
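// ---------------------------------------------------------------------------
// Illustrative sketch (standalone; why tryFoldCndMask above is valid).
// V_CNDMASK_B32 is a per-lane select: dst = cond ? src1 : src0.  When both
// sources are (or are defined by) the same immediate and no source modifiers
// are set, the result no longer depends on the condition, so the instruction
// can be rewritten as a plain move and the extra operands dropped.
#include <cstdint>

static constexpr uint32_t cndmask(bool Cond, uint32_t Src0, uint32_t Src1) {
  return Cond ? Src1 : Src0;
}

static_assert(cndmask(false, 0x3F800000u, 0x3F800000u) ==
                  cndmask(true, 0x3F800000u, 0x3F800000u),
              "identical sources make the select condition-independent");
// ---------------------------------------------------------------------------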
bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)

  std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
  if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())

  MachineInstr *SrcDef = MRI->getVRegDef(Src1);

  MRI->replaceRegWith(Dst, Src1);
  if (!MI.getOperand(2).isKill())
    MRI->clearKillFlags(Src1);
  MI.eraseFromParent();

bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
                                         const FoldableDef &OpToFold) const {

  SmallVector<MachineInstr *, 4> CopiesToReplace;

  MachineOperand &Dst = MI.getOperand(0);
  if (OpToFold.isImm()) {
        if (tryConstantFoldOp(&UseMI)) {

  for (auto *U : UsesToProcess) {
    MachineInstr *UseMI = U->getParent();
    FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());

  if (CopiesToReplace.empty() && FoldList.empty())

  MachineFunction *MF = MI.getParent()->getParent();

  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  SetVector<MachineInstr *> ConstantFoldCandidates;
  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.Def.OpToFold);
    if (Fold.isReg() && Fold.getReg().isVirtual()) {
      const MachineInstr *DefMI = Fold.Def.DefMI;

      assert(Fold.Def.OpToFold && Fold.isReg());
      MRI->clearKillFlags(Fold.getReg());
                        << static_cast<int>(Fold.UseOpNo) << " of "

        ConstantFoldCandidates.insert(Fold.UseMI);

    } else if (Fold.Commuted) {
      TII->commuteInstruction(*Fold.UseMI, false);

  for (MachineInstr *MI : ConstantFoldCandidates) {
    if (tryConstantFoldOp(MI)) {

bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
  const TargetRegisterClass *DefRC =
  if (!TRI->isAGPRClass(DefRC))

  MachineInstr *RegSeq = MRI->getVRegDef(UseReg);

  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;

  const TargetRegisterClass *UseRC =

  unsigned NumFoldable = 0;

  for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
    const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
        DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);

    const TargetRegisterClass *InputRC =

    const TargetRegisterClass *MatchRC =
        TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);

  if (NumFoldable == 0)

  CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));

  for (auto [Def, DestSubIdx] : NewDefs) {
    if (!Def->isReg()) {
      Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)

      Def->setIsKill(false);

      Register &VGPRCopy = VGPRCopies[Src];
        const TargetRegisterClass *VGPRUseSubRC =
            TRI->getSubRegisterClass(UseRC, DestSubIdx);

        const TargetRegisterClass *SubRC =
            TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);

        VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);

    B.addImm(DestSubIdx);

bool SIFoldOperandsImpl::tryFoldFoldableCopy(
    MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {

  if (DstReg == AMDGPU::M0) {
    MachineOperand &NewM0Val = MI.getOperand(1);
    if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
      MI.eraseFromParent();

  MachineOperand *OpToFoldPtr;
  if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
    if (TII->hasAnyModifiersSet(MI))
    OpToFoldPtr = &MI.getOperand(2);
    OpToFoldPtr = &MI.getOperand(1);
  MachineOperand &OpToFold = *OpToFoldPtr;

  if (!FoldingImm && !OpToFold.isReg())

      !TRI->isConstantPhysReg(OpToFold.getReg()))

  const TargetRegisterClass *DstRC =
      MRI->getRegClass(MI.getOperand(0).getReg());

  if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
    if (DstRC == &AMDGPU::SReg_32RegClass &&
        DstRC == MRI->getRegClass(OpToFold.getReg())) {

  if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
    if (foldCopyToAGPRRegSequence(&MI))

  FoldableDef Def(OpToFold, DstRC);
  bool Changed = foldInstOperand(MI, Def);

  auto *InstToErase = &MI;
  while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    auto &SrcOp = InstToErase->getOperand(1);
    InstToErase->eraseFromParent();
    InstToErase = nullptr;
    InstToErase = MRI->getVRegDef(SrcReg);
    if (!InstToErase || !TII->isFoldableCopy(*InstToErase))

  if (InstToErase && InstToErase->isRegSequence() &&
      MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    InstToErase->eraseFromParent();

  return OpToFold.isReg() &&
         foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);

const MachineOperand *
SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_t16_e64:
  case AMDGPU::V_MAX_F16_fake16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_MAX_NUM_F64_e64:
  case AMDGPU::V_PK_MAX_F16:
  case AMDGPU::V_MAX_BF16_PSEUDO_e64:
  case AMDGPU::V_PK_MAX_NUM_BF16: {
    if (MI.mayRaiseFPException())

    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())

    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

        Src0->getSubReg() != AMDGPU::NoSubRegister)

    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))

        = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
        = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    unsigned UnsetMods =
        (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)

bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))

  if (TII->getClampMask(*Def) != TII->getClampMask(MI))

  if (Def->mayRaiseFPException())

  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);

  Register MIDstReg = MI.getOperand(0).getReg();
  if (TRI->isSGPRReg(*MRI, DefReg)) {

  MRI->replaceRegWith(MIDstReg, DefReg);

  MI.eraseFromParent();

  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F64_pseudo_e64: {
    case 0x3fe0000000000000:
    case 0x4000000000000000:
    case 0x4010000000000000:
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64: {
    switch (static_cast<uint16_t>(Val)) {
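// ---------------------------------------------------------------------------
// Illustrative sketch (standalone; the constants matched by getOModValue
// above).  The three 64-bit literals in the f64 case are exactly the IEEE-754
// double bit patterns of 0.5, 2.0 and 4.0 -- the multipliers an output
// modifier can express (div:2, mul:2, mul:4).
#include <cassert>
#include <cstdint>
#include <cstring>

static uint64_t doubleBits(double D) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));
  return Bits;
}

// Call from any test driver to confirm the correspondence.
static void checkOModConstants() {
  assert(doubleBits(0.5) == 0x3fe0000000000000ull); // div:2
  assert(doubleBits(2.0) == 0x4000000000000000ull); // mul:2
  assert(doubleBits(4.0) == 0x4010000000000000ull); // mul:4
}
// ---------------------------------------------------------------------------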
std::pair<const MachineOperand *, int>
SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F64_pseudo_e64:
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64:
  case AMDGPU::V_MUL_F16_e64: {
    if ((Op == AMDGPU::V_MUL_F32_e64 &&
        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
          Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
          Op == AMDGPU::V_MUL_F16_fake16_e64) &&
        MI.mayRaiseFPException())

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
    } else if (Src1->isImm()) {

        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))

    return std::pair(RegOp, OMod);

  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F64_pseudo_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_t16_e64:
  case AMDGPU::V_ADD_F16_fake16_e64: {
    if ((Op == AMDGPU::V_ADD_F32_e64 &&
        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
          Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
          Op == AMDGPU::V_ADD_F16_fake16_e64) &&

    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))

bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;

  std::tie(RegOp, OMod) = isOMod(MI);
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !MRI->hasOneNonDBGUser(RegOp->getReg()))

  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);

  if (Def->mayRaiseFPException())

  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))

  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());

  MRI->clearKillFlags(Def->getOperand(0).getReg());
  MI.eraseFromParent();

  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {

  auto Reg = MI.getOperand(0).getReg();

      !MRI->hasOneNonDBGUse(Reg))

  if (!getRegSeqInit(Defs, Reg))

  for (auto &[Op, SubIdx] : Defs) {
    if (TRI->isAGPR(*MRI, Op->getReg()))

    const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());

  MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
  MachineInstr *UseMI = Op->getParent();

  if (Op->getSubReg())

  const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI);
  if (!OpRC || !TRI->isVectorSuperClass(OpRC))

  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
  auto Dst = MRI->createVirtualRegister(NewDstRC);
                 TII->get(AMDGPU::REG_SEQUENCE), Dst);

  for (auto &[Def, SubIdx] : Defs) {
    Def->setIsKill(false);

    MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());

  RS->eraseFromParent();

  if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
    MI.eraseFromParent();

                       Register &OutReg, unsigned &OutSubReg) {

  if (TRI.isAGPR(MRI, CopySrcReg)) {
    OutReg = CopySrcReg;

  if (!CopySrcDef || !CopySrcDef->isCopy())

      OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
      !TRI.isAGPR(MRI, OtherCopySrcReg))

  OutReg = OtherCopySrcReg;

bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {

  if (!TRI->isVGPR(*MRI, PhiOut))

  const TargetRegisterClass *ARC = nullptr;
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);

    if (!Copy || !Copy->isCopy())

    unsigned AGPRRegMask = AMDGPU::NoSubRegister;

    const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
    if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))

  bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);

  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);

    MachineBasicBlock *InsertMBB = nullptr;

    unsigned CopyOpc = AMDGPU::COPY;
    if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
      if (Def->isCopy()) {
        unsigned AGPRSubReg = AMDGPU::NoSubRegister;

        MachineOperand &CopyIn = Def->getOperand(1);
          CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;

      InsertMBB = Def->getParent();

    Register NewReg = MRI->createVirtualRegister(ARC);
    MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
                               TII->get(CopyOpc), NewReg)

  Register NewReg = MRI->createVirtualRegister(ARC);
  PHI.getOperand(0).setReg(NewReg);

          TII->get(AMDGPU::COPY), PhiOut)

bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {

  MachineOperand &Def = MI.getOperand(0);

  while (!Users.empty()) {
    const MachineInstr *I = Users.pop_back_val();
    if (!I->isCopy() && !I->isRegSequence())

    Register DstReg = I->getOperand(0).getReg();
    if (TRI->isAGPR(*MRI, DstReg))
    for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
      Users.push_back(&U);

  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
  if (!TII->isOperandLegal(MI, 0, &Def)) {
    MRI->setRegClass(DefReg, RC);

  while (!MoveRegs.empty()) {
    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));

bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {

  DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>

  for (auto &MI : MBB) {
    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))

    for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
      MachineOperand &PhiMO = MI.getOperand(K);

  for (const auto &[Entry, MOs] : RegToMO) {
    if (MOs.size() == 1)

    MachineInstr *Def = MRI->getVRegDef(Reg);
    MachineBasicBlock *DefMBB = Def->getParent();

        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
    MachineInstr *VGPRCopy =
                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)

    Register TempAGPR = MRI->createVirtualRegister(ARC);
            TII->get(AMDGPU::COPY), TempAGPR)

    for (MachineOperand *MO : MOs) {

bool SIFoldOperandsImpl::run(MachineFunction &MF) {

  TRI = &TII->getRegisterInfo();
  MFI = MF.getInfo<SIMachineFunctionInfo>();

  MachineOperand *CurrentKnownM0Val = nullptr;

      if (tryFoldZeroHighBits(MI)) {

      if (MI.isRegSequence() && tryFoldRegSequence(MI)) {

      if (MI.isPHI() && tryFoldPhiAGPR(MI)) {

      if (MI.mayLoad() && tryFoldLoad(MI)) {

      if (TII->isFoldableCopy(MI)) {
        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);

      if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
        CurrentKnownM0Val = nullptr;

  bool Changed = SIFoldOperandsImpl().run(MF);