23#define DEBUG_TYPE "si-fold-operands"
44 unsigned DefSubReg = AMDGPU::NoSubRegister;
// A FoldableDef cannot be default-constructed: the default constructor is
// explicitly deleted, so an instance has to come from one of the other
// constructors or from a copy.
49 FoldableDef() =
delete;
51 unsigned DefSubReg = AMDGPU::NoSubRegister)
52 : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.
getType()) {
55 ImmToFold = FoldOp.
getImm();
56 }
else if (FoldOp.
isFI()) {
57 FrameIndexToFold = FoldOp.
getIndex();
67 unsigned DefSubReg = AMDGPU::NoSubRegister)
68 : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
73 FoldableDef Copy(*
this);
74 Copy.DefSubReg =
TRI.composeSubRegIndices(DefSubReg, SubReg);
82 return OpToFold->getReg();
85 unsigned getSubReg()
const {
87 return OpToFold->getSubReg();
98 return FrameIndexToFold;
106 std::optional<int64_t> getEffectiveImmVal()
const {
114 unsigned OpIdx)
const {
117 std::optional<int64_t> ImmToFold = getEffectiveImmVal();
127 if (DefSubReg != AMDGPU::NoSubRegister)
135 if (DefSubReg != AMDGPU::NoSubRegister)
137 return TII.isOperandLegal(
MI,
OpIdx, OpToFold);
144struct FoldCandidate {
152 bool Commuted =
false,
int ShrinkOp = -1)
153 :
UseMI(
MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
154 Commuted(Commuted) {}
// Convenience query on the candidate: forwards to Def.isFI() and returns its
// answer unchanged.
156 bool isFI()
const {
return Def.isFI(); }
160 return Def.FrameIndexToFold;
// Convenience query on the candidate: forwards to Def.isImm() and returns its
// answer unchanged.
163 bool isImm()
const {
return Def.isImm(); }
// Convenience query on the candidate: forwards to Def.isReg() and returns its
// answer unchanged.
165 bool isReg()
const {
return Def.isReg(); }
// Convenience query on the candidate: forwards to Def.isGlobal() and returns
// its answer unchanged.
169 bool isGlobal()
const {
return Def.isGlobal(); }
// Reports whether a shrink opcode was recorded for this candidate.
// ShrinkOpcode is initialized from the constructor's ShrinkOp argument, whose
// default is -1, so -1 is the "no shrink opcode supplied" value tested here.
171 bool needsShrink()
const {
return ShrinkOpcode != -1; }
174class SIFoldOperandsImpl {
184 const FoldableDef &OpToFold)
const;
// Maps a scalar ALU opcode (Opc) to a vector ALU opcode.
//
// UseVOP3 (default false) chooses the _e64 opcode over the _e32 opcode for
// the add, or and and cases. The S_MUL_I32 case returns V_MUL_LO_U32_e64
// without consulting UseVOP3.
//
// NOTE(review): the switch header and any default label are not visible in
// this excerpt. The trailing INSTRUCTION_LIST_END return is presumably the
// "no mapping" result for every other opcode — confirm against the full file.
187 unsigned convertToVALUOp(
unsigned Opc,
bool UseVOP3 =
false)
const {
189 case AMDGPU::S_ADD_I32: {
// With no-carry add support the plain V_ADD_U32 form is returned; otherwise
// the V_ADD_CO_U32 form is returned.
190 if (ST->hasAddNoCarryInsts())
191 return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
192 return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
194 case AMDGPU::S_OR_B32:
195 return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
196 case AMDGPU::S_AND_B32:
197 return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
198 case AMDGPU::S_MUL_I32:
199 return AMDGPU::V_MUL_LO_U32_e64;
201 return AMDGPU::INSTRUCTION_LIST_END;
205 bool foldCopyToVGPROfScalarAddOfFrameIndex(
Register DstReg,
Register SrcReg,
211 int64_t ImmVal)
const;
215 int64_t ImmVal)
const;
219 const FoldableDef &OpToFold)
const;
228 getRegSeqInit(
SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
231 std::pair<int64_t, const TargetRegisterClass *>
248 bool foldInstOperand(
MachineInstr &
MI,
const FoldableDef &OpToFold)
const;
250 bool foldCopyToAGPRRegSequence(
MachineInstr *CopyMI)
const;
257 std::pair<const MachineOperand *, int> isOMod(
const MachineInstr &
MI)
const;
// Compiler-generated default constructor. The file relies on it to build a
// temporary implementation object on which run(MF) is invoked.
266 SIFoldOperandsImpl() =
default;
280 return SIFoldOperandsImpl().run(MF);
// Overrides the base-class getPassName() and returns the fixed name string
// for this pass.
283 StringRef getPassName()
const override {
return "SI Fold Operands"; }
// Out-of-class definition of the ID member of SIFoldOperandsLegacy,
// initialized to 0.
// NOTE(review): presumably the pass-identification token whose address (not
// value) identifies the pass — confirm against the class declaration.
300char SIFoldOperandsLegacy::
ID = 0;
309 TRI.getSubRegisterClass(RC, MO.getSubReg()))
317 case AMDGPU::V_MAC_F32_e64:
318 return AMDGPU::V_MAD_F32_e64;
319 case AMDGPU::V_MAC_F16_e64:
320 return AMDGPU::V_MAD_F16_e64;
321 case AMDGPU::V_FMAC_F32_e64:
322 return AMDGPU::V_FMA_F32_e64;
323 case AMDGPU::V_FMAC_F16_e64:
324 return AMDGPU::V_FMA_F16_gfx9_e64;
325 case AMDGPU::V_FMAC_F16_t16_e64:
326 return AMDGPU::V_FMA_F16_gfx9_t16_e64;
327 case AMDGPU::V_FMAC_F16_fake16_e64:
328 return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
329 case AMDGPU::V_FMAC_LEGACY_F32_e64:
330 return AMDGPU::V_FMA_LEGACY_F32_e64;
331 case AMDGPU::V_FMAC_F64_e64:
332 return AMDGPU::V_FMA_F64_e64;
334 return AMDGPU::INSTRUCTION_LIST_END;
340 const FoldableDef &OpToFold)
const {
341 if (!OpToFold.isFI())
344 const unsigned Opc =
UseMI.getOpcode();
346 case AMDGPU::S_ADD_I32:
347 case AMDGPU::S_ADD_U32:
348 case AMDGPU::V_ADD_U32_e32:
349 case AMDGPU::V_ADD_CO_U32_e32:
353 return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
355 case AMDGPU::V_ADD_U32_e64:
356 case AMDGPU::V_ADD_CO_U32_e64:
357 return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
364 return OpNo == AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::vaddr);
368 int SIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::saddr);
372 int VIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::vaddr);
373 return OpNo == VIdx && SIdx == -1;
379bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
381 if (
TRI->isVGPR(*MRI, DstReg) &&
TRI->isSGPRReg(*MRI, SrcReg) &&
384 if (!Def ||
Def->getNumOperands() != 4)
387 MachineOperand *Src0 = &
Def->getOperand(1);
388 MachineOperand *Src1 = &
Def->getOperand(2);
399 const bool UseVOP3 = !Src0->
isImm() ||
TII->isInlineConstant(*Src0);
400 unsigned NewOp = convertToVALUOp(
Def->getOpcode(), UseVOP3);
401 if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
402 !
Def->getOperand(3).isDead())
405 MachineBasicBlock *
MBB =
Def->getParent();
407 if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
408 MachineInstrBuilder
Add =
411 if (
Add->getDesc().getNumDefs() == 2) {
413 Add.addDef(CarryOutReg, RegState::Dead);
417 Add.add(*Src0).add(*Src1).setMIFlags(
Def->getFlags());
421 Def->eraseFromParent();
422 MI.eraseFromParent();
426 assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
437 Def->eraseFromParent();
438 MI.eraseFromParent();
447 return new SIFoldOperandsLegacy();
450bool SIFoldOperandsImpl::canUseImmWithOpSel(
const MachineInstr *
MI,
452 int64_t ImmVal)
const {
453 const uint64_t TSFlags =
MI->getDesc().TSFlags;
461 int OpNo =
MI->getOperandNo(&Old);
463 unsigned Opcode =
MI->getOpcode();
464 uint8_t OpType =
TII->get(Opcode).operands()[OpNo].OperandType;
486bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *
MI,
unsigned UseOpNo,
487 int64_t ImmVal)
const {
488 MachineOperand &Old =
MI->getOperand(UseOpNo);
489 unsigned Opcode =
MI->getOpcode();
490 int OpNo =
MI->getOperandNo(&Old);
491 uint8_t OpType =
TII->get(Opcode).operands()[OpNo].OperandType;
503 AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
504 unsigned SrcIdx = ~0;
505 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
506 ModName = AMDGPU::OpName::src0_modifiers;
508 }
else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
509 ModName = AMDGPU::OpName::src1_modifiers;
511 }
else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
512 ModName = AMDGPU::OpName::src2_modifiers;
515 assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
516 int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
517 MachineOperand &
Mod =
MI->getOperand(ModIdx);
518 unsigned ModVal =
Mod.getImm();
524 uint32_t
Imm = (
static_cast<uint32_t
>(ImmHi) << 16) | ImmLo;
529 auto tryFoldToInline = [&](uint32_t
Imm) ->
bool {
538 uint16_t
Lo =
static_cast<uint16_t
>(
Imm);
539 uint16_t
Hi =
static_cast<uint16_t
>(
Imm >> 16);
542 Mod.setImm(NewModVal);
547 if (
static_cast<int16_t
>(
Lo) < 0) {
548 int32_t SExt =
static_cast<int16_t
>(
Lo);
550 Mod.setImm(NewModVal);
565 uint32_t Swapped = (
static_cast<uint32_t
>(
Lo) << 16) |
Hi;
576 if (tryFoldToInline(Imm))
585 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
586 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
587 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
589 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
590 bool Clamp =
MI->getOperand(ClampIdx).getImm() != 0;
593 uint16_t NegLo = -
static_cast<uint16_t
>(
Imm);
594 uint16_t NegHi = -
static_cast<uint16_t
>(
Imm >> 16);
595 uint32_t NegImm = (
static_cast<uint32_t
>(NegHi) << 16) | NegLo;
597 if (tryFoldToInline(NegImm)) {
599 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
600 MI->setDesc(
TII->get(NegOpcode));
609bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold)
const {
610 MachineInstr *
MI = Fold.UseMI;
611 MachineOperand &Old =
MI->getOperand(Fold.UseOpNo);
614 std::optional<int64_t> ImmVal;
616 ImmVal = Fold.Def.getEffectiveImmVal();
618 if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
619 if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))
625 int OpNo =
MI->getOperandNo(&Old);
626 if (!
TII->isOperandLegal(*
MI, OpNo, &New))
632 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
633 MachineBasicBlock *
MBB =
MI->getParent();
640 int Op32 = Fold.ShrinkOpcode;
641 MachineOperand &Dst0 =
MI->getOperand(0);
642 MachineOperand &Dst1 =
MI->getOperand(1);
650 MachineInstr *Inst32 =
TII->buildShrunkInst(*
MI, Op32);
652 if (HaveNonDbgCarryUse) {
655 .
addReg(AMDGPU::VCC, RegState::Kill);
665 for (
unsigned I =
MI->getNumOperands() - 1;
I > 0; --
I)
666 MI->removeOperand(
I);
667 MI->setDesc(
TII->get(AMDGPU::IMPLICIT_DEF));
670 TII->commuteInstruction(*Inst32,
false);
674 assert(!Fold.needsShrink() &&
"not handled");
679 if (NewMFMAOpc == -1)
681 MI->setDesc(
TII->get(NewMFMAOpc));
682 MI->untieRegOperand(0);
683 const MCInstrDesc &MCID =
MI->getDesc();
684 for (
unsigned I = 0;
I <
MI->getNumDefs(); ++
I)
686 MI->getOperand(
I).setIsEarlyClobber(
true);
691 int OpNo =
MI->getOperandNo(&Old);
692 if (!
TII->isOperandLegal(*
MI, OpNo, &New))
699 if (Fold.isGlobal()) {
700 Old.
ChangeToGA(Fold.Def.OpToFold->getGlobal(),
701 Fold.Def.OpToFold->getOffset(),
702 Fold.Def.OpToFold->getTargetFlags());
711 MachineOperand *
New = Fold.Def.OpToFold;
714 if (
const TargetRegisterClass *OpRC =
715 TII->getRegClass(
MI->getDesc(), Fold.UseOpNo)) {
716 const TargetRegisterClass *NewRC =
717 TRI->getRegClassForReg(*MRI,
New->getReg());
719 const TargetRegisterClass *ConstrainRC = OpRC;
720 if (
New->getSubReg()) {
722 TRI->getMatchingSuperRegClass(NewRC, OpRC,
New->getSubReg());
728 if (
New->getReg().isVirtual() &&
731 <<
TRI->getRegClassName(ConstrainRC) <<
'\n');
738 if (Old.
getSubReg() == AMDGPU::lo16 &&
TRI->isSGPRReg(*MRI,
New->getReg()))
740 if (
New->getReg().isPhysical()) {
750 FoldCandidate &&Entry) {
752 for (FoldCandidate &Fold : FoldList)
753 if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
755 LLVM_DEBUG(
dbgs() <<
"Append " << (Entry.Commuted ?
"commuted" :
"normal")
756 <<
" operand " << Entry.UseOpNo <<
"\n " << *Entry.UseMI);
762 const FoldableDef &FoldOp,
763 bool Commuted =
false,
int ShrinkOp = -1) {
765 FoldCandidate(
MI, OpNo, FoldOp, Commuted, ShrinkOp));
773 if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput())
783 const FoldableDef &OpToFold) {
784 assert(OpToFold.isImm() &&
"Expected immediate operand");
785 uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();
791bool SIFoldOperandsImpl::tryAddToFoldList(
792 SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *
MI,
unsigned OpNo,
793 const FoldableDef &OpToFold)
const {
794 const unsigned Opc =
MI->getOpcode();
796 auto tryToFoldAsFMAAKorMK = [&]() {
797 if (!OpToFold.isImm())
800 const bool TryAK = OpNo == 3;
801 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
802 MI->setDesc(
TII->get(NewOpc));
805 bool FoldAsFMAAKorMK =
806 tryAddToFoldList(FoldList,
MI, TryAK ? 3 : 2, OpToFold);
807 if (FoldAsFMAAKorMK) {
809 MI->untieRegOperand(3);
812 MachineOperand &Op1 =
MI->getOperand(1);
813 MachineOperand &Op2 =
MI->getOperand(2);
830 bool IsLegal = OpToFold.isOperandLegal(*
TII, *
MI, OpNo);
831 if (!IsLegal && OpToFold.isImm()) {
832 if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
833 IsLegal = canUseImmWithOpSel(
MI, OpNo, *ImmVal);
839 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
842 MI->setDesc(
TII->get(NewOpc));
847 bool FoldAsMAD = tryAddToFoldList(FoldList,
MI, OpNo, OpToFold);
849 MI->untieRegOperand(OpNo);
853 MI->removeOperand(
MI->getNumExplicitOperands() - 1);
859 if (
Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
860 if (tryToFoldAsFMAAKorMK())
865 if (OpToFold.isImm()) {
867 if (
Opc == AMDGPU::S_SETREG_B32)
868 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
869 else if (
Opc == AMDGPU::S_SETREG_B32_mode)
870 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
872 MI->setDesc(
TII->get(ImmOpc));
881 bool CanCommute =
TII->findCommutedOpIndices(*
MI, OpNo, CommuteOpNo);
885 MachineOperand &
Op =
MI->getOperand(OpNo);
886 MachineOperand &CommutedOp =
MI->getOperand(CommuteOpNo);
892 if (!
Op.isReg() || !CommutedOp.
isReg())
897 if (
Op.isReg() && CommutedOp.
isReg() &&
898 (
Op.getReg() == CommutedOp.
getReg() &&
902 if (!
TII->commuteInstruction(*
MI,
false, OpNo, CommuteOpNo))
906 if (!OpToFold.isOperandLegal(*
TII, *
MI, CommuteOpNo)) {
907 if ((
Opc != AMDGPU::V_ADD_CO_U32_e64 &&
Opc != AMDGPU::V_SUB_CO_U32_e64 &&
908 Opc != AMDGPU::V_SUBREV_CO_U32_e64) ||
909 (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
910 TII->commuteInstruction(*
MI,
false, OpNo, CommuteOpNo);
916 MachineOperand &OtherOp =
MI->getOperand(OpNo);
917 if (!OtherOp.
isReg() ||
924 unsigned MaybeCommutedOpc =
MI->getOpcode();
938 if (
Opc == AMDGPU::S_FMAC_F32 &&
939 (OpNo != 1 || !
MI->getOperand(1).isIdenticalTo(
MI->getOperand(2)))) {
940 if (tryToFoldAsFMAAKorMK())
946 if (OpToFold.isImm() &&
// Decides whether an operand of MI may be folded into.
//
// The visible body returns true exactly when TII->isSDWA(MI) is false. UseMO
// is accepted but not referenced by any visible line.
// NOTE(review): listing line 957 is missing from this excerpt; confirm that
// nothing else participates in the decision.
955bool SIFoldOperandsImpl::isUseSafeToFold(
const MachineInstr &
MI,
956 const MachineOperand &UseMO)
const {
958 return !
TII->isSDWA(
MI);
966 SubDef &&
TII.isFoldableCopy(*SubDef);
968 unsigned SrcIdx =
TII.getFoldableCopySrcIdx(*SubDef);
977 if (
SrcOp.getSubReg())
984const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
985 MachineInstr &RegSeq,
986 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs)
const {
990 const TargetRegisterClass *RC =
nullptr;
997 const TargetRegisterClass *OpRC =
getRegOpRC(*MRI, *
TRI, SrcOp);
1000 else if (!
TRI->getCommonSubClass(RC, OpRC))
1005 Defs.emplace_back(&SrcOp, SubRegIdx);
1010 if (DefSrc && (DefSrc->
isReg() || DefSrc->
isImm())) {
1011 Defs.emplace_back(DefSrc, SubRegIdx);
1015 Defs.emplace_back(&SrcOp, SubRegIdx);
// Overload of getRegSeqInit that forwards to the MachineInstr-taking overload.
//
// When Def is non-null and Def->isRegSequence() holds, the result of
// getRegSeqInit(*Def, Defs) is returned, with Defs passed through for that
// overload to fill.
// NOTE(review): the remaining parameter(s), the lookup that produces Def and
// the value returned when the guard fails are not visible in this excerpt.
1024const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
1025 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
1028 if (!Def || !
Def->isRegSequence())
1031 return getRegSeqInit(*Def, Defs);
1034std::pair<int64_t, const TargetRegisterClass *>
1035SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq)
const {
1037 const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
1041 bool TryToMatchSplat64 =
false;
1044 for (
unsigned I = 0,
E = Defs.
size();
I !=
E; ++
I) {
1045 const MachineOperand *
Op = Defs[
I].first;
1049 int64_t SubImm =
Op->getImm();
1055 if (Imm != SubImm) {
1056 if (
I == 1 && (
E & 1) == 0) {
1059 TryToMatchSplat64 =
true;
1067 if (!TryToMatchSplat64)
1068 return {Defs[0].first->getImm(), SrcRC};
1073 for (
unsigned I = 0,
E = Defs.
size();
I !=
E;
I += 2) {
1074 const MachineOperand *Op0 = Defs[
I].first;
1075 const MachineOperand *Op1 = Defs[
I + 1].first;
1080 unsigned SubReg0 = Defs[
I].second;
1081 unsigned SubReg1 = Defs[
I + 1].second;
1085 if (
TRI->getChannelFromSubReg(SubReg0) + 1 !=
1086 TRI->getChannelFromSubReg(SubReg1))
1091 SplatVal64 = MergedVal;
1092 else if (SplatVal64 != MergedVal)
1096 const TargetRegisterClass *RC64 =
TRI->getSubRegisterClass(
1099 return {SplatVal64, RC64};
1102bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
1103 MachineInstr *
UseMI,
unsigned UseOpIdx, int64_t SplatVal,
1104 const TargetRegisterClass *SplatRC)
const {
1106 if (UseOpIdx >=
Desc.getNumOperands())
1113 int16_t RCID =
TII->getOpRegClassID(
Desc.operands()[UseOpIdx]);
1117 const TargetRegisterClass *OpRC =
TRI->getRegClass(RCID);
1122 if (SplatVal != 0 && SplatVal != -1) {
1126 uint8_t OpTy =
Desc.operands()[UseOpIdx].OperandType;
1132 OpRC =
TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
1137 OpRC =
TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
1143 if (!
TRI->getCommonSubClass(OpRC, SplatRC))
1148 if (!
TII->isOperandLegal(*
UseMI, UseOpIdx, &TmpOp))
1154bool SIFoldOperandsImpl::tryToFoldACImm(
1155 const FoldableDef &OpToFold, MachineInstr *
UseMI,
unsigned UseOpIdx,
1156 SmallVectorImpl<FoldCandidate> &FoldList)
const {
1158 if (UseOpIdx >=
Desc.getNumOperands())
1165 if (OpToFold.isImm() && OpToFold.isOperandLegal(*
TII, *
UseMI, UseOpIdx)) {
1176void SIFoldOperandsImpl::foldOperand(
1177 FoldableDef OpToFold, MachineInstr *
UseMI,
int UseOpIdx,
1178 SmallVectorImpl<FoldCandidate> &FoldList,
1179 SmallVectorImpl<MachineInstr *> &CopiesToReplace)
const {
1182 if (!isUseSafeToFold(*
UseMI, *UseOp))
1186 if (UseOp->
isReg() && OpToFold.isReg()) {
1190 if (UseOp->
getSubReg() != AMDGPU::NoSubRegister &&
1192 !
TRI->isSGPRReg(*MRI, OpToFold.getReg())))
1204 const TargetRegisterClass *SplatRC;
1205 std::tie(SplatVal, SplatRC) = isRegSeqSplat(*
UseMI);
1210 for (
unsigned I = 0;
I != UsesToProcess.size(); ++
I) {
1211 MachineOperand *RSUse = UsesToProcess[
I];
1212 MachineInstr *RSUseMI = RSUse->
getParent();
1222 if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
1223 FoldableDef SplatDef(SplatVal, SplatRC);
1230 if (RSUse->
getSubReg() != RegSeqDstSubReg)
1235 foldOperand(OpToFold, RSUseMI, RSUseMI->
getOperandNo(RSUse), FoldList,
1242 if (tryToFoldACImm(OpToFold,
UseMI, UseOpIdx, FoldList))
1245 if (frameIndexMayFold(*
UseMI, UseOpIdx, OpToFold)) {
1250 if (
TII->getNamedOperand(*
UseMI, AMDGPU::OpName::srsrc)->getReg() !=
1256 MachineOperand &SOff =
1257 *
TII->getNamedOperand(*
UseMI, AMDGPU::OpName::soffset);
1268 TII->getNamedOperand(*
UseMI, AMDGPU::OpName::cpol)->getImm();
1283 bool FoldingImmLike =
1284 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1292 const TargetRegisterClass *SrcRC = MRI->
getRegClass(SrcReg);
1300 const TargetRegisterClass *DestRC =
TRI->getRegClassForReg(*MRI, DestReg);
1303 for (
unsigned MovOp :
1304 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
1305 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
1306 AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
1307 AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
1308 const MCInstrDesc &MovDesc =
TII->get(MovOp);
1309 const TargetRegisterClass *MovDstRC =
1318 const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
1320 int16_t RegClassID =
TII->getOpRegClassID(MovDesc.
operands()[SrcIdx]);
1321 if (RegClassID != -1) {
1322 const TargetRegisterClass *MovSrcRC =
TRI->getRegClass(RegClassID);
1325 MovSrcRC =
TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
1329 if (MovOp == AMDGPU::AV_MOV_B32_IMM_PSEUDO &&
1330 (!OpToFold.isImm() ||
1331 !
TII->isImmOperandLegal(MovDesc, SrcIdx,
1332 *OpToFold.getEffectiveImmVal())))
1345 if (!OpToFold.isImm() ||
1346 !
TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
1352 while (ImpOpI != ImpOpE) {
1359 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
1361 MachineOperand NewSrcOp(SrcOp);
1383 LLVM_DEBUG(
dbgs() <<
"Folding " << OpToFold.OpToFold <<
"\n into "
1388 unsigned SubRegIdx = OpToFold.getSubReg();
1402 static_assert(AMDGPU::sub1_hi16 == 12,
"Subregister layout has changed");
1407 if (SubRegIdx > AMDGPU::sub1) {
1408 LaneBitmask
M =
TRI->getSubRegIndexLaneMask(SubRegIdx);
1409 M |=
M.getLane(
M.getHighestLane() - 1);
1410 SmallVector<unsigned, 4> Indexes;
1411 TRI->getCoveringSubRegIndexes(
TRI->getRegClassForReg(*MRI,
UseReg), M,
1413 assert(Indexes.
size() == 1 &&
"Expected one 32-bit subreg to cover");
1414 SubRegIdx = Indexes[0];
1416 }
else if (
TII->getOpSize(*
UseMI, 1) == 4)
1419 SubRegIdx = AMDGPU::sub0;
1424 OpToFold.OpToFold->setIsKill(
false);
1428 if (foldCopyToAGPRRegSequence(
UseMI))
1433 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
1434 (UseOpc == AMDGPU::V_READLANE_B32 &&
1436 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1441 if (FoldingImmLike) {
1444 *OpToFold.DefMI, *
UseMI))
1450 if (OpToFold.isImm()) {
1452 *OpToFold.getEffectiveImmVal());
1453 }
else if (OpToFold.isFI())
1456 assert(OpToFold.isGlobal());
1458 OpToFold.OpToFold->getOffset(),
1459 OpToFold.OpToFold->getTargetFlags());
1465 if (OpToFold.isReg() &&
TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1468 *OpToFold.DefMI, *
UseMI))
1490 UseDesc.
operands()[UseOpIdx].RegClass == -1)
1498 tryAddToFoldList(FoldList,
UseMI, UseOpIdx, OpToFold);
1504 case AMDGPU::S_ADD_I32:
1505 case AMDGPU::S_ADD_U32:
1508 case AMDGPU::S_SUB_I32:
1509 case AMDGPU::S_SUB_U32:
1512 case AMDGPU::V_AND_B32_e64:
1513 case AMDGPU::V_AND_B32_e32:
1514 case AMDGPU::S_AND_B32:
1517 case AMDGPU::V_OR_B32_e64:
1518 case AMDGPU::V_OR_B32_e32:
1519 case AMDGPU::S_OR_B32:
1522 case AMDGPU::V_XOR_B32_e64:
1523 case AMDGPU::V_XOR_B32_e32:
1524 case AMDGPU::S_XOR_B32:
1527 case AMDGPU::S_XNOR_B32:
1530 case AMDGPU::S_NAND_B32:
1533 case AMDGPU::S_NOR_B32:
1536 case AMDGPU::S_ANDN2_B32:
1539 case AMDGPU::S_ORN2_B32:
1542 case AMDGPU::V_LSHL_B32_e64:
1543 case AMDGPU::V_LSHL_B32_e32:
1544 case AMDGPU::S_LSHL_B32:
1546 Result =
LHS << (
RHS & 31);
1548 case AMDGPU::V_LSHLREV_B32_e64:
1549 case AMDGPU::V_LSHLREV_B32_e32:
1550 Result =
RHS << (
LHS & 31);
1552 case AMDGPU::V_LSHR_B32_e64:
1553 case AMDGPU::V_LSHR_B32_e32:
1554 case AMDGPU::S_LSHR_B32:
1555 Result =
LHS >> (
RHS & 31);
1557 case AMDGPU::V_LSHRREV_B32_e64:
1558 case AMDGPU::V_LSHRREV_B32_e32:
1559 Result =
RHS >> (
LHS & 31);
1561 case AMDGPU::V_ASHR_I32_e64:
1562 case AMDGPU::V_ASHR_I32_e32:
1563 case AMDGPU::S_ASHR_I32:
1564 Result =
static_cast<int32_t
>(
LHS) >> (
RHS & 31);
1566 case AMDGPU::V_ASHRREV_I32_e64:
1567 case AMDGPU::V_ASHRREV_I32_e32:
1568 Result =
static_cast<int32_t
>(
RHS) >> (
LHS & 31);
1576 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1582bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *
MI)
const {
1583 if (!
MI->allImplicitDefsAreDead())
1586 unsigned Opc =
MI->getOpcode();
1588 int Src0Idx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src0);
1592 MachineOperand *Src0 = &
MI->getOperand(Src0Idx);
1593 std::optional<int64_t> Src0Imm =
TII->getImmOrMaterializedImm(*Src0);
1595 if ((
Opc == AMDGPU::V_NOT_B32_e64 ||
Opc == AMDGPU::V_NOT_B32_e32 ||
1596 Opc == AMDGPU::S_NOT_B32) &&
1598 MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
1599 TII->mutateAndCleanupImplicit(
1604 int Src1Idx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src1);
1608 MachineOperand *Src1 = &
MI->getOperand(Src1Idx);
1609 std::optional<int64_t> Src1Imm =
TII->getImmOrMaterializedImm(*Src1);
1611 if (!Src0Imm && !Src1Imm)
1617 if (Src0Imm && Src1Imm) {
1622 bool IsSGPR =
TRI->isSGPRReg(*MRI,
MI->getOperand(0).getReg());
1626 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1627 MI->removeOperand(Src1Idx);
1634 if (
Opc == AMDGPU::S_SUB_I32 ||
Opc == AMDGPU::S_SUB_U32) {
1635 if (Src1Imm &&
static_cast<int32_t
>(*Src1Imm) == 0) {
1637 MI->removeOperand(Src1Idx);
1638 TII->mutateAndCleanupImplicit(*
MI,
TII->get(AMDGPU::COPY));
1644 if (!
MI->isCommutable())
1647 if (Src0Imm && !Src1Imm) {
1653 int32_t Src1Val =
static_cast<int32_t
>(*Src1Imm);
1654 if (
Opc == AMDGPU::S_ADD_I32 ||
Opc == AMDGPU::S_ADD_U32) {
1657 MI->removeOperand(Src1Idx);
1658 TII->mutateAndCleanupImplicit(*
MI,
TII->get(AMDGPU::COPY));
1664 if (
Opc == AMDGPU::V_OR_B32_e64 ||
1665 Opc == AMDGPU::V_OR_B32_e32 ||
1666 Opc == AMDGPU::S_OR_B32) {
1669 MI->removeOperand(Src1Idx);
1670 TII->mutateAndCleanupImplicit(*
MI,
TII->get(AMDGPU::COPY));
1671 }
else if (Src1Val == -1) {
1673 MI->removeOperand(Src0Idx);
1674 TII->mutateAndCleanupImplicit(
1682 if (
Opc == AMDGPU::V_AND_B32_e64 ||
Opc == AMDGPU::V_AND_B32_e32 ||
1683 Opc == AMDGPU::S_AND_B32) {
1686 MI->removeOperand(Src0Idx);
1687 TII->mutateAndCleanupImplicit(
1689 }
else if (Src1Val == -1) {
1691 MI->removeOperand(Src1Idx);
1692 TII->mutateAndCleanupImplicit(*
MI,
TII->get(AMDGPU::COPY));
1699 if (
Opc == AMDGPU::V_XOR_B32_e64 ||
Opc == AMDGPU::V_XOR_B32_e32 ||
1700 Opc == AMDGPU::S_XOR_B32) {
1703 MI->removeOperand(Src1Idx);
1704 TII->mutateAndCleanupImplicit(*
MI,
TII->get(AMDGPU::COPY));
// Attempts to simplify a conditional-mask select whose two data sources
// resolve to the same immediate.
//
// Visible conditions, in order:
//   * the opcode is V_CNDMASK_B32_e32, V_CNDMASK_B32_e64 or
//     V_CNDMASK_B64_PSEUDO;
//   * src0 resolves to an immediate (via getImmOrMaterializedImm) equal to
//     the one obtained for src1;
//   * neither src0_modifiers nor src1_modifiers, when present, is non-zero.
// The instruction is then rewritten in place: src2, src1 and the modifier
// operands are removed and the descriptor is switched to NewDesc.
//
// NOTE(review): the bodies of the guards (presumably `return false`), the
// Src1Imm null check, the definition of NewDesc and the final return value
// are not visible in this excerpt.
1713bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &
MI)
const {
1714 unsigned Opc =
MI.getOpcode();
1715 if (
Opc != AMDGPU::V_CNDMASK_B32_e32 &&
Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1716 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1719 MachineOperand *Src0 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0);
1720 MachineOperand *Src1 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1);
1722 std::optional<int64_t> Src1Imm =
TII->getImmOrMaterializedImm(*Src1);
1726 std::optional<int64_t> Src0Imm =
TII->getImmOrMaterializedImm(*Src0);
1727 if (!Src0Imm || *Src0Imm != *Src1Imm)
1732 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src1_modifiers);
1734 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src0_modifiers);
// A modifier index of -1 means the opcode has no such operand; a present
// modifier must be zero for the fold to proceed.
1735 if ((Src1ModIdx != -1 &&
MI.getOperand(Src1ModIdx).getImm() != 0) ||
1736 (Src0ModIdx != -1 &&
MI.getOperand(Src0ModIdx).getImm() != 0))
1742 int Src2Idx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
// Operands are dropped in the order src2, src1, src1_modifiers,
// src0_modifiers — presumably highest index first so the indices computed
// above stay valid (TODO confirm operand layout).
1744 MI.removeOperand(Src2Idx);
1745 MI.removeOperand(AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src1));
1746 if (Src1ModIdx != -1)
1747 MI.removeOperand(Src1ModIdx);
1748 if (Src0ModIdx != -1)
1749 MI.removeOperand(Src0ModIdx);
1750 TII->mutateAndCleanupImplicit(
MI, NewDesc);
// Handles a V_AND_B32 (e64 or e32) whose operand 1 resolves to the immediate
// 0xffff and whose operand 2 is a register.
//
// After those checks the visible code looks up the defining instruction of
// Src1 through MRI->getVRegDef, tests the kill flag on operand 2 and finally
// erases MI from its parent.
//
// NOTE(review): the guard bodies, the definition of Src1, the condition
// placed on SrcDef and the step that redirects users of MI's result before
// the erase are not visible in this excerpt; presumably the AND is dropped
// because the source already has zero high bits — confirm against the full
// source.
1755bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &
MI)
const {
1756 if (
MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1757 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1760 std::optional<int64_t> Src0Imm =
1761 TII->getImmOrMaterializedImm(
MI.getOperand(1));
1762 if (!Src0Imm || *Src0Imm != 0xffff || !
MI.getOperand(2).isReg())
1766 MachineInstr *SrcDef = MRI->
getVRegDef(Src1);
1772 if (!
MI.getOperand(2).isKill())
1774 MI.eraseFromParent();
1778bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &
MI,
1779 const FoldableDef &OpToFold)
const {
1783 SmallVector<MachineInstr *, 4> CopiesToReplace;
1785 MachineOperand &Dst =
MI.getOperand(0);
1788 if (OpToFold.isImm()) {
1799 if (tryConstantFoldOp(&
UseMI)) {
1808 for (
auto *U : UsesToProcess) {
1809 MachineInstr *
UseMI =
U->getParent();
1811 FoldableDef SubOpToFold = OpToFold.getWithSubReg(*
TRI,
U->getSubReg());
1816 if (CopiesToReplace.
empty() && FoldList.
empty())
1819 MachineFunction *MF =
MI.getMF();
1821 for (MachineInstr *Copy : CopiesToReplace)
1822 Copy->addImplicitDefUseOperands(*MF);
1824 SetVector<MachineInstr *> ConstantFoldCandidates;
1825 for (FoldCandidate &Fold : FoldList) {
1826 assert(!Fold.isReg() || Fold.Def.OpToFold);
1827 if (Fold.isReg() && Fold.getReg().isVirtual()) {
1829 const MachineInstr *
DefMI = Fold.Def.DefMI;
1837 assert(Fold.Def.OpToFold && Fold.isReg());
1844 <<
static_cast<int>(Fold.UseOpNo) <<
" of "
1848 ConstantFoldCandidates.
insert(Fold.UseMI);
1850 }
else if (Fold.Commuted) {
1852 TII->commuteInstruction(*Fold.UseMI,
false);
1856 for (MachineInstr *
MI : ConstantFoldCandidates) {
1857 if (tryConstantFoldOp(
MI)) {
1867bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI)
const {
1872 const TargetRegisterClass *DefRC =
1874 if (!
TRI->isAGPRClass(DefRC))
1886 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
1888 const TargetRegisterClass *UseRC =
1895 unsigned NumFoldable = 0;
1897 for (
unsigned I = 1;
I != NumRegSeqOperands;
I += 2) {
1913 const TargetRegisterClass *DestSuperRC =
TRI->getMatchingSuperRegClass(
1914 DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
1923 const TargetRegisterClass *InputRC =
1933 const TargetRegisterClass *MatchRC =
1934 TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
1945 if (NumFoldable == 0)
1948 CopyMI->
setDesc(
TII->get(AMDGPU::REG_SEQUENCE));
1952 for (
auto [Def, DestSubIdx] : NewDefs) {
1953 if (!
Def->isReg()) {
1957 BuildMI(
MBB, CopyMI,
DL,
TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
1962 Def->setIsKill(
false);
1964 Register &VGPRCopy = VGPRCopies[Src];
1966 const TargetRegisterClass *VGPRUseSubRC =
1967 TRI->getSubRegisterClass(UseRC, DestSubIdx);
1976 const TargetRegisterClass *SubRC =
1992 B.addImm(DestSubIdx);
1999bool SIFoldOperandsImpl::tryFoldFoldableCopy(
2000 MachineInstr &
MI, MachineOperand *&CurrentKnownM0Val)
const {
2004 if (DstReg == AMDGPU::M0) {
2005 MachineOperand &NewM0Val =
MI.getOperand(1);
2006 if (CurrentKnownM0Val && CurrentKnownM0Val->
isIdenticalTo(NewM0Val)) {
2007 MI.eraseFromParent();
2018 MachineOperand *OpToFoldPtr;
2019 if (
MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
2021 if (
TII->hasAnyModifiersSet(
MI))
2023 OpToFoldPtr = &
MI.getOperand(2);
2025 OpToFoldPtr = &
MI.getOperand(1);
2026 MachineOperand &OpToFold = *OpToFoldPtr;
2030 if (!FoldingImm && !OpToFold.
isReg())
2035 !
TRI->isConstantPhysReg(OpToFold.
getReg()))
2047 const TargetRegisterClass *DstRC =
2064 if (
MI.getOpcode() == AMDGPU::COPY && OpToFold.
isReg() &&
2066 if (DstRC == &AMDGPU::SReg_32RegClass &&
2075 if (OpToFold.
isReg() &&
MI.isCopy() && !
MI.getOperand(1).getSubReg()) {
2076 if (foldCopyToAGPRRegSequence(&
MI))
2080 FoldableDef
Def(OpToFold, DstRC);
2081 bool Changed = foldInstOperand(
MI, Def);
2088 auto *InstToErase = &
MI;
2090 auto &SrcOp = InstToErase->getOperand(1);
2092 InstToErase->eraseFromParent();
2094 InstToErase =
nullptr;
2098 if (!InstToErase || !
TII->isFoldableCopy(*InstToErase))
2102 if (InstToErase && InstToErase->isRegSequence() &&
2104 InstToErase->eraseFromParent();
2114 return OpToFold.
isReg() &&
2115 foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.
getReg(),
MI);
2120const MachineOperand *
2121SIFoldOperandsImpl::isClamp(
const MachineInstr &
MI)
const {
2122 unsigned Op =
MI.getOpcode();
2124 case AMDGPU::V_MAX_F32_e64:
2125 case AMDGPU::V_MAX_F16_e64:
2126 case AMDGPU::V_MAX_F16_t16_e64:
2127 case AMDGPU::V_MAX_F16_fake16_e64:
2128 case AMDGPU::V_MAX_F64_e64:
2129 case AMDGPU::V_MAX_NUM_F64_e64:
2130 case AMDGPU::V_PK_MAX_F16:
2131 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2132 case AMDGPU::V_PK_MAX_NUM_BF16: {
2133 if (
MI.mayRaiseFPException())
2136 if (!
TII->getNamedOperand(
MI, AMDGPU::OpName::clamp)->getImm())
2140 const MachineOperand *Src0 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0);
2141 const MachineOperand *Src1 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1);
2145 Src0->
getSubReg() != AMDGPU::NoSubRegister)
2149 if (
TII->hasModifiersSet(
MI, AMDGPU::OpName::omod))
2153 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0_modifiers)->getImm();
2155 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1_modifiers)->getImm();
2159 unsigned UnsetMods =
2160 (
Op == AMDGPU::V_PK_MAX_F16 ||
Op == AMDGPU::V_PK_MAX_NUM_BF16)
2163 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
2173bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &
MI) {
2174 const MachineOperand *ClampSrc = isClamp(
MI);
2187 if (
TII->getClampMask(*Def) !=
TII->getClampMask(
MI))
2190 if (
Def->mayRaiseFPException())
2193 MachineOperand *DefClamp =
TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
2197 LLVM_DEBUG(
dbgs() <<
"Folding clamp " << *DefClamp <<
" into " << *Def);
2203 Register MIDstReg =
MI.getOperand(0).getReg();
2204 if (
TRI->isSGPRReg(*MRI, DefReg)) {
2213 MI.eraseFromParent();
2218 if (
TII->convertToThreeAddress(*Def,
nullptr,
nullptr))
2219 Def->eraseFromParent();
2226 case AMDGPU::V_MUL_F64_e64:
2227 case AMDGPU::V_MUL_F64_pseudo_e64: {
2229 case 0x3fe0000000000000:
2231 case 0x4000000000000000:
2233 case 0x4010000000000000:
2239 case AMDGPU::V_MUL_F32_e64: {
2240 switch (
static_cast<uint32_t>(Val)) {
2251 case AMDGPU::V_MUL_F16_e64:
2252 case AMDGPU::V_MUL_F16_t16_e64:
2253 case AMDGPU::V_MUL_F16_fake16_e64: {
2254 switch (
static_cast<uint16_t>(Val)) {
2273std::pair<const MachineOperand *, int>
2274SIFoldOperandsImpl::isOMod(
const MachineInstr &
MI)
const {
2275 unsigned Op =
MI.getOpcode();
2277 case AMDGPU::V_MUL_F64_e64:
2278 case AMDGPU::V_MUL_F64_pseudo_e64:
2279 case AMDGPU::V_MUL_F32_e64:
2280 case AMDGPU::V_MUL_F16_t16_e64:
2281 case AMDGPU::V_MUL_F16_fake16_e64:
2282 case AMDGPU::V_MUL_F16_e64: {
2284 if ((
Op == AMDGPU::V_MUL_F32_e64 &&
2286 ((
Op == AMDGPU::V_MUL_F64_e64 ||
Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
2287 Op == AMDGPU::V_MUL_F16_e64 ||
Op == AMDGPU::V_MUL_F16_t16_e64 ||
2288 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
2291 MI.mayRaiseFPException())
2294 const MachineOperand *RegOp =
nullptr;
2295 const MachineOperand *ImmOp =
nullptr;
2296 const MachineOperand *Src0 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0);
2297 const MachineOperand *Src1 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1);
2298 if (Src0->
isImm()) {
2301 }
else if (Src1->
isImm()) {
2309 TII->hasModifiersSet(
MI, AMDGPU::OpName::src0_modifiers) ||
2310 TII->hasModifiersSet(
MI, AMDGPU::OpName::src1_modifiers) ||
2311 TII->hasModifiersSet(
MI, AMDGPU::OpName::omod) ||
2312 TII->hasModifiersSet(
MI, AMDGPU::OpName::clamp))
2315 return std::pair(RegOp, OMod);
2317 case AMDGPU::V_ADD_F64_e64:
2318 case AMDGPU::V_ADD_F64_pseudo_e64:
2319 case AMDGPU::V_ADD_F32_e64:
2320 case AMDGPU::V_ADD_F16_e64:
2321 case AMDGPU::V_ADD_F16_t16_e64:
2322 case AMDGPU::V_ADD_F16_fake16_e64: {
2324 if ((
Op == AMDGPU::V_ADD_F32_e64 &&
2326 ((
Op == AMDGPU::V_ADD_F64_e64 ||
Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
2327 Op == AMDGPU::V_ADD_F16_e64 ||
Op == AMDGPU::V_ADD_F16_t16_e64 ||
2328 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
2333 const MachineOperand *Src0 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0);
2334 const MachineOperand *Src1 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1);
2338 !
TII->hasModifiersSet(
MI, AMDGPU::OpName::src0_modifiers) &&
2339 !
TII->hasModifiersSet(
MI, AMDGPU::OpName::src1_modifiers) &&
2340 !
TII->hasModifiersSet(
MI, AMDGPU::OpName::clamp) &&
2341 !
TII->hasModifiersSet(
MI, AMDGPU::OpName::omod))
2352bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &
MI) {
2353 const MachineOperand *RegOp;
2355 std::tie(RegOp, OMod) = isOMod(
MI);
2357 RegOp->
getSubReg() != AMDGPU::NoSubRegister ||
2362 MachineOperand *DefOMod =
TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
2366 if (
Def->mayRaiseFPException())
2371 if (
TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
2381 MI.eraseFromParent();
2386 if (
TII->convertToThreeAddress(*Def,
nullptr,
nullptr))
2387 Def->eraseFromParent();
2394bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &
MI) {
2396 auto Reg =
MI.getOperand(0).getReg();
2398 if (!ST->hasGFX90AInsts() || !
TRI->isVGPR(*MRI,
Reg) ||
2403 if (!getRegSeqInit(Defs,
Reg))
2406 for (
auto &[
Op, SubIdx] : Defs) {
2409 if (
TRI->isAGPR(*MRI,
Op->getReg()))
2412 const MachineInstr *SubDef = MRI->
getVRegDef(
Op->getReg());
2420 MachineInstr *
UseMI =
Op->getParent();
2429 if (
Op->getSubReg())
2434 const TargetRegisterClass *OpRC =
TII->getRegClass(InstDesc,
OpIdx);
2435 if (!OpRC || !
TRI->isVectorSuperClass(OpRC))
2441 TII->get(AMDGPU::REG_SEQUENCE), Dst);
2443 for (
auto &[Def, SubIdx] : Defs) {
2444 Def->setIsKill(
false);
2445 if (
TRI->isAGPR(*MRI,
Def->getReg())) {
2458 RS->eraseFromParent();
2467 MI.eraseFromParent();
2475 Register &OutReg,
unsigned &OutSubReg) {
2485 if (
TRI.isAGPR(MRI, CopySrcReg)) {
2486 OutReg = CopySrcReg;
2495 if (!CopySrcDef || !CopySrcDef->
isCopy())
2502 OtherCopySrc.
getSubReg() != AMDGPU::NoSubRegister ||
2503 !
TRI.isAGPR(MRI, OtherCopySrcReg))
2506 OutReg = OtherCopySrcReg;
2540bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &
PHI) {
2544 if (!
TRI->isVGPR(*MRI, PhiOut))
2549 const TargetRegisterClass *ARC =
nullptr;
2550 for (
unsigned K = 1;
K <
PHI.getNumExplicitOperands();
K += 2) {
2551 MachineOperand &MO =
PHI.getOperand(K);
2553 if (!Copy || !
Copy->isCopy())
2557 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
2561 const TargetRegisterClass *CopyInRC = MRI->
getRegClass(AGPRSrc);
2562 if (
const auto *SubRC =
TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
2573 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
2577 for (
unsigned K = 1;
K <
PHI.getNumExplicitOperands();
K += 2) {
2578 MachineOperand &MO =
PHI.getOperand(K);
2582 MachineBasicBlock *InsertMBB =
nullptr;
2585 unsigned CopyOpc = AMDGPU::COPY;
2590 if (
Def->isCopy()) {
2592 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
2605 MachineOperand &CopyIn =
Def->getOperand(1);
2608 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2611 InsertMBB =
Def->getParent();
2619 MachineInstr *
MI =
BuildMI(*InsertMBB, InsertPt,
PHI.getDebugLoc(),
2620 TII->get(CopyOpc), NewReg)
2630 PHI.getOperand(0).setReg(NewReg);
2636 TII->get(AMDGPU::COPY), PhiOut)
2644bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &
MI) {
2646 if (!ST->hasGFX90AInsts() ||
MI.getNumExplicitDefs() != 1)
2649 MachineOperand &
Def =
MI.getOperand(0);
2666 while (!
Users.empty()) {
2667 const MachineInstr *
I =
Users.pop_back_val();
2668 if (!
I->isCopy() && !
I->isRegSequence())
2670 Register DstReg =
I->getOperand(0).getReg();
2674 if (
TRI->isAGPR(*MRI, DstReg))
2678 Users.push_back(&U);
2681 const TargetRegisterClass *RC = MRI->
getRegClass(DefReg);
2683 if (!
TII->isOperandLegal(
MI, 0, &Def)) {
2688 while (!MoveRegs.
empty()) {
2730bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &
MBB) {
2733 if (ST->hasGFX90AInsts())
2737 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2740 for (
auto &
MI :
MBB) {
2744 if (!
TRI->isAGPR(*MRI,
MI.getOperand(0).getReg()))
2747 for (
unsigned K = 1;
K <
MI.getNumOperands();
K += 2) {
2748 MachineOperand &PhiMO =
MI.getOperand(K);
2758 for (
const auto &[Entry, MOs] : RegToMO) {
2759 if (MOs.size() == 1)
2764 MachineBasicBlock *DefMBB =
Def->getParent();
2768 const TargetRegisterClass *ARC =
getRegOpRC(*MRI, *
TRI, *MOs.front());
2771 MachineInstr *VGPRCopy =
2773 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2779 TII->get(AMDGPU::COPY), TempAGPR)
2783 for (MachineOperand *MO : MOs) {
2795bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2801 MFI = MF.
getInfo<SIMachineFunctionInfo>();
2811 MachineOperand *CurrentKnownM0Val =
nullptr;
2815 if (tryFoldZeroHighBits(
MI)) {
2820 if (
MI.isRegSequence() && tryFoldRegSequence(
MI)) {
2825 if (
MI.isPHI() && tryFoldPhiAGPR(
MI)) {
2830 if (
MI.mayLoad() && tryFoldLoad(
MI)) {
2835 if (
TII->isFoldableCopy(
MI)) {
2836 Changed |= tryFoldFoldableCopy(
MI, CurrentKnownM0Val);
2841 if (CurrentKnownM0Val &&
MI.modifiesRegister(AMDGPU::M0,
TRI))
2842 CurrentKnownM0Val =
nullptr;
2860 bool Changed = SIFoldOperandsImpl().run(MF);
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat)
Updates the operand at Idx in instruction Inst with the result of instruction Mat.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
iv Induction Variable Users
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
static unsigned macToMad(unsigned Opc)
static bool isAGPRCopy(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI, const MachineInstr &Copy, Register &OutReg, unsigned &OutSubReg)
Checks whether Copy is an AGPR -> VGPR copy.
static void appendFoldCandidate(SmallVectorImpl< FoldCandidate > &FoldList, FoldCandidate &&Entry)
static const TargetRegisterClass * getRegOpRC(const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const MachineOperand &MO)
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, uint32_t LHS, uint32_t RHS)
static int getOModValue(unsigned Opc, int64_t Val)
static unsigned getMovOpc(bool IsScalar)
static MachineOperand * lookUpCopyChain(const SIInstrInfo &TII, const MachineRegisterInfo &MRI, Register SrcReg)
static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(const FoldableDef &OpToFold)
static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand(const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo)
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
static int Lookup(ArrayRef< TableEntry > Table, unsigned Opcode)
Represent the analysis usage information of a pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Represents analyses that only rely on functions' control flow.
FunctionPass class - This class is used to implement most global optimizations.
const SIInstrInfo * getInstrInfo() const override
bool hasDOTOpSelHazard() const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
const HexagonRegisterInfo & getRegisterInfo() const
ArrayRef< MCOperandInfo > operands() const
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
bool isVariadic() const
Return true if this instruction can have a variable number of operands.
This holds information about one operand of a machine instruction, indicating the register class for ...
uint8_t OperandType
Information about the type of the operand.
An RAII based helper class to modify MachineFunctionProperties when running pass.
LLVM_ABI iterator SkipPHIsLabelsAndDebug(iterator I, Register Reg=Register(), bool SkipPseudoOp=true)
Return the first instruction in MBB after I that is not a PHI, label or debug.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
LivenessQueryResult
Possible outcome of a register liveness query to computeRegisterLiveness()
@ LQR_Dead
Register is known to be fully dead.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Properties which a MachineFunction may have at a given point in time.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
unsigned getOperandNo(const_mop_iterator I) const
Returns the number of the operand iterator I points to.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
void clearFlag(MIFlag Flag)
clearFlag - Clear a MI flag.
bool isRegSequence() const
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
LLVM_ABI void substVirtReg(Register Reg, unsigned SubIdx, const TargetRegisterInfo &)
substVirtReg - Substitute the current register with the virtual subregister Reg:SubReg.
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
LLVM_ABI void substPhysReg(MCRegister Reg, const TargetRegisterInfo &)
substPhysReg - Substitute the current register with the physical register Reg, taking any existing Su...
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_GlobalAddress
Address of a global value.
@ MO_FrameIndex
Abstract Stack Frame Index.
@ MO_Register
Register operand.
static MachineOperand CreateFI(int Idx)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
use_nodbg_iterator use_nodbg_begin(Register RegNo) const
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI bool hasOneNonDBGUser(Register RegNo) const
hasOneNonDBGUser - Return true if there is exactly one non-Debug instruction using the specified regis...
iterator_range< use_instr_nodbg_iterator > use_nodbg_instructions(Register Reg) const
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
SIModeRegisterDefaults getMode() const
bool insert(const value_type &X)
Insert a new element into the SetVector.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
Represent a constant reference to a string, i.e.
static const unsigned CommuteAnyOperandIndex
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
@ OPERAND_REG_INLINE_C_FP64
@ OPERAND_REG_INLINE_C_V2BF16
@ OPERAND_REG_IMM_V2INT16
@ OPERAND_REG_INLINE_C_INT64
@ OPERAND_REG_IMM_NOINLINE_V2FP16
@ OPERAND_REG_INLINE_C_V2FP16
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
@ OPERAND_REG_INLINE_AC_FP32
@ OPERAND_REG_INLINE_C_FP32
@ OPERAND_REG_INLINE_C_INT32
@ OPERAND_REG_INLINE_C_V2INT16
@ OPERAND_REG_INLINE_AC_FP64
LLVM_READONLY int32_t getFlatScratchInstSSfromSV(uint32_t Opcode)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
NodeAddr< DefNode * > Def
This is an optimization pass for GlobalISel generic memory operations.
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
FunctionPass * createSIFoldOperandsLegacyPass()
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
@ Sub
Subtraction of integers.
DWARFExpression::Operation Op
char & SIFoldOperandsLegacyID
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
iterator_range< df_iterator< T > > depth_first(const T &G)
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
constexpr uint64_t Make_64(uint32_t High, uint32_t Low)
Make a 64-bit integer from a high / low pair of 32-bit integers.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.