#define DEBUG_TYPE "si-peephole-sdwa"
STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  StringRef getPassName() const override { return "SI Peephole SDWA"; }
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
  virtual ~SDWAOperand() = default;
    return &getParentInst()->getParent()->getParent()->getRegInfo();

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
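// A source-operand pattern: the converted instruction reads only the SrcSel
// byte/word of the target register, optionally applying the abs/neg float
// modifiers or integer sign extension.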
class SDWASrcOperand : public SDWAOperand {
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}
  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
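// A destination-operand pattern: the converted instruction writes only the
// DstSel byte/word of the destination, and DstUn records what happens to the
// lanes it does not write.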
class SDWADstOperand : public SDWAOperand {
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
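// A destination pattern whose unwritten lanes are preserved from another
// register (dst_unused:UNUSED_PRESERVE); produced by the V_OR_B32 combine in
// matchSDWAOperand below.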
class SDWADstPreserveOperand : public SDWADstOperand {
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
char SIPeepholeSDWA::ID = 0;

  return new SIPeepholeSDWA();
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
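// Two operands denote the same value only if both are registers and both the
// register and the subregister indices match.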
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
    if (!Reg->isReg() || !Reg->isDef())
  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
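// Collect the src0/src1 modifier bits already present on the instruction
// that uses SrcOp, so they can be merged with this operand's own modifiers.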
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
  bool IsPreserveSrc = false;
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        IsPreserveSrc = true;
        auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                 AMDGPU::OpName::vdst);
        auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
        Src = &MI.getOperand(TiedIdx);
  assert(Src && Src->isReg());
  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      !isSameReg(*Src, *getReplacedOperand())) {
  assert(isSameReg(*Src, *getReplacedOperand()) &&
         (IsPreserveSrc || (SrcSel && SrcMods)));
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
  getTargetOperand()->setIsKill(false);
    if (&UseInst != ParentMI)
  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
    return false;
  }
         isSameReg(*Operand, *getReplacedOperand()));
  DstSel->setImm(getDstSel());
  getParentInst()->eraseFromParent();
    getMRI()->clearKillFlags(MO.getReg());
  auto MBB = MI.getParent();
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::Implicit,
             getPreservedOperand()->getSubReg());
                 MI.getNumOperands() - 1);
  return SDWADstOperand::convertToSDWA(MI, TII);
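// foldToImm: an operand yields an immediate either directly or when it is
// defined by a foldable copy of an immediate, e.g. %1 = S_MOV_B32 255.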
      if (!TII->isFoldableCopy(*DefInst))
std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
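  // Shifting a full dword right by 16/24 reads the upper word/byte:
  //   v_lshrrev_b32 16/24, %v -> src_sel:WORD_1/BYTE_3 (ashr adds sext:1)
  // Shifting left by 16/24 writes it:
  //   v_lshlrev_b32 16/24, %v -> dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD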
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    auto Imm = foldToImm(*Src0);
    if (*Imm != 16 && *Imm != 24)
    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B32_e64);
    }
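  // The 16-bit shifts by 8 address the high byte of a word the same way:
  //   v_lshrrev_b16 8, %v -> src_sel:BYTE_1 (ashr adds sext:1)
  //   v_lshlrev_b16 8, %v -> dst_sel:BYTE_1 dst_unused:UNUSED_PAD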
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    auto Imm = foldToImm(*Src0);
    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
          Src1, Dst, BYTE_1, false, false,
          Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B16_e64);
    }
  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
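    // e.g. from: v_bfe_u32 v1, v0, 8, 8
    //      to SDWA src:v0 src_sel:BYTE_1
    //
    // offset | width | src_sel
    // ------------------------
    //      0 |     8 | BYTE_0
    //      0 |    16 | WORD_0
    //      0 |    32 | DWORD
    //      8 |     8 | BYTE_1
    //     16 |     8 | BYTE_2
    //     16 |    16 | WORD_1
    //     24 |     8 | BYTE_3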
    auto Offset = foldToImm(*Src1);
    auto Width = foldToImm(*Src2);
    SdwaSel SrcSel = DWORD;
    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;
    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
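  // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
  // to SDWA src:v0 src_sel:WORD_0/BYTE_0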
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    auto Imm = foldToImm(*Src0);
    auto *ValSrc = Src1;
    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }
    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;
    if (ValSrc->getReg().isPhysical() || Dst->getReg().isPhysical())
      break;
    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
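    // Combine two SDWA instructions that write disjoint byte/word lanes of
    // the same dword, ORed together: the second one can instead write its
    // lanes directly into the first one's destination with
    // dst_unused:UNUSED_PRESERVE.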
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1,
            const MachineOperand *Op2) -> CheckRetType {
          if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
            return CheckRetType(None);
            return CheckRetType(None);
          if (!TII->isSDWA(*Op1Inst))
            return CheckRetType(None);
            return CheckRetType(None);
          return CheckRetType(std::make_pair(Op1Def, Op2Def));
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    assert(OrSDWADef && OrOtherDef);
    if (!TII->isSDWA(*OtherInst))
    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));
    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);
  return std::unique_ptr<SDWAOperand>(nullptr);
    if (auto Operand = matchSDWAOperand(MI)) {
      ++NumSDWAPatternsFound;
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");
  if (!TII->canShrink(MISucc, *MRI))
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
  MI.eraseFromParent();
      .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1))
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
  if (TII->pseudoToMCOpcode(Opc) == -1)
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
                                   const SDWAOperandsVector &SDWAOperands) {
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
  if (SDWAOpcode == -1)
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.add(*Src0);
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
    SDWAInst.add(*Src1);
  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    SDWAInst.add(*Src2);
    SDWAInst.add(*Clamp);
    SDWAInst.add(*OMod);
    SDWAInst.add(*DstSel);
    SDWAInst.add(*Src0Sel);
    SDWAInst.add(*Src1Sel);
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    assert(PreserveDstIdx != -1);
    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  ConvertedInstructions.push_back(SDWAInst);

  ++NumSDWAInstructionsPeepholed;
  MI.eraseFromParent();
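// Converted SDWA instructions may not use immediates or SGPRs directly: at
// most one SGPR operand is allowed, and only on subtargets with
// hasSDWAScalar() (GFX9); everything else is first copied into a VGPR.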
  unsigned ConstantBusCount = 0;
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;
    unsigned I = MI.getOperandNo(&Op);
    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
    Op.ChangeToRegister(VGPR, false);
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();
  bool Changed = false;
    matchSDWAOperands(MBB);
    for (const auto &OperandPair : SDWAOperands) {
      const auto &Operand = OperandPair.second;
      if (PotentialMI &&
          (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
           PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
        pseudoOpConvertToVOP2(*PotentialMI, ST);
    SDWAOperands.clear();
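    // Rematch now that the carry-less VOP2 forms are in place, then group
    // all matched operands by the instruction they would convert.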
    matchSDWAOperands(MBB);
    for (const auto &OperandPair : SDWAOperands) {
      const auto &Operand = OperandPair.second;
      if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
        PotentialMatches[PotentialMI].push_back(Operand.get());
    for (auto &PotentialPair : PotentialMatches) {
      MachineInstr &PotentialMI = *PotentialPair.first;
      convertToSDWA(PotentialMI, PotentialPair.second);
    }
    PotentialMatches.clear();
    SDWAOperands.clear();

    Changed = !ConvertedInstructions.empty();
    while (!ConvertedInstructions.empty())
      legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);