#define DEBUG_TYPE "si-peephole-sdwa"
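// Peephole pass that folds sub-dword extract/insert idioms (shifts, bitfield
// extracts, masks, and or-merges) into the SDWA (Sub-DWORD Addressing)
// variants of VALU instructions, which select individual bytes or words of
// their 32-bit source and destination registers via src_sel/dst_sel operands.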
STATISTIC(NumSDWAPatternsFound,
          "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");
class SIPeepholeSDWA {
  // ...
  SDWAOperandsMap PotentialMatches;
  // ...
  bool convertToSDWA(MachineInstr &MI,
                     const SDWAOperandsVector &SDWAOperands);
  // ...
};

class SIPeepholeSDWALegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
class SDWAOperand {
private:
  MachineOperand *Target;   // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  virtual bool canCombineSelections(const MachineInstr &MI,
                                    const SIInstrInfo *TII) = 0;

  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;
  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                           const GCNSubtarget &ST,
                                           SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};
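// A matched source pattern: reading Replaced is equivalent to reading the
// byte/word slice SrcSel of Target, optionally with abs/neg (float) or sext
// (integer) modifiers.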
class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}
  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
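// A matched destination pattern: writing Replaced is equivalent to writing
// the byte/word slice DstSel of Target, with the unselected bits handled
// according to DstUn (e.g. zero-padded).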
class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
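// UNUSED_PRESERVE variant: the bits of the destination not covered by dst_sel
// are taken from another register (Preserve) instead of being padded.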
class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
char SIPeepholeSDWALegacy::ID = 0;

FunctionPass *llvm::createSIPeepholeSDWALegacyPass() {
  return new SIPeepholeSDWALegacy();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch (Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
  }
  return OS;
}

// ...
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif
static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  return MRI->getOneNonDBGUse(Reg->getReg());
}
static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;
  // ...
  return MRI->getOneDef(Reg->getReg());
}
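// Combine an instruction's existing selection Sel with the selection
// OperandSel supplied by its operand. Worked example: an instruction that
// selects BYTE_1 of an operand which is itself WORD_1 of the original
// register reads byte 1 of word 1, so combineSdwaSel(BYTE_1, WORD_1) is
// BYTE_3; incompatible pairs yield std::nullopt.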
static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) {
  if (Sel == SdwaSel::DWORD)
    return OperandSel;

  if (Sel == OperandSel || OperandSel == SdwaSel::DWORD)
    return Sel;

  if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 ||
      Sel == SdwaSel::BYTE_3)
    return {};

  if (OperandSel == SdwaSel::WORD_0)
    return Sel;

  if (OperandSel == SdwaSel::WORD_1) {
    if (Sel == SdwaSel::BYTE_0)
      return SdwaSel::BYTE_2;
    if (Sel == SdwaSel::BYTE_1)
      return SdwaSel::BYTE_3;
    if (Sel == SdwaSel::WORD_0)
      return SdwaSel::WORD_1;
  }

  return {};
}
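// Gather the src_modifiers already present on the matched operand of MI and
// merge in this pattern's own abs/neg or sext bits.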
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all of them can be converted.
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted.
      if (!isConvertibleToSDWA(UseMI, ST, TII) ||
          !canCombineSelections(UseMI, TII))
        return nullptr;

    // Now that all uses are known to be legal, map each use to this operand.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    return nullptr;
  }

  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  MachineInstr *Parent = PotentialMO->getParent();

  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}
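// Rewrite the matched source operand of the SDWA instruction MI: substitute
// the target register and set the corresponding src_sel and src_modifiers.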
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers.
    if (Abs || Neg || Sext)
      return false;
    break;
  case AMDGPU::V_CNDMASK_B32_sdwa:
    // SISrcMods uses the same bit for SEXT and NEG, and V_CNDMASK_B32_sdwa
    // prints that bit as NEG; reject SEXT here.
    if (Sext)
      return false;
    break;
  default:
    break;
  }

  // Find the operand in MI that matches the source operand and replace it
  // with the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1.
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It is possible this Src is a tied operand of an instruction that
      // preserves part of its destination.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
      if (Dst && DstUnused &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This works if the tied src reads WORD_0 and the dst writes WORD_1,
        // since all bits the src would touch are overwritten by the dst.
        auto DstSel = static_cast<AMDGPU::SDWA::SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this instruction with this combination of
          // dst_sel and src_sel.
          return false;
        }
      }
    }
  }
  assert(Src && Src->isReg());

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      !isSameReg(*Src, *getReplacedOperand())) {
    // The matched operand would end up on src2 of v_mac/v_fmac, which is not
    // allowed.
    return false;
  }

  assert(isSameReg(*Src, *getReplacedOperand()) &&
         (IsPreserveSrc || (SrcSel && SrcMods)));

  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}
// Verify that the SDWA selection operand SrcSelOpName of the SDWA instruction
// MI can be combined with the operand selection OpSel.
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
                            AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) {
  assert(TII->isSDWA(MI.getOpcode()));

  const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, SrcSelOpName);
  SdwaSel SrcSel = static_cast<SdwaSel>(SrcSelOp->getImm());

  return combineSdwaSel(SrcSel, OpSel).has_value();
}

// Overload that only checks the selection if the operand named SrcOpName is
// the register Op.
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
                            AMDGPU::OpName SrcOpName,
                            AMDGPU::OpName SrcSelOpName, MachineOperand *Op,
                            SdwaSel OpSel) {
  const MachineOperand *Src = TII->getNamedOperand(MI, SrcOpName);
  if (!Src || !isSameReg(*Src, *Op))
    return true;

  return canCombineOpSel(MI, TII, SrcSelOpName, OpSel);
}
bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI,
                                          const SIInstrInfo *TII) {
  if (!TII->isSDWA(MI.getOpcode()))
    return true;

  using namespace AMDGPU;

  return canCombineOpSel(MI, TII, OpName::src0, OpName::src0_sel,
                         getReplacedOperand(), getSrcSel()) &&
         canCombineOpSel(MI, TII, OpName::src1, OpName::src1_sel,
                         getReplacedOperand(), getSrcSel());
}
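// For a dst operand, the convertible instruction is the single definition of
// the register that the parent instruction reads.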
MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  MachineInstr *Parent = PotentialMO->getParent();
  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand, then set dst_sel
  // and dst_unused.
  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel equal only to DWORD.
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with the new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}
bool SDWADstOperand::canCombineSelections(const MachineInstr &MI,
                                          const SIInstrInfo *TII) {
  if (!TII->isSDWA(MI.getOpcode()))
    return true;

  return canCombineOpSel(MI, TII, AMDGPU::OpName::dst_sel, getDstSel());
}
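// Preserve conversion: move MI directly before the v_or_b32 that merges the
// two halves, add an implicit tied read of the preserved register, then
// finish as a regular dst conversion (which erases the v_or_b32).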
bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32. For this, clear all kill flags
  // on uses of MI's src operands, or else we can hit use-of-killed-operand
  // problems.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32.
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add an implicit use of the preserved register.
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to the implicit read of the preserved register.
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32.
  return SDWADstOperand::convertToSDWA(MI, TII);
}
bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI,
                                                  const SIInstrInfo *TII) {
  return SDWADstOperand::canCombineSelections(MI, TII);
}
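// Try to reduce an operand to a compile-time constant: either an immediate,
// or a register defined by a foldable copy of an immediate.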
std::optional<int64_t>
SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.: %1 = S_MOV_B32 255
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return std::nullopt;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return std::nullopt;

      return Copied.getImm();
    }
  }

  return std::nullopt;
}
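// The pattern matcher proper: recognize shifts, bitfield extracts, masks and
// or-merges that are expressible as SDWA byte/word selections.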
std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3
    //
    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
    //
    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B32_e64);
  }
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_opsel_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_opsel_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1
    //
    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1
    //
    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64)
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B16_opsel_e64 &&
            Opcode != AMDGPU::V_LSHRREV_B16_e64);
  }
  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1
    //
    // offset | width | src_sel
    // ------------------------
    //    0   |   8   | BYTE_0
    //    0   |  16   | WORD_0
    //    0   |  32   | DWORD
    //    8   |   8   | BYTE_1
    //   16   |   8   | BYTE_2
    //   16   |  16   | WORD_1
    //   24   |   8   | BYTE_3
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE, e.g. from:
    //   v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //   v_add_f16_e32 v3, v1, v2
    //   v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE
    // preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction.
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
          if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
            return CheckRetType(std::nullopt);

          MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
          if (!Op1Def)
            return CheckRetType(std::nullopt);

          MachineInstr *Op1Inst = Op1Def->getParent();
          if (!TII->isSDWA(*Op1Inst))
            return CheckRetType(std::nullopt);

          MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
          if (!Op2Def)
            return CheckRetType(std::nullopt);

          return CheckRetType(std::pair(Op1Def, Op2Def));
        };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // ... (checks that OtherInst follows SDWAInst and does not clobber it)

    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    // The two writes must cover disjoint parts of the register.
    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also, OtherInst's dst_unused should be UNUSED_PAD.
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create the DstPreserveOperand.
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);
  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}
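// Scan a basic block and record every matched pattern, keyed by the
// instruction it was matched on.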
void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}
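// V_{ADD|SUB}_CO_U32_e64 have no SDWA form. If the carry outputs are
// otherwise unused and VCC is free, shrink the pair to the VOP2 variants,
// which do have SDWA encodings.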
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);

  // Find the related carry-consuming instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();

  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneNonDBGUse(CarryIn->getReg()) ||
      !MRI->use_nodbg_empty(CarryOut->getReg()))
    return;

  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  if (MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25) !=
      MachineBasicBlock::LQR_Dead)
    return;

  // Check if VCC is referenced in the range (MI, MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Replace MI with the shrunk V_{SUB|ADD}_I32_e32.
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
      .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // ... (MISucc is then rewritten to the matching VOP2 opcode that reads the
  // carry from VCC)
}
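// Likewise, V_CNDMASK_B32_e64 must first be shrunk to the VOP2 form, with
// the mask moved into VCC, before it can be converted to SDWA.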
void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);
  // ...

  const MachineOperand &CarryIn =
      *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  Register CarryReg = CarryIn.getReg();
  MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
  if (!CarryDef)
    return;
  // ...

  MCRegister Vcc = TRI->getVCC();
  MachineBasicBlock &MBB = *MI.getParent();
  if (MBB.computeRegisterLiveness(TRI, Vcc, MI) !=
      MachineBasicBlock::LQR_Dead) {
    LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
    return;
  }
  // ... (the carry def is redirected to VCC before the VOP2 form is emitted)

  MachineInstr *Converted =
      BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e32))
          .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
          .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
          .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
          .setMIFlags(MI.getFlags());
  TII->fixImplicitOperands(*Converted);

  MI.eraseFromParent();
}
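// Legality check: the opcode must have an SDWA counterpart on this subtarget
// and must not carry modifiers or operands the SDWA form cannot encode.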
bool isConvertibleToSDWA(MachineInstr &MI,
                         const GCNSubtarget &ST,
                         const SIInstrInfo *TII) {
  // Check if this is already an SDWA instruction.
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // V_CNDMASK_B32_e64 must first be shrunk to VOP2 (see
  // convertVcndmaskToVOP2) before it can be converted.
  if (Opc == AMDGPU::V_CNDMASK_B32_e64)
    return false;

  // Check if this instruction has an opcode that supports SDWA.
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);
  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode.
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}
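// Build the SDWA twin of MI: copy dst/src operands and their modifiers, then
// append the SDWA-specific operands (dst_sel, dst_unused, src0_sel, src1_sel)
// with neutral defaults (DWORD selection, padded dst).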
MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  assert(!TII->isSDWA(Opcode));

  int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
  if (SDWAOpcode == -1)
    SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of MI and initialize its operands.
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst: if present in the original, it should also be present in SDWA.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0 and initialize src0_modifiers.
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst.
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise.
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp)
    SDWAInst.add(*Clamp);
  else
    SDWAInst.addImm(0);

  // Copy omod if present, initialize otherwise if needed.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod)
      SDWAInst.add(*OMod);
    else
      SDWAInst.addImm(0);
  }

  // Initialize the SDWA-specific operands with neutral defaults.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel))
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused))
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);

  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel))
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel))
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  // ...

  MachineInstr *Ret = SDWAInst.getInstr();
  TII->fixImplicitOperands(*Ret);
  return Ret;
}
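// Driver for a single conversion: create (or clone) the SDWA instruction,
// apply every matched operand pattern to it, and keep it only if at least
// one pattern actually applied.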
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  MachineInstr *SDWAInst;
  if (TII->isSDWA(MI.getOpcode())) {
    // Clone the instruction so that changes made while processing the
    // operands can be revoked if the conversion fails.
    SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
    MI.getParent()->insert(MI.getIterator(), SDWAInst);
  } else {
    SDWAInst = createSDWAVersion(MI);
  }

  // Apply all SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    // There should be no intersection between SDWA operands and potential
    // MIs, e.g.:
    //   v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    //   v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    //   v_add_u32 v3, v4, v2
    // Here it is possible to fold the 2nd instruction into the 3rd
    // (v_add_u32_sdwa) and then try to fold the 1st into the 2nd (which was
    // already destroyed). So if an SDWAOperand is also a potential MI, do
    // not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }

  if (!Converted) {
    SDWAInst->eraseFromParent();
    return false;
  }

  ConvertedInstructions.push_back(SDWAInst);
  for (MachineOperand &MO : SDWAInst->uses()) {
    if (!MO.isReg())
      continue;
    MRI->clearKillFlags(MO.getReg());
  }
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}
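// SDWA instructions allow at most one scalar (SGPR or immediate) source on
// subtargets with hasSDWAScalar (GFX9) and none otherwise; excess scalar
// operands are first copied into fresh VGPRs with V_MOV_B32.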
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = Op.getOperandNo();
    if (Desc.operands()[I].RegClass == -1 ||
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}
bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIPeepholeSDWA().run(MF);
}
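// Main loop: per block, first shrink the blocking pseudos, then rematch,
// collect every convertible instruction together with its operand patterns,
// convert them, and finally legalize the scalar operands of everything that
// was converted.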
bool SIPeepholeSDWA::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.hasSDWA())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    // First pass: shrink carry/select pseudos that have no SDWA form
    // themselves but block matching.
    matchSDWAOperands(MBB);
    for (const auto &OperandPair : SDWAOperands) {
      const auto &Operand = OperandPair.second;
      MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
      if (!PotentialMI)
        continue;

      switch (PotentialMI->getOpcode()) {
      case AMDGPU::V_ADD_CO_U32_e64:
      case AMDGPU::V_SUB_CO_U32_e64:
        pseudoOpConvertToVOP2(*PotentialMI, ST);
        break;
      case AMDGPU::V_CNDMASK_B32_e64:
        convertVcndmaskToVOP2(*PotentialMI, ST);
        break;
      }
    }
    SDWAOperands.clear();

    // Second pass: generate the potential match list and convert.
    matchSDWAOperands(MBB);

    for (const auto &OperandPair : SDWAOperands) {
      const auto &Operand = OperandPair.second;
      MachineInstr *PotentialMI =
          Operand->potentialToConvert(TII, ST, &PotentialMatches);

      if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII))
        PotentialMatches[PotentialMI].push_back(Operand.get());
    }

    for (auto &PotentialPair : PotentialMatches) {
      MachineInstr &PotentialMI = *PotentialPair.first;
      Ret |= convertToSDWA(PotentialMI, PotentialPair.second);
    }

    PotentialMatches.clear();
    SDWAOperands.clear();
  }

  while (!ConvertedInstructions.empty())
    legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);

  return Ret;
}