42 #include "llvm/Config/llvm-config.h" 52 #include <unordered_map> 56 #define DEBUG_TYPE "si-peephole-sdwa" 58 STATISTIC(NumSDWAPatternsFound,
"Number of SDWA patterns found.");
60 "Number of instruction converted to SDWA.");
76 std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
77 std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
95 bool convertToSDWA(
MachineInstr &
MI,
const SDWAOperandsVector &SDWAOperands);
98 StringRef getPassName()
const override {
return "SI Peephole SDWA"; }
113 : Target(TargetOp), Replaced(ReplacedOp) {
118 virtual ~SDWAOperand() =
default;
128 return &getParentInst()->getParent()->getParent()->getRegInfo();
131 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 139 class SDWASrcOperand :
public SDWAOperand {
148 SdwaSel SrcSel_ =
DWORD,
bool Abs_ =
false,
bool Neg_ =
false,
150 : SDWAOperand(TargetOp, ReplacedOp),
151 SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
156 SdwaSel getSrcSel()
const {
return SrcSel; }
157 bool getAbs()
const {
return Abs; }
158 bool getNeg()
const {
return Neg; }
159 bool getSext()
const {
return Sext; }
164 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 169 class SDWADstOperand :
public SDWAOperand {
178 : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
183 SdwaSel getDstSel()
const {
return DstSel; }
184 DstUnused getDstUnused()
const {
return DstUn; }
186 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 191 class SDWADstPreserveOperand :
public SDWADstOperand {
199 Preserve(PreserveOp) {}
205 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 214 char SIPeepholeSDWA::ID = 0;
219 return new SIPeepholeSDWA();
223 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 226 case BYTE_0: OS <<
"BYTE_0";
break;
227 case BYTE_1: OS <<
"BYTE_1";
break;
228 case BYTE_2: OS <<
"BYTE_2";
break;
229 case BYTE_3: OS <<
"BYTE_3";
break;
230 case WORD_0: OS <<
"WORD_0";
break;
231 case WORD_1: OS <<
"WORD_1";
break;
232 case DWORD: OS <<
"DWORD";
break;
253 OS <<
"SDWA src: " << *getTargetOperand()
254 <<
" src_sel:" << getSrcSel()
255 <<
" abs:" << getAbs() <<
" neg:" << getNeg()
256 <<
" sext:" << getSext() <<
'\n';
261 OS <<
"SDWA dst: " << *getTargetOperand()
262 <<
" dst_sel:" << getDstSel()
263 <<
" dst_unused:" << getDstUnused() <<
'\n';
268 OS <<
"SDWA preserve dst: " << *getTargetOperand()
269 <<
" dst_sel:" << getDstSel()
270 <<
" preserve:" << *getPreservedOperand() <<
'\n';
288 return LHS.
isReg() &&
325 for (
auto &DefMO : DefInstr->
defs()) {
326 if (DefMO.isReg() && DefMO.getReg() == Reg->
getReg())
334 uint64_t SDWASrcOperand::getSrcMods(
const SIInstrInfo *TII,
340 Mods =
Mod->getImm();
344 Mods =
Mod->getImm();
349 "Float and integer src modifiers can't be set simulteniously");
372 bool IsPreserveSrc =
false;
378 if (!
isSameReg(*Src, *getReplacedOperand())) {
385 !
isSameReg(*Src, *getReplacedOperand())) {
406 IsPreserveSrc =
true;
408 AMDGPU::OpName::vdst);
421 if ((MI.
getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
422 MI.
getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
423 !
isSameReg(*Src, *getReplacedOperand())) {
430 (IsPreserveSrc || (SrcSel && SrcMods)));
433 if (!IsPreserveSrc) {
434 SrcSel->
setImm(getSrcSel());
435 SrcMods->
setImm(getSrcMods(TII, Src));
437 getTargetOperand()->setIsKill(
false);
453 if (&UseInst != ParentMI)
463 if ((MI.
getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
464 MI.
getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
473 isSameReg(*Operand, *getReplacedOperand()));
477 DstSel->
setImm(getDstSel());
480 DstUnused->
setImm(getDstUnused());
484 getParentInst()->eraseFromParent();
488 bool SDWADstPreserveOperand::convertToSDWA(
MachineInstr &MI,
496 getMRI()->clearKillFlags(MO.getReg());
502 MBB->insert(getParentInst(), &MI);
508 getPreservedOperand()->getSubReg());
515 return SDWADstOperand::convertToSDWA(MI, TII);
545 std::unique_ptr<SDWAOperand>
549 case AMDGPU::V_LSHRREV_B32_e32:
550 case AMDGPU::V_ASHRREV_I32_e32:
551 case AMDGPU::V_LSHLREV_B32_e32:
552 case AMDGPU::V_LSHRREV_B32_e64:
553 case AMDGPU::V_ASHRREV_I32_e64:
554 case AMDGPU::V_LSHLREV_B32_e64: {
564 auto Imm = foldToImm(*Src0);
568 if (*Imm != 16 && *Imm != 24)
573 if (TRI->isPhysicalRegister(Src1->
getReg()) ||
574 TRI->isPhysicalRegister(Dst->
getReg()))
577 if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
578 Opcode == AMDGPU::V_LSHLREV_B32_e64) {
579 return make_unique<SDWADstOperand>(
582 return make_unique<SDWASrcOperand>(
584 Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
585 Opcode != AMDGPU::V_LSHRREV_B32_e64);
590 case AMDGPU::V_LSHRREV_B16_e32:
591 case AMDGPU::V_ASHRREV_I16_e32:
592 case AMDGPU::V_LSHLREV_B16_e32:
593 case AMDGPU::V_LSHRREV_B16_e64:
594 case AMDGPU::V_ASHRREV_I16_e64:
595 case AMDGPU::V_LSHLREV_B16_e64: {
605 auto Imm = foldToImm(*Src0);
606 if (!Imm || *Imm != 8)
612 if (TRI->isPhysicalRegister(Src1->
getReg()) ||
613 TRI->isPhysicalRegister(Dst->
getReg()))
616 if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
617 Opcode == AMDGPU::V_LSHLREV_B16_e64) {
620 return make_unique<SDWASrcOperand>(
621 Src1, Dst,
BYTE_1,
false,
false,
622 Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
623 Opcode != AMDGPU::V_LSHRREV_B16_e64);
628 case AMDGPU::V_BFE_I32:
629 case AMDGPU::V_BFE_U32: {
645 auto Offset = foldToImm(*Src1);
650 auto Width = foldToImm(*Src2);
656 if (*
Offset == 0 && *Width == 8)
658 else if (*
Offset == 0 && *Width == 16)
660 else if (*
Offset == 0 && *Width == 32)
662 else if (*
Offset == 8 && *Width == 8)
664 else if (*
Offset == 16 && *Width == 8)
666 else if (*
Offset == 16 && *Width == 16)
668 else if (*
Offset == 24 && *Width == 8)
676 if (TRI->isPhysicalRegister(Src0->
getReg()) ||
677 TRI->isPhysicalRegister(Dst->
getReg()))
680 return make_unique<SDWASrcOperand>(
681 Src0, Dst, SrcSel,
false,
false, Opcode != AMDGPU::V_BFE_U32);
684 case AMDGPU::V_AND_B32_e32:
685 case AMDGPU::V_AND_B32_e64: {
693 auto Imm = foldToImm(*Src0);
696 Imm = foldToImm(*Src1);
700 if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
705 if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
706 TRI->isPhysicalRegister(Dst->
getReg()))
709 return make_unique<SDWASrcOperand>(
713 case AMDGPU::V_OR_B32_e32:
714 case AMDGPU::V_OR_B32_e64: {
725 auto CheckOROperandsForSDWA =
727 if (!Op1 || !Op1->
isReg() || !Op2 || !Op2->isReg())
728 return CheckRetType(
None);
732 return CheckRetType(
None);
735 if (!TII->
isSDWA(*Op1Inst))
736 return CheckRetType(
None);
740 return CheckRetType(
None);
742 return CheckRetType(std::make_pair(Op1Def, Op2Def));
747 assert(OrSDWA && OrOther);
748 auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
752 assert(OrSDWA && OrOther);
753 Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
760 assert(OrSDWADef && OrOtherDef);
785 if (!TII->
isSDWA(*OtherInst))
793 bool DstSelAgree =
false;
796 (OtherDstSel ==
BYTE_3) ||
800 (OtherDstSel ==
BYTE_1) ||
804 (OtherDstSel ==
BYTE_2) ||
805 (OtherDstSel ==
BYTE_3) ||
809 (OtherDstSel ==
BYTE_2) ||
810 (OtherDstSel ==
BYTE_3) ||
814 (OtherDstSel ==
BYTE_1) ||
815 (OtherDstSel ==
BYTE_3) ||
819 (OtherDstSel ==
BYTE_1) ||
820 (OtherDstSel ==
BYTE_2) ||
823 default: DstSelAgree =
false;
839 return make_unique<SDWADstPreserveOperand>(
840 OrDst, OrSDWADef, OrOtherDef, DstSel);
845 return std::unique_ptr<SDWAOperand>(
nullptr);
850 if (
auto Operand = matchSDWAOperand(MI)) {
851 LLVM_DEBUG(
dbgs() <<
"Match: " << MI <<
"To: " << *Operand <<
'\n');
852 SDWAOperands[&
MI] = std::move(Operand);
853 ++NumSDWAPatternsFound;
876 void SIPeepholeSDWA::pseudoOpConvertToVOP2(
MachineInstr &MI,
879 assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
880 "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");
915 if (
I->modifiesRegister(AMDGPU::VCC, TRI))
933 bool SIPeepholeSDWA::isConvertibleToSDWA(
MachineInstr &MI,
953 if (SDst && SDst->
getReg() != AMDGPU::VCC)
967 if (!ST.
hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
968 Opc == AMDGPU::V_MAC_F32_e32))
972 if (Opc == AMDGPU::V_CNDMASK_B32_e32)
979 const SDWAOperandsVector &SDWAOperands) {
986 if (TII->
isSDWA(Opcode)) {
990 if (SDWAOpcode == -1)
995 const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
1026 SDWAInst.
add(*Src0);
1038 SDWAInst.
add(*Src1);
1041 if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
1042 SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
1046 SDWAInst.
add(*Src2);
1053 SDWAInst.
add(*Clamp);
1062 SDWAInst.
add(*OMod);
1072 SDWAInst.
add(*DstSel);
1082 SDWAInst.
add(*DstUnused);
1092 SDWAInst.
add(*Src0Sel);
1102 SDWAInst.
add(*Src1Sel);
1114 assert(Dst && Dst->isTied());
1115 assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
1118 assert(PreserveDstIdx != -1);
1128 bool Converted =
false;
1129 for (
auto &Operand : SDWAOperands) {
1141 if (PotentialMatches.count(Operand->getParentInst()) == 0)
1142 Converted |= Operand->convertToSDWA(*SDWAInst, TII);
1145 ConvertedInstructions.
push_back(SDWAInst);
1152 ++NumSDWAInstructionsPeepholed;
1160 void SIPeepholeSDWA::legalizeScalarOperands(
MachineInstr &MI,
1163 unsigned ConstantBusCount = 0;
1181 TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
1183 Copy.addImm(Op.
getImm());
1184 else if (Op.
isReg())
1204 bool Changed =
false;
1210 matchSDWAOperands(MBB);
1211 for (
const auto &OperandPair : SDWAOperands) {
1212 const auto &Operand = OperandPair.second;
1213 MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
1215 (PotentialMI->
getOpcode() == AMDGPU::V_ADD_I32_e64 ||
1216 PotentialMI->
getOpcode() == AMDGPU::V_SUB_I32_e64))
1217 pseudoOpConvertToVOP2(*PotentialMI, ST);
1219 SDWAOperands.clear();
1222 matchSDWAOperands(MBB);
1224 for (
const auto &OperandPair : SDWAOperands) {
1225 const auto &Operand = OperandPair.second;
1226 MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
1227 if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
1228 PotentialMatches[PotentialMI].push_back(Operand.get());
1232 for (
auto &PotentialPair : PotentialMatches) {
1234 convertToSDWA(PotentialMI, PotentialPair.second);
1237 PotentialMatches.clear();
1238 SDWAOperands.clear();
1240 Changed = !ConvertedInstructions.
empty();
1244 while (!ConvertedInstructions.
empty())
1245 legalizeScalarOperands(*ConvertedInstructions.
pop_back_val(),
ST);
const MachineInstrBuilder & add(const MachineOperand &MO) const
Interface definition for SIRegisterInfo.
A common definition of LaneBitmask for use in TableGen and CodeGen.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at application startup.
AMDGPU specific subclass of TargetSubtarget.
FunctionPass * createSIPeepholeSDWAPass()
This class represents lattice values for constants.
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds...
iterator_range< mop_iterator > uses()
Returns a range that includes all operands that are register uses.
void ChangeToRegister(unsigned Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value...
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, unsigned Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before...
void push_back(const T &Elt)
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
iterator_range< use_nodbg_iterator > use_nodbg_operands(unsigned Reg) const
Describe properties that are true of each instruction in the target description file.
unsigned getReg() const
getReg - Returns the register number.
unsigned getOperandNo(const_mop_iterator I) const
Returns the number of the operand iterator I points to.
void setIsUndef(bool Val=true)
unsigned getSubReg() const
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS)
const SIInstrInfo * getInstrInfo() const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
static MachineOperand * findSingleRegDef(const MachineOperand *Reg, const MachineRegisterInfo *MRI)
STATISTIC(NumFunctions, "Total number of functions")
unsigned const TargetRegisterInfo * TRI
void setIsDead(bool Val=true)
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const
unsigned getNumOperands() const
Returns the total number of operands.
static MachineOperand * findSingleRegUse(const MachineOperand *Reg, const MachineRegisterInfo *MRI)
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
bool hasSDWAOutModsVOPC() const
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
iterator_range< def_iterator > def_operands(unsigned Reg) const
LLVM_READONLY int getSDWAOp(uint16_t Opcode)
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
unsigned const MachineRegisterInfo * MRI
bool isFoldableCopy(const MachineInstr &MI) const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static void copyRegOperand(MachineOperand &To, const MachineOperand &From)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool hasVGPRs(const TargetRegisterClass *RC) const
void initializeSIPeepholeSDWAPass(PassRegistry &)
Register is known to be fully dead.
Represent the analysis usage information of a pass.
iterator_range< mop_iterator > defs()
Returns a range over all explicit operands that are register definitions.
void setImm(int64_t immVal)
FunctionPass class - This class is used to implement most global optimizations.
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
self_iterator getIterator()
iterator_range< mop_iterator > explicit_uses()
void setIsKill(bool Val=true)
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
unsigned findTiedOperandIdx(unsigned OpIdx) const
Given the index of a tied register operand, find the operand it is tied to.
BlockVerifier::State From
LLVM_READONLY int getVOPe32(uint16_t Opcode)
MachineOperand class - Representation of each machine instruction operand.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
LLVM_NODISCARD T pop_back_val()
void setPreservesCFG()
This function should be called by the pass, iff they do not:
MachineInstr * remove(MachineInstr *I)
Remove the unbundled instruction from the instruction list without deleting it.
MachineInstr * getUniqueVRegDef(unsigned Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register, or null if none is found.
const Function & getFunction() const
Return the LLVM function that this machine code represents.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
The access may modify the value stored in memory.
Target - Wrapper for Target specific information.
static unsigned getReg(const void *D, unsigned RC, unsigned RegNo)
bool use_empty(unsigned RegNo) const
use_empty - Return true if there are no instructions using the specified register.
const MachineBasicBlock * getParent() const
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Provides AMDGPU specific target descriptions.
Representation of each machine instruction.
Interface definition for SIInstrInfo.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
bool hasOneUse(unsigned RegNo) const
hasOneUse - Return true if there is exactly one instruction using the specified register.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
LLVM_NODISCARD bool empty() const
static bool isVOPC(const MachineInstr &MI)
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
void setReg(unsigned Reg)
Change the register this operand corresponds to.
void setSubReg(unsigned subReg)
const MachineInstrBuilder & addReg(unsigned RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
raw_ostream & operator<<(raw_ostream &OS, const APInt &I)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool hasSDWAScalar() const
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSDWA(const MachineInstr &MI)
const MCOperandInfo * OpInfo
This class implements an extremely fast bulk output stream that can only output to a stream...
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
const MachineOperand & getOperand(unsigned i) const
iterator_range< use_instr_nodbg_iterator > use_nodbg_instructions(unsigned Reg) const
unsigned createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified register class.
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
const SIRegisterInfo * getRegisterInfo() const override