LLVM 22.0.0git
SIPeepholeSDWA.cpp
Go to the documentation of this file.
1//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This pass tries to apply several peephole SDWA patterns.
10///
11/// E.g. original:
12/// V_LSHRREV_B32_e32 %0, 16, %1
13/// V_ADD_CO_U32_e32 %2, %0, %3
14/// V_LSHLREV_B32_e32 %4, 16, %2
15///
16/// Replace:
17/// V_ADD_CO_U32_sdwa %4, %1, %3
18/// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
19///
20//===----------------------------------------------------------------------===//
21
22#include "SIPeepholeSDWA.h"
23#include "AMDGPU.h"
24#include "GCNSubtarget.h"
26#include "llvm/ADT/MapVector.h"
27#include "llvm/ADT/Statistic.h"
29#include <optional>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "si-peephole-sdwa"
34
35STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
36STATISTIC(NumSDWAInstructionsPeepholed,
37 "Number of instruction converted to SDWA.");
38
39namespace {
40
41bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
42 const SIInstrInfo *TII);
43class SDWAOperand;
44class SDWADstOperand;
45
46using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
48
/// Implementation of the SDWA peephole optimization. Matches shift/mask
/// patterns in each basic block and folds them into SDWA forms of the
/// producing/consuming instructions.
class SIPeepholeSDWA {
private:
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  // Instructions that may become SDWA, with the operand patterns feeding them.
  SDWAOperandsMap PotentialMatches;
  // Instructions already rewritten to SDWA form in this function.
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  // If \p Op is an immediate, or a foldable copy of one, return its value.
  std::optional<int64_t> foldToImm(const MachineOperand &Op) const;

  // Scan \p MBB and record every instruction matching an SDWA pattern.
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
  MachineInstr *createSDWAVersion(MachineInstr &MI);
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

public:
  // Entry point shared by the legacy and new pass manager wrappers.
  bool run(MachineFunction &MF);
};
73
/// Legacy pass manager wrapper around SIPeepholeSDWA.
class SIPeepholeSDWALegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // This peephole rewrites instructions in place; it never alters the CFG.
    AU.setPreservesCFG();
  }
};
89
90using namespace AMDGPU::SDWA;
91
/// Abstract base for a matched SDWA pattern. A pattern relates two operands:
/// the operand that will appear in the converted SDWA instruction (Target)
/// and the operand it replaces (Replaced).
class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

  /// Returns true iff the SDWA selection of this SDWAOperand can be combined
  /// with the SDWA selections of its uses in \p MI.
  virtual bool canCombineSelections(const MachineInstr &MI,
                                    const SIInstrInfo *TII) = 0;

public:
  // Both operands must be register operands (checked eagerly).
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  /// Return the instruction that could be rewritten to SDWA form for this
  /// pattern, or nullptr. With \p PotentialMatches set, implementations may
  /// instead record one entry per use in the map.
  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                           const GCNSubtarget &ST,
                                           SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  /// Apply this pattern to the SDWA instruction \p MI; returns false if the
  /// conversion turns out to be illegal.
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  // MachineOperand -> MachineInstr -> MachineBasicBlock -> MachineFunction.
  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};
129
/// An SDWA pattern expressible as a *source* selection: the consumer of
/// Replaced can read a byte/word of Target directly via src_sel, optionally
/// with abs/neg (float) or sext (integer) modifiers.
class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel; // Which byte/word of the source register to read.
  bool Abs;       // Apply the |x| float modifier.
  bool Neg;       // Apply the -x float modifier.
  bool Sext;      // Sign-extend the selected bits (integer modifier).

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  // Compute the src_modifiers immediate to install alongside SrcOp.
  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
163
/// An SDWA pattern expressible as a *destination* selection: the producer of
/// Replaced can write the byte/word of Target directly via dst_sel, with the
/// unused bits handled according to DstUn.
class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;  // Which byte/word of the destination register is written.
  DstUnused DstUn; // Policy for the bits not covered by DstSel.

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
188
/// A destination pattern whose unused destination bits are preserved from
/// another register (dst_unused:UNUSED_PRESERVE), matched from v_or_b32 of
/// an SDWA result with another value.
class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve; // Register supplying the preserved bits.

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
209
210} // end anonymous namespace
211
212INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
213 false)
214
215char SIPeepholeSDWALegacy::ID = 0;
216
217char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID;
218
220 return new SIPeepholeSDWALegacy();
221}
222
223#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
225 switch(Sel) {
226 case BYTE_0: OS << "BYTE_0"; break;
227 case BYTE_1: OS << "BYTE_1"; break;
228 case BYTE_2: OS << "BYTE_2"; break;
229 case BYTE_3: OS << "BYTE_3"; break;
230 case WORD_0: OS << "WORD_0"; break;
231 case WORD_1: OS << "WORD_1"; break;
232 case DWORD: OS << "DWORD"; break;
233 }
234 return OS;
235}
236
238 switch(Un) {
239 case UNUSED_PAD: OS << "UNUSED_PAD"; break;
240 case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
241 case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
242 }
243 return OS;
244}
245
247void SDWASrcOperand::print(raw_ostream& OS) const {
248 OS << "SDWA src: " << *getTargetOperand()
249 << " src_sel:" << getSrcSel()
250 << " abs:" << getAbs() << " neg:" << getNeg()
251 << " sext:" << getSext() << '\n';
252}
253
255void SDWADstOperand::print(raw_ostream& OS) const {
256 OS << "SDWA dst: " << *getTargetOperand()
257 << " dst_sel:" << getDstSel()
258 << " dst_unused:" << getDstUnused() << '\n';
259}
260
262void SDWADstPreserveOperand::print(raw_ostream& OS) const {
263 OS << "SDWA preserve dst: " << *getTargetOperand()
264 << " dst_sel:" << getDstSel()
265 << " preserve:" << *getPreservedOperand() << '\n';
266}
267
268#endif
269
270static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
271 assert(To.isReg() && From.isReg());
272 To.setReg(From.getReg());
273 To.setSubReg(From.getSubReg());
274 To.setIsUndef(From.isUndef());
275 if (To.isUse()) {
276 To.setIsKill(From.isKill());
277 } else {
278 To.setIsDead(From.isDead());
279 }
280}
281
282static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
283 return LHS.isReg() &&
284 RHS.isReg() &&
285 LHS.getReg() == RHS.getReg() &&
286 LHS.getSubReg() == RHS.getSubReg();
287}
288
290 const MachineRegisterInfo *MRI) {
291 if (!Reg->isReg() || !Reg->isDef())
292 return nullptr;
293
294 return MRI->getOneNonDBGUse(Reg->getReg());
295}
296
298 const MachineRegisterInfo *MRI) {
299 if (!Reg->isReg())
300 return nullptr;
301
302 return MRI->getOneDef(Reg->getReg());
303}
304
305/// Combine an SDWA instruction's existing SDWA selection \p Sel with
306/// the SDWA selection \p OperandSel of its operand. If the selections
307/// are compatible, return the combined selection, otherwise return a
308/// nullopt.
309/// For example, if we have Sel = BYTE_0 Sel and OperandSel = WORD_1:
310/// BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X)
311static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) {
312 if (Sel == SdwaSel::DWORD)
313 return OperandSel;
314
315 if (Sel == OperandSel || OperandSel == SdwaSel::DWORD)
316 return Sel;
317
318 if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 ||
319 Sel == SdwaSel::BYTE_3)
320 return {};
321
322 if (OperandSel == SdwaSel::WORD_0)
323 return Sel;
324
325 if (OperandSel == SdwaSel::WORD_1) {
326 if (Sel == SdwaSel::BYTE_0)
327 return SdwaSel::BYTE_2;
328 if (Sel == SdwaSel::BYTE_1)
329 return SdwaSel::BYTE_3;
330 if (Sel == SdwaSel::WORD_0)
331 return SdwaSel::WORD_1;
332 }
333
334 return {};
335}
336
/// Build the src_modifiers immediate for \p SrcOp: start from the modifiers
/// already present on the matching source slot of SrcOp's instruction, then
/// fold in this operand's Abs/Neg/Sext flags.
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  // Pick up existing modifiers from whichever slot (src0/src1) SrcOp is.
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    // XOR rather than OR: a NEG already present in the existing modifiers
    // and this operand's NEG cancel each other out.
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}
361
/// Without \p PotentialMatches: return the single user of the replaced
/// register if its selection is combinable, else nullptr. With
/// \p PotentialMatches: require *every* user to be SDWA-convertible and
/// combinable, record this operand against each user in the map, and
/// return nullptr.
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all can be converted
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted
      if (!isConvertibleToSDWA(UseMI, ST, TII) ||
          !canCombineSelections(UseMI, TII))
        return nullptr;

    // Now that it's guaranteed all uses are legal, iterate over the uses again
    // to add them for later conversion.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      // Should not get a subregister here
      assert(isSameReg(UseMO, *Reg));

      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    return nullptr;
  }

  // For SDWA src operand potential instruction is one that use register
  // defined by parent instruction
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  MachineInstr *Parent = PotentialMO->getParent();

  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}
400
/// Install this source pattern into the SDWA instruction \p MI: find the
/// source slot (src0/src1, or the tied UNUSED_PRESERVE slot) that reads the
/// replaced register, redirect it to the target register, and update the
/// matching src_sel/src_modifiers. Returns false if the conversion is not
/// legal for \p MI.
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    return false;
  case AMDGPU::V_CNDMASK_B32_sdwa:
    // SISrcMods uses the same bitmask for SEXT and NEG modifiers and
    // hence the compiler can only support one type of modifier for
    // each SDWA instruction. For V_CNDMASK_B32_sdwa, this is NEG
    // since its operands get printed using
    // AMDGPUInstPrinter::printOperandAndFPInputMods which produces
    // the output intended for NEG if SEXT is set.
    //
    // The ISA does actually support both modifiers on most SDWA
    // instructions.
    //
    // FIXME Accept SEXT here after fixing this issue.
    if (Sext)
      return false;
    break;
  }

  // Find operand in instruction that matches source operand and replace it with
  // target operand. Set corresponding src_sel
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          // Tied slot has no sel/modifiers operands of its own.
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
      // src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    // Fold our selection into any selection MI already applies. The
    // combination was validated by canCombineSelections during matching.
    SdwaSel ExistingSel = static_cast<SdwaSel>(SrcSel->getImm());
    SrcSel->setImm(*combineSdwaSel(ExistingSel, getSrcSel()));
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}
499
500/// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA
501/// instruction \p MI can be combined with the selection \p OpSel.
502static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
503 AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) {
504 assert(TII->isSDWA(MI.getOpcode()));
505
506 const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, SrcSelOpName);
507 SdwaSel SrcSel = static_cast<SdwaSel>(SrcSelOp->getImm());
508
509 return combineSdwaSel(SrcSel, OpSel).has_value();
510}
511
512/// Verify that \p Op is the same register as the operand of the SDWA
513/// instruction \p MI named by \p SrcOpName and that the SDWA
514/// selection \p SrcSelOpName can be combined with the \p OpSel.
515static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
516 AMDGPU::OpName SrcOpName,
517 AMDGPU::OpName SrcSelOpName, MachineOperand *Op,
518 SdwaSel OpSel) {
519 assert(TII->isSDWA(MI.getOpcode()));
520
521 const MachineOperand *Src = TII->getNamedOperand(MI, SrcOpName);
522 if (!Src || !isSameReg(*Src, *Op))
523 return true;
524
525 return canCombineOpSel(MI, TII, SrcSelOpName, OpSel);
526}
527
528bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI,
529 const SIInstrInfo *TII) {
530 if (!TII->isSDWA(MI.getOpcode()))
531 return true;
532
533 using namespace AMDGPU;
534
535 return canCombineOpSel(MI, TII, OpName::src0, OpName::src0_sel,
536 getReplacedOperand(), getSrcSel()) &&
537 canCombineOpSel(MI, TII, OpName::src1, OpName::src1_sel,
538 getReplacedOperand(), getSrcSel());
539}
540
541MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
542 const GCNSubtarget &ST,
543 SDWAOperandsMap *PotentialMatches) {
544 // For SDWA dst operand potential instruction is one that defines register
545 // that this operand uses
546 MachineRegisterInfo *MRI = getMRI();
547 MachineInstr *ParentMI = getParentInst();
548
549 MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
550 if (!PotentialMO)
551 return nullptr;
552
553 // Check that ParentMI is the only instruction that uses replaced register
554 for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
555 if (&UseInst != ParentMI)
556 return nullptr;
557 }
558
559 MachineInstr *Parent = PotentialMO->getParent();
560 return canCombineSelections(*Parent, TII) ? Parent : nullptr;
561}
562
/// Rewrite the SDWA instruction \p MI to write directly into this pattern's
/// target register with the combined dst_sel and this pattern's dst_unused,
/// then erase the now-redundant parent instruction.
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);

  // Fold our selection into any dst_sel MI already has; presumably guaranteed
  // to succeed by the canCombineSelections check during matching.
  SdwaSel ExistingSel = static_cast<SdwaSel>(DstSel->getImm());
  DstSel->setImm(combineSdwaSel(ExistingSel, getDstSel()).value());

  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  DstUnused->setImm(getDstUnused());

  // Remove original instruction because it would conflict with our new
  // instruction by register definition
  getParentInst()->eraseFromParent();
  return true;
}
595
596bool SDWADstOperand::canCombineSelections(const MachineInstr &MI,
597 const SIInstrInfo *TII) {
598 if (!TII->isSDWA(MI.getOpcode()))
599 return true;
600
601 return canCombineOpSel(MI, TII, AMDGPU::OpName::dst_sel, getDstSel());
602}
603
/// Convert for the UNUSED_PRESERVE case: move \p MI directly before the
/// v_or_b32 (the parent instruction), tie the preserved register to vdst as
/// an implicit use, then finish via the base-class conversion (which also
/// erases the v_or_b32).
bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter problem with use of killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add Implicit use of preserved register
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             // NOTE(review): the subregister index appears to be passed in
             // addReg's flags parameter here; upstream LLVM passes
             // RegState::Implicit before the subregister -- verify against
             // the original source (a line may be missing from this copy).
             getPreservedOperand()->getSubReg());

  // Tie dst to implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}
632
/// The preserve variant adds no constraints beyond the base-class dst_sel
/// compatibility check.
bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI,
                                                  const SIInstrInfo *TII) {
  return SDWADstOperand::canCombineSelections(MI, TII);
}
637
638std::optional<int64_t>
639SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
640 if (Op.isImm()) {
641 return Op.getImm();
642 }
643
644 // If this is not immediate then it can be copy of immediate value, e.g.:
645 // %1 = S_MOV_B32 255;
646 if (Op.isReg()) {
647 for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
648 if (!isSameReg(Op, Def))
649 continue;
650
651 const MachineInstr *DefInst = Def.getParent();
652 if (!TII->isFoldableCopy(*DefInst))
653 return std::nullopt;
654
655 const MachineOperand &Copied = DefInst->getOperand(1);
656 if (!Copied.isImm())
657 return std::nullopt;
658
659 return Copied.getImm();
660 }
661 }
662
663 return std::nullopt;
664}
665
/// Try to recognize \p MI as an instruction whose effect can be expressed as
/// an SDWA selection: 32-bit shifts by 16/24, 16-bit shifts by 8, BFE with a
/// byte/word-aligned field, AND with a 0xff/0xffff mask, or a v_or_b32 that
/// merges an SDWA result with another SDWA result (UNUSED_PRESERVE pattern).
/// Returns the matching SDWAOperand, or nullptr if nothing matches.
std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    // Only virtual registers are tracked by this peephole.
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    // Arithmetic (not logical) right shifts request sign extension.
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
        Opcode != AMDGPU::V_LSHRREV_B32_e64);
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_opsel_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_opsel_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64)
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    // Arithmetic right shifts request sign extension.
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
        Opcode != AMDGPU::V_LSHRREV_B16_opsel_e64 &&
        Opcode != AMDGPU::V_LSHRREV_B16_e64);
    break;
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    // Signed BFE requests sign extension of the selected field.
    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto *ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    // The mask may be in either source slot; ValSrc is the other one.
    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src1_sel:WORD_1 src2_sel:WORD1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of operands of v_or_b32 is SDWA instruction
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
          if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
            return CheckRetType(std::nullopt);

          MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
          if (!Op1Def)
            return CheckRetType(std::nullopt);

          MachineInstr *Op1Inst = Op1Def->getParent();
          if (!TII->isSDWA(*Op1Inst))
            return CheckRetType(std::nullopt);

          MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
          if (!Op2Def)
            return CheckRetType(std::nullopt);

          return CheckRetType(std::pair(Op1Def, Op2Def));
        };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      // Try the operands in the opposite order.
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInstr is actually bitwise compatible with SDWAInst = their
    // destination patterns don't overlap. Compatible instruction can be either
    // regular instruction with compatible bitness or SDWA instruction with
    // correct dst_sel
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3. WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24-bit
    // out of full register size and all registers are at min 32-bit wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);

  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}
970
971#if !defined(NDEBUG)
972static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
973 Operand.print(OS);
974 return OS;
975}
976#endif
977
978void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
979 for (MachineInstr &MI : MBB) {
980 if (auto Operand = matchSDWAOperand(MI)) {
981 LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
982 SDWAOperands[&MI] = std::move(Operand);
983 ++NumSDWAPatternsFound;
984 }
985 }
986}
987
988// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
989// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
990// V_ADD_CO_U32_sdwa.
991//
992// We are transforming from a VOP3 into a VOP2 form of the instruction.
993// %19:vgpr_32 = V_AND_B32_e32 255,
994// killed %16:vgpr_32, implicit $exec
995// %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
996// %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
997// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
998// %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
999//
1000// becomes
1001// %47:vgpr_32 = V_ADD_CO_U32_sdwa
1002// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
1003// implicit-def $vcc, implicit $exec
1004// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
1005// %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
1006void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
1007 const GCNSubtarget &ST) const {
1008 int Opc = MI.getOpcode();
1009 assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
1010 "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");
1011
1012 // Can the candidate MI be shrunk?
1013 if (!TII->canShrink(MI, *MRI))
1014 return;
1016 // Find the related ADD instruction.
1017 const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
1018 if (!Sdst)
1019 return;
1020 MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
1021 if (!NextOp)
1022 return;
1023 MachineInstr &MISucc = *NextOp->getParent();
1024
1025 // Make sure the carry in/out are subsequently unused.
1026 MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
1027 if (!CarryIn)
1028 return;
1029 MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
1030 if (!CarryOut)
1031 return;
1032 if (!MRI->hasOneNonDBGUse(CarryIn->getReg()) ||
1033 !MRI->use_nodbg_empty(CarryOut->getReg()))
1034 return;
1035 // Make sure VCC or its subregs are dead before MI.
1036 MachineBasicBlock &MBB = *MI.getParent();
1038 MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
1039 if (Liveness != MachineBasicBlock::LQR_Dead)
1040 return;
1041 // Check if VCC is referenced in range of (MI,MISucc].
1042 for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
1043 I != E; ++I) {
1044 if (I->modifiesRegister(AMDGPU::VCC, TRI))
1045 return;
1046 }
1047
1048 // Replace MI with V_{SUB|ADD}_I32_e32
1049 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
1050 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
1051 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
1052 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
1053 .setMIFlags(MI.getFlags());
1054
1055 MI.eraseFromParent();
1056
1057 // Since the carry output of MI is now VCC, update its use in MISucc.
1058
1059 MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
1060}
1061
1062/// Try to convert an \p MI in VOP3 which takes an src2 carry-in
1063/// operand into the corresponding VOP2 form which expects the
1064/// argument in VCC. To this end, add an copy from the carry-in to
1065/// VCC. The conversion will only be applied if \p MI can be shrunk
1066/// to VOP2 and if VCC can be proven to be dead before \p MI.
1067void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
1068 const GCNSubtarget &ST) const {
1069 assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);
1070
1071 LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
1072 if (!TII->canShrink(MI, *MRI)) {
1073 LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
1074 return;
1075 }
1076
1077 const MachineOperand &CarryIn =
1078 *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
1079 Register CarryReg = CarryIn.getReg();
1080 MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
1081 if (!CarryDef) {
1082 LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
1083 return;
1084 }
1085
1086 // Make sure VCC or its subregs are dead before MI.
1087 MCRegister Vcc = TRI->getVCC();
1088 MachineBasicBlock &MBB = *MI.getParent();
1091 if (Liveness != MachineBasicBlock::LQR_Dead) {
1092 LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
1093 return;
1094 }
1095
1096 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);
1097
1098 auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
1099 TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
1100 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
1101 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
1102 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
1103 .setMIFlags(MI.getFlags());
1104 TII->fixImplicitOperands(*Converted);
1105 LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
1106 (void)Converted;
1107 MI.eraseFromParent();
1108}
1109
1110namespace {
1111bool isConvertibleToSDWA(MachineInstr &MI,
1112 const GCNSubtarget &ST,
1113 const SIInstrInfo* TII) {
1114 // Check if this is already an SDWA instruction
1115 unsigned Opc = MI.getOpcode();
1116 if (TII->isSDWA(Opc))
1117 return true;
1118
1119 // Can only be handled after ealier conversion to
1120 // AMDGPU::V_CNDMASK_B32_e32 which is not always possible.
1121 if (Opc == AMDGPU::V_CNDMASK_B32_e64)
1122 return false;
1123
1124 // Check if this instruction has opcode that supports SDWA
1125 if (AMDGPU::getSDWAOp(Opc) == -1)
1127
1128 if (AMDGPU::getSDWAOp(Opc) == -1)
1129 return false;
1130
1131 if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1132 return false;
1133
1134 if (TII->isVOPC(Opc)) {
1135 if (!ST.hasSDWASdst()) {
1136 const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
1137 if (SDst && (SDst->getReg() != AMDGPU::VCC &&
1138 SDst->getReg() != AMDGPU::VCC_LO))
1139 return false;
1140 }
1141
1142 if (!ST.hasSDWAOutModsVOPC() &&
1143 (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
1144 TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
1145 return false;
1146
1147 } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
1148 !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1149 return false;
1150 }
1151
1152 if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
1153 Opc == AMDGPU::V_FMAC_F32_e32 ||
1154 Opc == AMDGPU::V_MAC_F16_e32 ||
1155 Opc == AMDGPU::V_MAC_F32_e32))
1156 return false;
1157
1158 // Check if target supports this SDWA opcode
1159 if (TII->pseudoToMCOpcode(Opc) == -1)
1160 return false;
1161
1162 if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
1163 if (!Src0->isReg() && !Src0->isImm())
1164 return false;
1165 }
1166
1167 if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
1168 if (!Src1->isReg() && !Src1->isImm())
1169 return false;
1170 }
1171
1172 return true;
1173}
1174} // namespace
1175
/// Build the SDWA-encoded counterpart of the non-SDWA instruction \p MI and
/// insert it immediately before \p MI. Operands are appended strictly in the
/// order the SDWA MCInstrDesc expects: dst, (mods, src0), (mods, src1),
/// [src2], clamp, [omod], dst_sel, dst_unused, src0_sel, [src1_sel].
/// \p MI itself is left in place; the caller is responsible for erasing it.
MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  assert(!TII->isSDWA(Opcode));

  // Look up the SDWA opcode directly; if the instruction only has an SDWA
  // form via its e32 (VOP2) encoding, go through that.
  int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
  if (SDWAOpcode == -1)
    SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst, if it is present in original then should also be present in SDWA
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    // No explicit destination in MI (VOPC-style compare): the SDWA form still
    // needs an sdst, so define VCC.
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  // Modifiers must be appended before the source operand itself.
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Initialize SDWA specific operands.
  // All selections start as full-DWORD; the SDWAOperand patterns narrow them
  // later in convertToSDWA.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel))
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused))
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);

  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Check for a preserved register that needs to be copied.
  MachineInstr *Ret = SDWAInst.getInstr();
  TII->fixImplicitOperands(*Ret);
  return Ret;
}
1277
/// Apply all matched \p SDWAOperands to \p MI by rewriting it (or a clone of
/// it, if MI is already SDWA) into an SDWA instruction. Returns true and
/// erases MI on success; on failure the tentative instruction is erased and
/// MI is left untouched.
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  MachineInstr *SDWAInst;
  if (TII->isSDWA(MI.getOpcode())) {
    // Clone the instruction to allow revoking changes
    // made to MI during the processing of the operands
    // if the conversion fails.
    SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
    MI.getParent()->insert(MI.getIterator(), SDWAInst);
  } else {
    SDWAInst = createSDWAVersion(MI);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold 2nd instruction into
    // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that
    // was already destroyed). So if SDWAOperand is also a potential MI then do
    // not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }

  if (!Converted) {
    // No operand applied: discard the tentative instruction, keep MI.
    SDWAInst->eraseFromParent();
    return false;
  }

  ConvertedInstructions.push_back(SDWAInst);
  // Kill flags on the new instruction's uses may no longer be accurate after
  // the rewrite; conservatively clear them.
  for (MachineOperand &MO : SDWAInst->uses()) {
    if (!MO.isReg())
      continue;

    MRI->clearKillFlags(MO.getReg());
  }
  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}
1329
// If an instruction was converted to SDWA it should not have immediates or SGPR
// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    // Only immediates and non-VGPR registers need legalization.
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    // Skip operands whose register class does not accept both SGPRs and
    // VGPRs (VS superclass) — those are not constant-bus candidates.
    unsigned I = Op.getOperandNo();
    if (Desc.operands()[I].RegClass == -1 ||
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
      continue;

    // GFX9+ allows a single SGPR operand on the constant bus; keep the first
    // one as-is.
    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    // Materialize the scalar/immediate into a fresh VGPR via V_MOV_B32 and
    // rewrite the operand to use it.
    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}
1362
1363bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
1364 if (skipFunction(MF.getFunction()))
1365 return false;
1366
1367 return SIPeepholeSDWA().run(MF);
1368}
1369
/// Main driver: iterate each basic block to a fixed point, first lowering
/// convertible pseudo/VOP3 forms to VOP2, then matching SDWA operand patterns
/// and folding them into SDWA instructions. Returns true if anything changed.
bool SIPeepholeSDWA::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    // Iterate until no more conversions happen: one folding round may expose
    // new matchable patterns.
    do {
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
        if (!PotentialMI)
          continue;

        switch (PotentialMI->getOpcode()) {
        case AMDGPU::V_ADD_CO_U32_e64:
        case AMDGPU::V_SUB_CO_U32_e64:
          pseudoOpConvertToVOP2(*PotentialMI, ST);
          break;
        case AMDGPU::V_CNDMASK_B32_e64:
          convertVcndmaskToVOP2(*PotentialMI, ST);
          break;
        };
      }
      // The preprocessing above may have invalidated matches; rescan below.
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI =
            Operand->potentialToConvert(TII, ST, &PotentialMatches);

        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII))
          PotentialMatches[PotentialMI].push_back(Operand.get());
      }

      // Fold each candidate's matched operands into an SDWA instruction.
      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      // Converted instructions may still carry immediates/SGPRs illegal for
      // SDWA encodings; move them into VGPRs.
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}
1439
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:58
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
static MachineOperand * findSingleRegDef(const MachineOperand *Reg, const MachineRegisterInfo *MRI)
static void copyRegOperand(MachineOperand &To, const MachineOperand &From)
static MachineOperand * findSingleRegUse(const MachineOperand *Reg, const MachineRegisterInfo *MRI)
static std::optional< SdwaSel > combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel)
Combine an SDWA instruction's existing SDWA selection Sel with the SDWA selection OperandSel of its o...
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS)
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII, AMDGPU::OpName SrcSelOpName, SdwaSel OpSel)
Verify that the SDWA selection operand SrcSelOpName of the SDWA instruction MI can be combined with t...
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
Value * RHS
Value * LHS
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool hasOptNone() const
Do not optimize this function (-O0).
Definition Function.h:700
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LivenessQueryResult
Possible outcome of a register liveness query to computeRegisterLiveness()
@ LQR_Dead
Register is known to be fully dead.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
LLVM_ABI void substituteRegister(Register FromReg, Register ToReg, unsigned SubIdx, const TargetRegisterInfo &RegInfo)
Replace all occurrences of FromReg with ToReg:SubIdx, properly composing subreg indices where necessa...
mop_range uses()
Returns all operands which may be register uses.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int getSDWAOp(uint16_t Opcode)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Define
Register definition.
@ Kill
The last use of a register.
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:477
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
FunctionPass * createSIPeepholeSDWALegacyPass()
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
DWARFExpression::Operation Op
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
char & SIPeepholeSDWALegacyID