#include <unordered_map>

using namespace llvm;

#define DEBUG_TYPE "aarch64-simd-instr-opt"
STATISTIC(NumModifiedInstr,
          "Number of SIMD instructions modified");
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 SIMD instructions optimization pass"
constexpr unsigned MaxNumRepl = 10;

class AArch64SIMDInstrOptImpl {
public:
  const TargetInstrInfo *TII;
  MachineRegisterInfo *MRI;
  TargetSchedModel SchedModel;
  // Decisions are cached in the two maps below so that the scheduling model
  // is queried only once per (opcode, subtarget) pair.
  using SIMDInstrTableMap =
      std::map<std::pair<unsigned, std::string>, bool>;
  using InterlEarlyExitMap = std::unordered_map<std::string, bool>;

  // Cached rewrite decisions, keyed on (opcode, subtarget CPU name).
  SIMDInstrTableMap &SIMDInstrTable;

  // Cached early-exit decisions for the interleaved-store subpass, keyed on
  // the subtarget CPU name.
  InterlEarlyExitMap &InterlEarlyExit;

  // The two subpasses of this pass.
  typedef enum { VectorElem, Interleave } Subpass;
  // A rewrite rule: the instruction with opcode OrigOpc is replaced by the
  // NumRepl instructions in ReplOpc; new virtual registers for the
  // replacement sequence are created in register class RC.
  struct InstReplInfo {
    unsigned OrigOpc;
    unsigned ReplOpc[MaxNumRepl];
    unsigned NumRepl;
    const TargetRegisterClass *RC;
  };
#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC)                               \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, 3, &RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6,       \
                OpcR7, OpcR8, OpcR9, RC)                                       \
  {OpcOrg,                                                                     \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9},     \
   10, &RC}
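// For illustration, the first RuleST2 entry in the rewrite table (IRT) below,
//   RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
//           AArch64::STPQi, AArch64::FPR128RegClass)
// expands to the initializer
//   {AArch64::ST2Twov2d,
//    {AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::STPQi}, 3,
//    &AArch64::FPR128RegClass}
// i.e. "replace ST2 of two 2x64-bit vectors by ZIP1 + ZIP2 + one STP".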
  AArch64SIMDInstrOptImpl(SIMDInstrTableMap &SIMDInstrTable,
                          InterlEarlyExitMap &InterlEarlyExit)
      : SIMDInstrTable(SIMDInstrTable), InterlEarlyExit(InterlEarlyExit) {}

  // Return true if replacing the instruction described by InstDesc with the
  // sequence in ReplInstrMCID is expected to be profitable on this subtarget.
  bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                         SmallVectorImpl<const MCInstrDesc *> &ReplInstrMCID);
  // Return true if the given subpass cannot be profitable for this function,
  // so that scanning the instructions can be skipped entirely.
  bool shouldExitEarly(MachineFunction *MF, Subpass SP);

  // Look backwards for an equivalent DUP of the same source register and
  // lane; if found, return its destination register through DestReg.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;

  // Rewrite a multiply/accumulate-by-element instruction as DUP plus the
  // vector-by-vector form.
  bool optimizeVectElement(MachineInstr &MI);

  // Collect the source registers and kill states of the REG_SEQUENCE feeding
  // an interleaving store.
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned *StReg,
                         RegState *StRegKill, unsigned NumArg) const;

  // Rewrite an ST2/ST4 interleaving store as ZIP instructions plus STP.
  bool optimizeLdStInterleave(MachineInstr &MI);

  // Return the number of source registers of the store (2 for ST2, 4 for
  // ST4).
  unsigned determineSrcReg(MachineInstr &MI) const;

  bool run(MachineFunction &MF);
};
class AArch64SIMDInstrOptLegacy : public MachineFunctionPass {
public:
  static char ID;

  AArch64SIMDInstrOptImpl::SIMDInstrTableMap SIMDInstrTable;
  AArch64SIMDInstrOptImpl::InterlEarlyExitMap InterlEarlyExit;

  AArch64SIMDInstrOptLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &Fn) override;

  StringRef getPassName() const override {
    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
  }
};
char AArch64SIMDInstrOptLegacy::ID = 0;

INITIALIZE_PASS(AArch64SIMDInstrOptLegacy, DEBUG_TYPE,
                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
// The rewrite table: each entry maps an ST2/ST4 opcode to the ZIP/STP
// sequence that replaces it.
constexpr AArch64SIMDInstrOptImpl::InstReplInfo IRT[] = {
  // ST2 rules
  RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::STPDi, AArch64::FPR64RegClass),
  RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::STPDi, AArch64::FPR64RegClass),
  RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::STPDi, AArch64::FPR64RegClass),

  // ST4 rules
  RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
          AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
          AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
          AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
  RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
          AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
          AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
  RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
          AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
          AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)};
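// Illustrative sketch of one ST2 rule (ST2Twov4s) at the assembly level, not
// a literal dump from the pass:
//   st2 {v0.4s, v1.4s}, [x0]
// becomes
//   zip1 v2.4s, v0.4s, v1.4s   // low-half element interleave of v0 and v1
//   zip2 v3.4s, v0.4s, v1.4s   // high-half element interleave of v0 and v1
//   stp  q2, q3, [x0]          // store the interleaved halves contiguously
// The STP writes the same byte pattern the ST2 would have produced.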
bool AArch64SIMDInstrOptImpl::shouldReplaceInst(
    MachineFunction *MF, const MCInstrDesc *InstDesc,
    SmallVectorImpl<const MCInstrDesc *> &InstDescRepl) {
  // Check if the replacement decision is already cached; if so, return it.
  std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
  auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
  auto It = SIMDInstrTable.find(InstID);
  if (It != SIMDInstrTable.end())
    return It->second;

  unsigned SCIdx = InstDesc->getSchedClass();
  const MCSchedClassDesc *SCDesc =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);

  // If the target does not define scheduling resources for the instructions
  // of interest, return false for no replacement.
  if (!SCDesc->isValid() || SCDesc->isVariant()) {
    SIMDInstrTable[InstID] = false;
    return false;
  }
  for (const auto *IDesc : InstDescRepl) {
    const MCSchedClassDesc *SCDescRepl =
        SchedModel.getMCSchedModel()->getSchedClassDesc(
            IDesc->getSchedClass());
    if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) {
      SIMDInstrTable[InstID] = false;
      return false;
    }
  }

  // Replace the instruction iff the summed latency of the replacement
  // sequence is lower than the latency of the original instruction.
  unsigned ReplCost = 0;
  for (const auto *IDesc : InstDescRepl)
    ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());

  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) {
    SIMDInstrTable[InstID] = true;
    return true;
  }
  SIMDInstrTable[InstID] = false;
  return false;
}
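// Worked example with hypothetical latencies (for illustration only): if the
// scheduling model gives FMLAv4i32_indexed a latency of 10 cycles while
// DUPv4i32lane costs 3 and FMLAv4f32 costs 6, then ReplCost = 3 + 6 = 9 < 10
// and the rewrite is considered profitable for that subtarget.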
bool AArch64SIMDInstrOptImpl::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  const MCInstrDesc *OriginalMCID;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  // For the VectorElem subpass, check by comparing the latency of a
  // representative instruction to that of its replacement instructions.
  // TODO: check for all concerned instructions.
  if (SP == VectorElem) {
    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
    ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
    ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
      return false;
    return true;
  }

  // For the Interleave subpass, check all concerned instructions, caching the
  // decision per subtarget.
  std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
  auto It = InterlEarlyExit.find(Subtarget);
  if (It != InterlEarlyExit.end())
    return It->second;

  for (const auto &I : IRT) {
    OriginalMCID = &TII->get(I.OrigOpc);
    for (unsigned J = 0; J < I.NumRepl; ++J)
      ReplInstrMCID.push_back(&TII->get(I.ReplOpc[J]));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
      InterlEarlyExit[Subtarget] = false;
      return false;
    }
    ReplInstrMCID.clear();
  }
  InterlEarlyExit[Subtarget] = true;
  return true;
}
// Look backwards through the basic block for an equivalent DUP that can be
// reused instead of creating a new one.
bool AArch64SIMDInstrOptImpl::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                       unsigned SrcReg, unsigned LaneNumber,
                                       unsigned *DestReg) const {
  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
       MII != MIE;) {
    --MII;
    MachineInstr *CurrentMI = &*MII;

    if (CurrentMI->getOpcode() == DupOpcode &&
        CurrentMI->getNumOperands() == 3 &&
        CurrentMI->getOperand(1).getReg() == SrcReg &&
        CurrentMI->getOperand(2).getImm() == LaneNumber) {
      *DestReg = CurrentMI->getOperand(0).getReg();
      return true;
    }
  }

  return false;
}
bool AArch64SIMDInstrOptImpl::optimizeVectElement(MachineInstr &MI) {
  const MCInstrDesc *MulMCID, *DupMCID;
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }
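  // In assembly terms (an illustrative sketch): the indexed form
  //   fmla v0.4s, v1.4s, v2.s[1]
  // becomes
  //   dup  v3.4s, v2.s[1]
  //   fmla v0.4s, v1.4s, v3.4s
  // which can be faster on subtargets where the by-element multiply
  // accumulate is slower than a DUP plus the vector-by-vector form.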
  SmallVector<const MCInstrDesc *, 2> ReplInstrMCID;
  ReplInstrMCID.push_back(DupMCID);
  ReplInstrMCID.push_back(MulMCID);
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  MachineBasicBlock &MBB = *MI.getParent();

  // Get the operands of the current SIMD arithmetic instruction.
  Register MulDest = MI.getOperand(0).getReg();
  Register SrcReg0 = MI.getOperand(1).getReg();
  RegState Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  Register SrcReg1 = MI.getOperand(2).getReg();
  RegState Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;

  // Instructions of interest have either 4 or 5 operands.
  if (MI.getNumOperands() == 5) {
    Register SrcReg2 = MI.getOperand(3).getReg();
    RegState Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();
    // Create a new DUP instruction, unless an equivalent one earlier in the
    // block can be reused.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI->createVirtualRegister(RC);
      BuildMI(MBB, MI, MI.getDebugLoc(), *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, MI.getDebugLoc(), *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI->createVirtualRegister(RC);
      BuildMI(MBB, MI, MI.getDebugLoc(), *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, MI.getDebugLoc(), *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  } else {
    return false;
  }

  ++NumModifiedInstr;
  return true;
}
bool AArch64SIMDInstrOptImpl::optimizeLdStInterleave(MachineInstr &MI) {
  unsigned SeqReg, AddrReg;
  unsigned StReg[4];
  RegState StRegKill[4];
  MachineInstr *DefiningMI;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;
  SmallVector<unsigned, MaxNumRepl> ZipDest;
  MachineBasicBlock &MBB = *MI.getParent();

  // If the current instruction matches one of the rewriting rules, gather
  // information about the parameters of the new instructions.
  bool Match = false;
  for (const auto &I : IRT) {
    if (MI.getOpcode() == I.OrigOpc) {
      SeqReg = MI.getOperand(0).getReg();
      AddrReg = MI.getOperand(1).getReg();
      DefiningMI = MRI->getUniqueVRegDef(SeqReg);
      unsigned NumReg = determineSrcReg(MI);
      if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
        return false;

      // Collect the replacement opcodes; the stores write to memory, so only
      // the ZIP instructions need fresh destination registers.
      for (unsigned J = 0; J < I.NumRepl; ++J) {
        unsigned Repl = I.ReplOpc[J];
        ReplInstrMCID.push_back(&TII->get(Repl));
        if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
          ZipDest.push_back(MRI->createVirtualRegister(I.RC));
      }
      Match = true;
      break;
    }
  }

  if (!Match)
    return false;
  // Determine if it is profitable to replace MI by the series of instructions
  // represented in ReplInstrMCID.
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  // Generate the replacement sequence of ZIP and STP instructions.
  DebugLoc DL = MI.getDebugLoc();
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    // ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    // STP instruction
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
        .addReg(ZipDest[0])
        .addReg(ZipDest[1])
        .addReg(AddrReg)
        .addImm(0);
    break;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    // ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    // STP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
        .addReg(ZipDest[4])
        .addReg(ZipDest[5])
        .addReg(AddrReg)
        .addImm(0);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
        .addReg(ZipDest[6])
        .addReg(ZipDest[7])
        .addReg(AddrReg)
        .addImm(2);
    break;
  }

  ++NumModifiedInstr;
  return true;
}
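// Sketch of the ST4 rewrite above: for sources A, B, C, D, the first four
// ZIPs interleave A with C and B with D; the next four ZIPs interleave those
// partial results, yielding the full A[i], B[i], C[i], D[i] element ordering,
// which the two STPs then store as four consecutive registers.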
// Process the REG_SEQUENCE that defines the source register of the store,
// collecting the NumArg source registers and their kill states.
bool AArch64SIMDInstrOptImpl::processSeqRegInst(MachineInstr *DefiningMI,
                                                unsigned *StReg,
                                                RegState *StRegKill,
                                                unsigned NumArg) const {
  assert(DefiningMI != nullptr);
  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
    return false;

  for (unsigned I = 0; I < NumArg; ++I) {
    StReg[I] = DefiningMI->getOperand(2 * I + 1).getReg();
    StRegKill[I] = getKillRegState(DefiningMI->getOperand(2 * I + 1).isKill());

    // The even operands must be immediate subregister indices.
    if (!DefiningMI->getOperand(2 * I + 2).isImm())
      return false;
  }
  return true;
}
// Return the number of source registers of the interleaving store: 2 for ST2
// and 4 for ST4.
unsigned AArch64SIMDInstrOptImpl::determineSrcReg(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unsupported instruction for this pass");

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    return 2;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    return 4;
  }
}
bool AArch64SIMDInstrOptImpl::run(MachineFunction &MF) {
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  TII = ST.getInstrInfo();
  MRI = &MF.getRegInfo();
  SchedModel.init(&ST);
  if (!SchedModel.hasInstrSchedModel())
    return false;

  bool Changed = false;
  for (auto OptimizationKind : {VectorElem, Interleave}) {
    if (!shouldExitEarly(&MF, OptimizationKind)) {
      SmallVector<MachineInstr *, 8> RemoveMIs;
      for (MachineBasicBlock &MBB : MF) {
        for (MachineInstr &MI : MBB) {
          bool InstRewrite;
          if (OptimizationKind == VectorElem)
            InstRewrite = optimizeVectElement(MI);
          else
            InstRewrite = optimizeLdStInterleave(MI);
          if (InstRewrite) {
            // Add MI to the list of instructions to be removed given that it
            // has been replaced.
            RemoveMIs.push_back(&MI);
            Changed = true;
          }
        }
      }
      for (MachineInstr *MI : RemoveMIs)
        MI->eraseFromParent();
    }
  }

  return Changed;
}
bool AArch64SIMDInstrOptLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return AArch64SIMDInstrOptImpl(SIMDInstrTable, InterlEarlyExit).run(MF);
}

PreservedAnalyses
AArch64SIMDInstrOptPass::run(MachineFunction &MF,
                             MachineFunctionAnalysisManager &MFAM) {
  bool Changed =
      AArch64SIMDInstrOptImpl(SIMDInstrTable, InterlEarlyExit).run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

/// Returns an instance of the high cost ASIMD instruction replacement
/// optimization pass.
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  return new AArch64SIMDInstrOptLegacy();
}