//===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains the ARM-specific DAG scheduling mutations, which
/// change inter-instruction latencies.
//
//===----------------------------------------------------------------------===//

#include "ARMLatencyMutations.h"
#include "ARMSubtarget.h"
#include "Thumb2InstrInfo.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include <algorithm>
#include <array>
#include <initializer_list>
#include <memory>

namespace llvm {

namespace {

// Precompute information about opcodes to speed up the pass.
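//
// InstructionInformation is a table, indexed by opcode, of the properties
// these mutations query on the fast path: address-generation shape,
// multiply/divide/shift classification, and which floating-point register
// widths an opcode produces or consumes.
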
class InstructionInformation {
protected:
  struct IInfo {
    bool HasBRegAddr : 1;      // B-side of addr gen is a register
    bool HasBRegAddrShift : 1; // B-side of addr gen has a shift
    bool IsDivide : 1;         // Some form of integer divide
    bool IsInlineShiftALU : 1; // Inline shift+ALU
    bool IsMultiply : 1;       // Some form of integer multiply
    bool IsMVEIntMAC : 1;      // MVE 8/16/32-bit integer MAC operation
    bool IsNonSubwordLoad : 1; // Load which is a word or larger
    bool IsShift : 1;          // Shift operation
    bool IsRev : 1;            // REV operation
    bool ProducesQP : 1;       // Produces a vector register result
    bool ProducesDP : 1;       // Produces a double-precision register result
    bool ProducesSP : 1;       // Produces a single-precision register result
    bool ConsumesQP : 1;       // Consumes a vector register operand
    bool ConsumesDP : 1;       // Consumes a double-precision register operand
    bool ConsumesSP : 1;       // Consumes a single-precision register operand
    unsigned MVEIntMACMatched; // Matched operand type (for MVE)
    unsigned AddressOpMask;    // Mask indicating which operands go into the AGU
    IInfo()
        : HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),
          IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),
          IsNonSubwordLoad(false), IsShift(false), IsRev(false),
          ProducesQP(false), ProducesDP(false), ProducesSP(false),
          ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),
          MVEIntMACMatched(0), AddressOpMask(0) {}
  };
  typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;
  IInfoArray Info;

public:
  // Always available information
  unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }
  bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }
  bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }
  bool isDivide(unsigned Op) { return Info[Op].IsDivide; }
  bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }
  bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }
  bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }
  bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }
  bool isRev(unsigned Op) { return Info[Op].IsRev; }
  bool isShift(unsigned Op) { return Info[Op].IsShift; }

  // Information available only if markDPProducersConsumers has been called.
  bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }
  bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }
  bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }
  bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }
  bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }
  bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }

  bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {
    return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;
  }

  InstructionInformation(const ARMBaseInstrInfo *TII);

protected:
  void markDPProducersConsumers(const ARMBaseInstrInfo *TII);
};

InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) {
  using namespace ARM;

  std::initializer_list<unsigned> hasBRegAddrList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
      tLDRr,  tLDRBr,  tLDRHr,  tSTRr,  tSTRBr,  tSTRHr,
  };
  for (auto op : hasBRegAddrList) {
    Info[op].HasBRegAddr = true;
  }

  std::initializer_list<unsigned> hasBRegAddrShiftList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
  };
  for (auto op : hasBRegAddrShiftList) {
    Info[op].HasBRegAddrShift = true;
  }

  Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;

  std::initializer_list<unsigned> isInlineShiftALUList = {
      t2ADCrs,  t2ADDSrs, t2ADDrs,  t2BICrs, t2EORrs,
      t2ORNrs,  t2RSBSrs, t2RSBrs,  t2SBCrs, t2SUBrs,
      t2SUBSrs, t2CMPrs,  t2CMNzrs, t2TEQrs, t2TSTrs,
  };
  for (auto op : isInlineShiftALUList) {
    Info[op].IsInlineShiftALU = true;
  }

  std::initializer_list<unsigned> isMultiplyList = {
      t2MUL,    t2MLA,     t2MLS,     t2SMLABB, t2SMLABT,  t2SMLAD,   t2SMLADX,
      t2SMLAL,  t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT,
      t2SMLATB, t2SMLATT,  t2SMLAWT,  t2SMLSD,  t2SMLSDX,  t2SMLSLD,  t2SMLSLDX,
      t2SMMLA,  t2SMMLAR,  t2SMMLS,   t2SMMLSR, t2SMMUL,   t2SMMULR,  t2SMUAD,
      t2SMUADX, t2SMULBB,  t2SMULBT,  t2SMULL,  t2SMULTB,  t2SMULTT,  t2SMULWT,
      t2SMUSD,  t2SMUSDX,  t2UMAAL,   t2UMLAL,  t2UMULL,   tMUL,
  };
  for (auto op : isMultiplyList) {
    Info[op].IsMultiply = true;
  }

  std::initializer_list<unsigned> isMVEIntMACList = {
      MVE_VMLAS_qr_i16,    MVE_VMLAS_qr_i32,    MVE_VMLAS_qr_i8,
      MVE_VMLA_qr_i16,     MVE_VMLA_qr_i32,     MVE_VMLA_qr_i8,
      MVE_VQDMLAH_qrs16,   MVE_VQDMLAH_qrs32,   MVE_VQDMLAH_qrs8,
      MVE_VQDMLASH_qrs16,  MVE_VQDMLASH_qrs32,  MVE_VQDMLASH_qrs8,
      MVE_VQRDMLAH_qrs16,  MVE_VQRDMLAH_qrs32,  MVE_VQRDMLAH_qrs8,
      MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,
      MVE_VQDMLADHXs16,    MVE_VQDMLADHXs32,    MVE_VQDMLADHXs8,
      MVE_VQDMLADHs16,     MVE_VQDMLADHs32,     MVE_VQDMLADHs8,
      MVE_VQDMLSDHXs16,    MVE_VQDMLSDHXs32,    MVE_VQDMLSDHXs8,
      MVE_VQDMLSDHs16,     MVE_VQDMLSDHs32,     MVE_VQDMLSDHs8,
      MVE_VQRDMLADHXs16,   MVE_VQRDMLADHXs32,   MVE_VQRDMLADHXs8,
      MVE_VQRDMLADHs16,    MVE_VQRDMLADHs32,    MVE_VQRDMLADHs8,
      MVE_VQRDMLSDHXs16,   MVE_VQRDMLSDHXs32,   MVE_VQRDMLSDHXs8,
      MVE_VQRDMLSDHs16,    MVE_VQRDMLSDHs32,    MVE_VQRDMLSDHs8,
  };
  for (auto op : isMVEIntMACList) {
    Info[op].IsMVEIntMAC = true;
  }

  std::initializer_list<unsigned> isNonSubwordLoadList = {
      t2LDRi12, t2LDRi8,  t2LDR_POST,  t2LDR_PRE,  t2LDRpci,
      t2LDRs,   t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi,
      tLDRpci,  tLDRr,    tLDRspi,
  };
  for (auto op : isNonSubwordLoadList) {
    Info[op].IsNonSubwordLoad = true;
  }

  std::initializer_list<unsigned> isRevList = {
      t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,
  };
  for (auto op : isRevList) {
    Info[op].IsRev = true;
  }

  std::initializer_list<unsigned> isShiftList = {
      t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,
      tASRri,  tASRrr,  tLSLSri, tLSLri,  tLSLrr,  tLSRri,  tLSRrr,  tROR,
  };
  for (auto op : isShiftList) {
    Info[op].IsShift = true;
  }

  std::initializer_list<unsigned> Address1List = {
      t2LDRBi12,  t2LDRBi8,  t2LDRBpci,  t2LDRBs,   t2LDRHi12,  t2LDRHi8,
      t2LDRHpci,  t2LDRHs,   t2LDRSBi12, t2LDRSBi8, t2LDRSBpci, t2LDRSBs,
      t2LDRSHi12, t2LDRSHi8, t2LDRSHpci, t2LDRSHs,  t2LDRi12,   t2LDRi8,
      t2LDRpci,   t2LDRs,    tLDRBi,     tLDRBr,    tLDRHi,     tLDRHr,
      tLDRSB,     tLDRSH,    tLDRi,      tLDRpci,   tLDRr,      tLDRspi,
      t2STRBi12,  t2STRBi8,  t2STRBs,    t2STRHi12, t2STRHi8,   t2STRHs,
      t2STRi12,   t2STRi8,   t2STRs,     tSTRBi,    tSTRBr,     tSTRHi,
      tSTRHr,     tSTRi,     tSTRr,      tSTRspi,   VLDRD,      VLDRH,
      VLDRS,      VSTRD,     VSTRH,      VSTRS,
      MVE_VLD20_16,      MVE_VLD20_32,      MVE_VLD20_8,
      MVE_VLD21_16,      MVE_VLD21_32,      MVE_VLD21_8,
      MVE_VLD40_16,      MVE_VLD40_32,      MVE_VLD40_8,
      MVE_VLD41_16,      MVE_VLD41_32,      MVE_VLD41_8,
      MVE_VLD42_16,      MVE_VLD42_32,      MVE_VLD42_8,
      MVE_VLD43_16,      MVE_VLD43_32,      MVE_VLD43_8,
      MVE_VLDRBS16,      MVE_VLDRBS16_rq,   MVE_VLDRBS32,
      MVE_VLDRBS32_rq,   MVE_VLDRBU16,      MVE_VLDRBU16_rq,
      MVE_VLDRBU32,      MVE_VLDRBU32_rq,   MVE_VLDRBU8,
      MVE_VLDRBU8_rq,    MVE_VLDRDU64_qi,   MVE_VLDRDU64_rq,
      MVE_VLDRDU64_rq_u, MVE_VLDRHS32,      MVE_VLDRHS32_rq,
      MVE_VLDRHS32_rq_u, MVE_VLDRHU16,      MVE_VLDRHU16_rq,
      MVE_VLDRHU16_rq_u, MVE_VLDRHU32,      MVE_VLDRHU32_rq,
      MVE_VLDRHU32_rq_u, MVE_VLDRWU32,      MVE_VLDRWU32_qi,
      MVE_VLDRWU32_rq,   MVE_VLDRWU32_rq_u,
      MVE_VST20_16,      MVE_VST20_32,      MVE_VST20_8,
      MVE_VST21_16,      MVE_VST21_32,      MVE_VST21_8,
      MVE_VST40_16,      MVE_VST40_32,      MVE_VST40_8,
      MVE_VST41_16,      MVE_VST41_32,      MVE_VST41_8,
      MVE_VST42_16,      MVE_VST42_32,      MVE_VST42_8,
      MVE_VST43_16,      MVE_VST43_32,      MVE_VST43_8,
      MVE_VSTRB16,       MVE_VSTRB16_rq,    MVE_VSTRB32,
      MVE_VSTRB32_rq,    MVE_VSTRBU8,       MVE_VSTRB8_rq,
      MVE_VSTRD64_qi,    MVE_VSTRD64_rq,    MVE_VSTRD64_rq_u,
      MVE_VSTRH32,       MVE_VSTRH32_rq,    MVE_VSTRH32_rq_u,
      MVE_VSTRHU16,      MVE_VSTRH16_rq,    MVE_VSTRH16_rq_u,
      MVE_VSTRWU32,      MVE_VSTRW32_qi,    MVE_VSTRW32_rq,
      MVE_VSTRW32_rq_u,
  };
  std::initializer_list<unsigned> Address2List = {
      t2LDRB_POST,  t2LDRB_PRE,   t2LDRDi8,     t2LDRH_POST, t2LDRH_PRE,
      t2LDRSB_POST, t2LDRSB_PRE,  t2LDRSH_POST, t2LDRSH_PRE, t2LDR_POST,
      t2LDR_PRE,    t2STRB_POST,  t2STRB_PRE,   t2STRDi8,    t2STRH_POST,
      t2STRH_PRE,   t2STR_POST,   t2STR_PRE,
      MVE_VLD20_16_wb,    MVE_VLD20_32_wb,   MVE_VLD20_8_wb,
      MVE_VLD21_16_wb,    MVE_VLD21_32_wb,   MVE_VLD21_8_wb,
      MVE_VLD40_16_wb,    MVE_VLD40_32_wb,   MVE_VLD40_8_wb,
      MVE_VLD41_16_wb,    MVE_VLD41_32_wb,   MVE_VLD41_8_wb,
      MVE_VLD42_16_wb,    MVE_VLD42_32_wb,   MVE_VLD42_8_wb,
      MVE_VLD43_16_wb,    MVE_VLD43_32_wb,   MVE_VLD43_8_wb,
      MVE_VLDRBS16_post,  MVE_VLDRBS16_pre,  MVE_VLDRBS32_post,
      MVE_VLDRBS32_pre,   MVE_VLDRBU16_post, MVE_VLDRBU16_pre,
      MVE_VLDRBU32_post,  MVE_VLDRBU32_pre,  MVE_VLDRBU8_post,
      MVE_VLDRBU8_pre,    MVE_VLDRDU64_qi_pre,
      MVE_VLDRHS32_post,  MVE_VLDRHS32_pre,  MVE_VLDRHU16_post,
      MVE_VLDRHU16_pre,   MVE_VLDRHU32_post, MVE_VLDRHU32_pre,
      MVE_VLDRWU32_post,  MVE_VLDRWU32_pre,  MVE_VLDRWU32_qi_pre,
      MVE_VST20_16_wb,    MVE_VST20_32_wb,   MVE_VST20_8_wb,
      MVE_VST21_16_wb,    MVE_VST21_32_wb,   MVE_VST21_8_wb,
      MVE_VST40_16_wb,    MVE_VST40_32_wb,   MVE_VST40_8_wb,
      MVE_VST41_16_wb,    MVE_VST41_32_wb,   MVE_VST41_8_wb,
      MVE_VST42_16_wb,    MVE_VST42_32_wb,   MVE_VST42_8_wb,
      MVE_VST43_16_wb,    MVE_VST43_32_wb,   MVE_VST43_8_wb,
      MVE_VSTRB16_post,   MVE_VSTRB16_pre,   MVE_VSTRB32_post,
      MVE_VSTRB32_pre,    MVE_VSTRBU8_post,  MVE_VSTRBU8_pre,
      MVE_VSTRD64_qi_pre, MVE_VSTRH32_post,  MVE_VSTRH32_pre,
      MVE_VSTRHU16_post,  MVE_VSTRHU16_pre,  MVE_VSTRWU32_post,
      MVE_VSTRWU32_pre,   MVE_VSTRW32_qi_pre,
  };
  std::initializer_list<unsigned> Address3List = {
      t2LDRD_POST, t2LDRD_PRE, t2STRD_POST, t2STRD_PRE,
  };
  // Compute a mask of which operands are involved in address computation.
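  // Bit N of the mask marks MachineInstr operand N as feeding the AGU:
  // 0x6 covers operands 1-2, 0xc operands 2-3, 0x18 operands 3-4, and the
  // extra 0x8 flags the shift operand of register-offset forms.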
  for (auto &op : Address1List) {
    Info[op].AddressOpMask = 0x6;
  }
  for (auto &op : Address2List) {
    Info[op].AddressOpMask = 0xc;
  }
  for (auto &op : Address3List) {
    Info[op].AddressOpMask = 0x18;
  }
  for (auto &op : hasBRegAddrShiftList) {
    Info[op].AddressOpMask |= 0x8;
  }
}

void InstructionInformation::markDPProducersConsumers(
    const ARMBaseInstrInfo *TII) {
  // Learn about all instructions which have FP source/dest registers
  for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {
    const MCInstrDesc &MID = TII->get(MI);
    auto Operands = MID.operands();
    for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {
      bool MarkQP = false, MarkDP = false, MarkSP = false;
      switch (Operands[OI].RegClass) {
      case ARM::MQPRRegClassID:
      case ARM::DPRRegClassID:
      case ARM::DPR_8RegClassID:
      case ARM::DPR_VFP2RegClassID:
      case ARM::DPairRegClassID:
      case ARM::DPairSpcRegClassID:
      case ARM::DQuadRegClassID:
      case ARM::DQuadSpcRegClassID:
      case ARM::DTripleRegClassID:
      case ARM::DTripleSpcRegClassID:
        MarkDP = true;
        break;
      case ARM::QPRRegClassID:
      case ARM::QPR_8RegClassID:
      case ARM::QPR_VFP2RegClassID:
      case ARM::QQPRRegClassID:
      case ARM::QQQQPRRegClassID:
        MarkQP = true;
        break;
      case ARM::SPRRegClassID:
      case ARM::SPR_8RegClassID:
      case ARM::FPWithVPRRegClassID:
        MarkSP = true;
        break;
      default:
        break;
      }
      if (MarkQP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesQP = true;
        else
          Info[MI].ConsumesQP = true;
      }
      if (MarkDP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesDP = true;
        else
          Info[MI].ConsumesDP = true;
      }
      if (MarkSP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesSP = true;
        else
          Info[MI].ConsumesSP = true;
      }
    }
  }
}

} // anonymous namespace

static bool hasImplicitCPSRUse(const MachineInstr *MI) {
  return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);
}
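
// Set the latency of a dependence in both directions: on the successor edge
// of the source SUnit and on the matching predecessor edge of the
// destination, invalidating the cached height/depth so they are recomputed.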
static void setBidirLatencies(SUnit &SrcSU, SDep &SrcDep,
                              unsigned latency) {
  SDep Reverse = SrcDep;
  Reverse.setSUnit(&SrcSU);
  for (SDep &PDep : SrcDep.getSUnit()->Preds) {
    if (PDep == Reverse) {
      PDep.setLatency(latency);
      SrcDep.getSUnit()->setDepthDirty();
      break;
    }
  }
  SrcDep.setLatency(latency);
  SrcSU.setHeightDirty();
}
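
// ARMCC condition codes come in complementary pairs that differ only in the
// low bit (EQ/NE, MI/PL, ...), so comparing the codes with the low bit masked
// off detects whether two predicates test genuinely different conditions.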
static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) {
  return (a & 0xe) != (b & 0xe);
}

// Set output dependences to zero latency for processors which can
// simultaneously issue to the same register. Returns true if a change
// was made.
static bool zeroOutputDependences(SUnit &ISU, SDep &Dep) {
  if (Dep.getKind() == SDep::Output) {
    setBidirLatencies(ISU, Dep, 0);
    return true;
  }
  return false;
}

// The graph doesn't look inside of bundles to determine their
// scheduling boundaries and reports zero latency into and out of them
// (except for CPSR into the bundle, which has latency 1).
// Make some better scheduling assumptions:
// 1) CPSR uses have zero latency; other uses have incoming latency 1.
// 2) CPSR defs retain a latency of zero; others have a latency of 1.
//
// Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise.
unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) {

  SUnit &DepSU = *Dep.getSUnit();
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  const MachineInstr *DstMI = DepSU.getInstr();
  unsigned DstOpcode = DstMI->getOpcode();

  if (DstOpcode == ARM::BUNDLE && TII->isPredicated(*DstMI)) {
    setBidirLatencies(
        ISU, Dep,
        (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 0 : 1);
    return 1;
  }
  if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(*SrcMI) &&
      Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) {
    setBidirLatencies(ISU, Dep, 1);
    return 2;
  }
  return 0;
}

// Determine whether there is a memory RAW hazard here and set up latency
// accordingly.
bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep,
                                          unsigned latency) {
  if (!Dep.isNormalMemory())
    return false;
  auto &SrcInst = *ISU.getInstr();
  auto &DstInst = *Dep.getSUnit()->getInstr();
  if (!SrcInst.mayStore() || !DstInst.mayLoad())
    return false;

  auto SrcMO = *SrcInst.memoperands().begin();
  auto DstMO = *DstInst.memoperands().begin();
  auto SrcVal = SrcMO->getValue();
  auto DstVal = DstMO->getValue();
  auto SrcPseudoVal = SrcMO->getPseudoValue();
  auto DstPseudoVal = DstMO->getPseudoValue();
  if (SrcVal && DstVal && AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&
      SrcMO->getOffset() == DstMO->getOffset()) {
    setBidirLatencies(ISU, Dep, latency);
    return true;
  } else if (SrcPseudoVal && DstPseudoVal &&
             SrcPseudoVal->kind() == DstPseudoVal->kind() &&
             SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {
    // Spills/fills
    auto FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);
    auto FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);
    if (FS0 == FS1) {
      setBidirLatencies(ISU, Dep, latency);
      return true;
    }
  }
  return false;
}

namespace {

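// Shared per-opcode property table, built lazily by the first subtarget
// override that needs it and reused for subsequent scheduling regions.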
std::unique_ptr<InstructionInformation> II;

class CortexM7InstructionInformation : public InstructionInformation {
public:
  CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)
      : InstructionInformation(TII) {}
};

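// Latency overrides for the Cortex-M7 pipeline; the per-edge adjustments
// are implemented in modifyBypasses below.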
class CortexM7Overrides : public ARMOverrideBypasses {
public:
  CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
      : ARMOverrideBypasses(TII, AA) {
    if (!II)
      II.reset(new CortexM7InstructionInformation(TII));
  }

  void modifyBypasses(SUnit &) override;
};

void CortexM7Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

  // Walk the successors looking for latency overrides that are needed
  for (SDep &Dep : ISU.Succs) {

    // Output dependences should have 0 latency, as M7 is able to
    // schedule writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    if (memoryRAWHazard(ISU, Dep, 4))
      continue;

    // Ignore dependencies other than data.
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &DepSU = *Dep.getSUnit();
    if (DepSU.isBoundaryNode())
      continue;

    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *DstMI = DepSU.getInstr();
    unsigned DstOpcode = DstMI->getOpcode();

    // Word loads into any multiply or divide instruction cannot bypass
    // their scheduling stage. This isn't done in the .td file because we
    // cannot easily create a read advance that is 0 from certain writer
    // classes and 1 from all the rest.
    // (The other way around would have been easy.)
    if (isNSWload && (II->isMultiply(DstOpcode) || II->isDivide(DstOpcode)))
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    // Word loads into the B operand of a load/store cannot bypass their
    // scheduling stage. This cannot be done in the .td file because we need
    // to decide between -1 and -2 for ReadAdvance.
    if (isNSWload && II->hasBRegAddr(DstOpcode) &&
        DstMI->getOperand(2).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    // Multiplies into any address generation cannot bypass from EX3. This
    // cannot be done in the .td file because we need to decide between -1
    // and -2 for ReadAdvance.
    if (II->isMultiply(SrcOpcode)) {
      unsigned OpMask = II->getAddressOpMask(DstOpcode) >> 1;
      for (unsigned i = 1; OpMask; ++i, OpMask >>= 1) {
        if ((OpMask & 1) && DstMI->getOperand(i).isReg() &&
            DstMI->getOperand(i).getReg() == Dep.getReg()) {
          setBidirLatencies(ISU, Dep, 4); // first legal bypass is EX4->EX1
          break;
        }
      }
    }

    // Mismatched conditional producers take longer on M7; they end up looking
    // like they were produced at EX3 and read at IS.
    if (TII->isPredicated(*SrcMI) && Dep.isAssignedRegDep() &&
        (SrcOpcode == ARM::BUNDLE ||
         mismatchedPred(TII->getPredicate(*SrcMI),
                        TII->getPredicate(*DstMI)))) {
      unsigned Lat = 1;
      // Operand A of shift+ALU is treated as an EX1 read instead of EX2.
      if (II->isInlineShiftALU(DstOpcode) && DstMI->getOperand(3).getImm() &&
          DstMI->getOperand(1).getReg() == Dep.getReg())
        Lat = 2;
      Lat = std::min(3u, Dep.getLatency() + Lat);
      setBidirLatencies(ISU, Dep, std::max(Dep.getLatency(), Lat));
    }

    // A CC setter into a conditional producer shouldn't have a latency of
    // more than 1 unless it's due to an implicit read. (All the "true"
    // readers of the condition code use an implicit read, and predicates
    // use an explicit one.)
    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
      setBidirLatencies(ISU, Dep, 1);

    // REV instructions cannot bypass directly into the EX1 shifter. The
    // code is slightly inexact as it doesn't attempt to ensure that the
    // bypass is to the shifter operands.
    if (II->isRev(SrcOpcode)) {
      if (II->isInlineShiftALU(DstOpcode))
        setBidirLatencies(ISU, Dep, 2);
      else if (II->isShift(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
    }
  }
}

class M85InstructionInformation : public InstructionInformation {
public:
  M85InstructionInformation(const ARMBaseInstrInfo *t)
      : InstructionInformation(t) {
    markDPProducersConsumers(t);
  }
};

class M85Overrides : public ARMOverrideBypasses {
public:
  M85Overrides(const ARMBaseInstrInfo *t, AAResults *a)
      : ARMOverrideBypasses(t, a) {
    if (!II)
      II.reset(new M85InstructionInformation(t));
  }

  void modifyBypasses(SUnit &) override;

private:
  unsigned computeBypassStage(const MCSchedClassDesc *SCD);
  signed modifyMixedWidthFP(const MachineInstr *SrcMI,
                            const MachineInstr *DstMI, unsigned RegID,
                            const MCSchedClassDesc *SCD);
};
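
// Estimate the execution stage from which a producer's result is forwarded,
// derived from the scheduling model's write latency for its first def.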
unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) {
  auto SM = DAG->getSchedModel();
  unsigned DefIdx = 0; // just look for the first output's timing
  if (DefIdx < SCDesc->NumWriteLatencyEntries) {
    // Lookup the definition's write latency in SubtargetInfo.
    const MCWriteLatencyEntry *WLEntry =
        SM->getSubtargetInfo()->getWriteLatencyEntry(SCDesc, DefIdx);
    unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000;
    if (Latency == 4)
      return 2;
    else if (Latency == 5)
      return 3;
    else if (Latency > 3)
      return 3;
    else
      return Latency;
  }
  return 2;
}

// Latency changes for bypassing between FP registers of different sizes.
//
// Returns a signed adjustment to the dependence latency, or 0 when no
// mixed-width bypass applies. Note that mixed DP/SP accesses are unlikely
// because of the semantics of C; mixed MVE/SP accesses are quite common
// when MVE intrinsics are used.
signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,
                                        const MachineInstr *DstMI,
                                        unsigned RegID,
                                        const MCSchedClassDesc *SCD) {

  if (!II->producesSP(SrcMI->getOpcode()) &&
      !II->producesDP(SrcMI->getOpcode()) &&
      !II->producesQP(SrcMI->getOpcode()))
    return 0;

  if (Register::isVirtualRegister(RegID)) {
    if (II->producesSP(SrcMI->getOpcode()) &&
        II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 5 - computeBypassStage(SCD);
    } else if (II->producesSP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
          return 5 - computeBypassStage(SCD) -
                 ((OP.getSubReg() == ARM::ssub_2 ||
                   OP.getSubReg() == ARM::ssub_3)
                      ? 1
                      : 0);
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return -1;
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 5 - computeBypassStage(SCD);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
          return 5 - computeBypassStage(SCD) +
                 ((OP.getSubReg() == ARM::ssub_2 ||
                   OP.getSubReg() == ARM::ssub_3)
                      ? 1
                      : 0);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 1;
    }
  } else if (Register::isPhysicalRegister(RegID)) {
    // Note that when the producer is narrower, not all of the producers
    // may be present in the scheduling graph; somewhere earlier in the
    // compiler, an implicit def/use of the aliased full register gets
    // added to the producer, and so only that producer is seen as *the*
    // single producer. This behavior also has the unfortunate effect of
    // serializing the producers in the compiler's view of things.
    if (II->producesSP(SrcMI->getOpcode()) &&
        II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
            OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
             (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
          return 5 - computeBypassStage(SCD);
    } else if (II->producesSP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
            OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
             (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
          return 5 - computeBypassStage(SCD) -
                 (((OP.getReg() - ARM::S0) / 2) % 2 ? 1 : 0);
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 &&
            OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID))
          return -1;
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
        return 5 - computeBypassStage(SCD);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
        return 5 - computeBypassStage(SCD) +
               (((RegID - ARM::S0) / 2) % 2 ? 1 : 0);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesDP(DstMI->getOpcode())) {
      if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2)
        return 1;
    }
  }
  return 0;
}
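
// Walk ISU's successors and apply the M85-specific bypass and hazard
// latency overrides; this parallels the Cortex-M7 version above.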
void M85Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

  // Walk the successors looking for latency overrides that are needed
  for (SDep &Dep : ISU.Succs) {

    // Output dependences should have 0 latency, as the Cortex-M85 is able
    // to schedule writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    if (memoryRAWHazard(ISU, Dep, 3))
      continue;

    // Ignore dependencies other than data.
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &DepSU = *Dep.getSUnit();
    if (DepSU.isBoundaryNode())
      continue;

    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *DstMI = DepSU.getInstr();
    unsigned DstOpcode = DstMI->getOpcode();

    // Word loads into the B operand of a load/store with a shifted offset
    // cannot bypass their scheduling stage. This cannot be done in the .td
    // file because we need to decide between -1 and -2 for ReadAdvance.
    if (isNSWload && II->hasBRegAddrShift(DstOpcode) &&
        DstMI->getOperand(3).getImm() != 0 && // shift operand
        DstMI->getOperand(2).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    if (isNSWload && isMVEVectorInstruction(DstMI)) {
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
    }

    // A matched MVE integer MAC feeding the accumulator operand of another
    // MAC of the same flavor gets one cycle less latency.
    if (II->isMVEIntMAC(DstOpcode) &&
        II->isMVEIntMACMatched(SrcOpcode, DstOpcode) &&
        DstMI->getOperand(0).isReg() &&
        DstMI->getOperand(0).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() - 1);

    // A CC setter into a conditional producer shouldn't have a latency of
    // more than 0 unless it's due to an implicit read.
    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
      setBidirLatencies(ISU, Dep, 0);

    if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, Dep.getReg(),
                                         DAG->getSchedClass(&ISU)))
      setBidirLatencies(ISU, Dep,
                        std::max(0, signed(Dep.getLatency()) + ALat));

    // REV instructions get an extra cycle into shift-type consumers
    // (cf. the Cortex-M7 override above).
    if (II->isRev(SrcOpcode)) {
      if (II->isInlineShiftALU(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
      else if (II->isShift(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
    }
  }
}

// Add M55-specific overrides for latencies between instructions. Currently it:
// - Adds an extra cycle of latency between MVE VMLAV and scalar instructions.
class CortexM55Overrides : public ARMOverrideBypasses {
public:
  CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
      : ARMOverrideBypasses(TII, AA) {}

  void modifyBypasses(SUnit &SU) override {
    MachineInstr *SrcMI = SU.getInstr();
    if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction))
      return;

    for (SDep &Dep : SU.Succs) {
      if (Dep.getKind() != SDep::Data)
        continue;
      SUnit &DepSU = *Dep.getSUnit();
      if (DepSU.isBoundaryNode())
        continue;
      MachineInstr *DstMI = DepSU.getInstr();

      if (!isMVEVectorInstruction(DstMI) && !DstMI->mayStore())
        setBidirLatencies(SU, Dep, 3);
    }
  }
};

} // end anonymous namespace

// Apply the subtarget-specific latency mutation to every SUnit in the DAG,
// including the region's ExitSU when it wraps a real instruction.
void ARMOverrideBypasses::apply(ScheduleDAGInstrs *DAGInstrs) {
  DAG = DAGInstrs;
  for (SUnit &ISU : DAGInstrs->SUnits) {
    if (ISU.isBoundaryNode())
      continue;
    modifyBypasses(ISU);
  }
  if (DAGInstrs->ExitSU.getInstr())
    modifyBypasses(DAGInstrs->ExitSU);
}

std::unique_ptr<ScheduleDAGMutation>
createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA) {
  if (ST.isCortexM85())
    return std::make_unique<M85Overrides>(ST.getInstrInfo(), AA);
  else if (ST.isCortexM7())
    return std::make_unique<CortexM7Overrides>(ST.getInstrInfo(), AA);
  else if (ST.isCortexM55())
    return std::make_unique<CortexM55Overrides>(ST.getInstrInfo(), AA);

  return nullptr;
}

} // end namespace llvm