LCOV - code coverage report
Current view: top level - lib/Target/AArch64 - AArch64VectorByElementOpt.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 153 155 98.7 %
Date: 2017-09-14 15:23:50 Functions: 12 12 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //=- AArch64VectorByElementOpt.cpp - AArch64 vector by element inst opt pass =//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // This file contains a pass that performs optimization for vector by element
      11             : // SIMD instructions.
      12             : //
      13             : // Certain SIMD instructions with vector element operand are not efficient.
      14             : // Rewrite them into SIMD instructions with vector operands. This rewrite
      15             : // is driven by the latency of the instructions.
      16             : //
      17             : // Example:
      18             : //    fmla v0.4s, v1.4s, v2.s[1]
      19             : //    is rewritten into
      20             : //    dup v3.4s, v2.s[1]
      21             : //    fmla v0.4s, v1.4s, v3.4s
      22             : //
      23             : //===----------------------------------------------------------------------===//
      24             : 
      25             : #include "AArch64InstrInfo.h"
      26             : #include "llvm/ADT/SmallVector.h"
      27             : #include "llvm/ADT/Statistic.h"
      28             : #include "llvm/ADT/StringRef.h"
      29             : #include "llvm/CodeGen/MachineBasicBlock.h"
      30             : #include "llvm/CodeGen/MachineFunction.h"
      31             : #include "llvm/CodeGen/MachineFunctionPass.h"
      32             : #include "llvm/CodeGen/MachineInstr.h"
      33             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      34             : #include "llvm/CodeGen/MachineOperand.h"
      35             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      36             : #include "llvm/CodeGen/TargetSchedule.h"
      37             : #include "llvm/MC/MCInstrDesc.h"
      38             : #include "llvm/MC/MCSchedule.h"
      39             : #include "llvm/Pass.h"
      40             : #include "llvm/Target/TargetInstrInfo.h"
      41             : #include "llvm/Target/TargetSubtargetInfo.h"
      42             : #include <map>
      43             : 
using namespace llvm;

// Debug type string used by LLVM's DEBUG()/STATISTIC machinery for this pass.
#define DEBUG_TYPE "aarch64-vectorbyelement-opt"

// Counter for -stats: number of indexed SIMD instructions rewritten by this
// pass into a DUP + full-vector instruction pair.
STATISTIC(NumModifiedInstr,
          "Number of vector by element instructions modified");

// Human-readable pass name, shared by getPassName() and INITIALIZE_PASS.
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 vector by element instruction optimization pass"
      53             : 
namespace {

/// Machine-function pass that rewrites certain vector-by-element SIMD
/// instructions (e.g. indexed FMLA) into a DUP of the lane followed by the
/// full-vector form of the arithmetic instruction, when the scheduling model
/// says the two-instruction sequence has lower latency.
struct AArch64VectorByElementOpt : public MachineFunctionPass {
  static char ID;

  // Set up in runOnMachineFunction() before any per-instruction work.
  const TargetInstrInfo *TII;
  MachineRegisterInfo *MRI;
  TargetSchedModel SchedModel;

  AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
    initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
  }

  /// Based only on latency of instructions, determine if it is cost efficient
  /// to replace the instruction InstDesc by the two instructions InstDescRep1
  /// and InstDescRep2.
  /// Return true if replacement is recommended.
  bool
  shouldReplaceInstruction(MachineFunction *MF, const MCInstrDesc *InstDesc,
                           const MCInstrDesc *InstDescRep1,
                           const MCInstrDesc *InstDescRep2,
                           std::map<unsigned, bool> &VecInstElemTable) const;

  /// Determine if we need to exit the vector by element instruction
  /// optimization pass early. This makes sure that Targets with no need
  /// for this optimization do not spend any compile time on this pass.
  /// This check is done by comparing the latency of an indexed FMLA
  /// instruction to the latency of the DUP + the latency of a vector
  /// FMLA instruction. We do not check on other related instructions such
  /// as FMLS as we assume that if the situation shows up for one
  /// instruction, then it is likely to show up for the related ones.
  /// Return true if early exit of the pass is recommended.
  bool earlyExitVectElement(MachineFunction *MF);

  /// Check whether an equivalent DUP instruction has already been
  /// created or not.
  /// Return true when the dup instruction already exists. In this case,
  /// DestReg will point to the destination of the already created DUP.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;

  /// Certain SIMD instructions with vector element operand are not efficient.
  /// Rewrite them into SIMD instructions with vector operands. This rewrite
  /// is driven by the latency of the instructions.
  /// Return true if the SIMD instruction is modified.
  bool optimizeVectElement(MachineInstr &MI,
                           std::map<unsigned, bool> *VecInstElemTable) const;

  bool runOnMachineFunction(MachineFunction &Fn) override;

  StringRef getPassName() const override {
    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
  }
};

char AArch64VectorByElementOpt::ID = 0;

} // end anonymous namespace
     112             : 
// Register the pass with LLVM's PassRegistry so it can be referenced by its
// command-line name and printed with the human-readable name above.
INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt",
                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
     115             : 
     116             : /// Based only on latency of instructions, determine if it is cost efficient
     117             : /// to replace the instruction InstDesc by the two instructions InstDescRep1
     118             : /// and InstDescRep2. Note that it is assumed in this fuction that an
     119             : /// instruction of type InstDesc is always replaced by the same two
     120             : /// instructions as results are cached here.
     121             : /// Return true if replacement is recommended.
     122        1059 : bool AArch64VectorByElementOpt::shouldReplaceInstruction(
     123             :     MachineFunction *MF, const MCInstrDesc *InstDesc,
     124             :     const MCInstrDesc *InstDescRep1, const MCInstrDesc *InstDescRep2,
     125             :     std::map<unsigned, bool> &VecInstElemTable) const {
     126             :   // Check if replacment decision is alredy available in the cached table.
     127             :   // if so, return it.
     128        1061 :   if (!VecInstElemTable.empty() &&
     129           6 :       VecInstElemTable.find(InstDesc->getOpcode()) != VecInstElemTable.end())
     130           0 :     return VecInstElemTable[InstDesc->getOpcode()];
     131             : 
     132        1059 :   unsigned SCIdx = InstDesc->getSchedClass();
     133        1059 :   unsigned SCIdxRep1 = InstDescRep1->getSchedClass();
     134        1059 :   unsigned SCIdxRep2 = InstDescRep2->getSchedClass();
     135             :   const MCSchedClassDesc *SCDesc =
     136        2118 :       SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
     137             :   const MCSchedClassDesc *SCDescRep1 =
     138        2118 :       SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep1);
     139             :   const MCSchedClassDesc *SCDescRep2 =
     140        2118 :       SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep2);
     141             : 
     142             :   // If a subtarget does not define resources for any of the instructions
     143             :   // of interest, then return false for no replacement.
     144        1059 :   if (!SCDesc->isValid() || SCDesc->isVariant() || !SCDescRep1->isValid() ||
     145        2118 :       SCDescRep1->isVariant() || !SCDescRep2->isValid() ||
     146             :       SCDescRep2->isVariant()) {
     147           0 :     VecInstElemTable[InstDesc->getOpcode()] = false;
     148             :     return false;
     149             :   }
     150             : 
     151        3177 :   if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) >
     152        3177 :       SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) +
     153        2118 :           SchedModel.computeInstrLatency(InstDescRep2->getOpcode())) {
     154         992 :     VecInstElemTable[InstDesc->getOpcode()] = true;
     155             :     return true;
     156             :   }
     157        1126 :   VecInstElemTable[InstDesc->getOpcode()] = false;
     158             :   return false;
     159             : }
     160             : 
     161             : /// Determine if we need to exit the vector by element instruction
     162             : /// optimization pass early. This makes sure that Targets with no need
     163             : /// for this optimization do not spent any compile time on this pass.
     164             : /// This check is done by comparing the latency of an indexed FMLA
     165             : /// instruction to the latency of the DUP + the latency of a vector
     166             : /// FMLA instruction. We do not check on other related instructions such
     167             : /// as FMLS as we assume that if the situation shows up for one
     168             : /// instruction, then it is likely to show up for the related ones.
     169             : /// Return true if early exit of the pass is recommended.
     170        1010 : bool AArch64VectorByElementOpt::earlyExitVectElement(MachineFunction *MF) {
     171        2020 :   std::map<unsigned, bool> VecInstElemTable;
     172        2020 :   const MCInstrDesc *IndexMulMCID = &TII->get(AArch64::FMLAv4i32_indexed);
     173        2020 :   const MCInstrDesc *DupMCID = &TII->get(AArch64::DUPv4i32lane);
     174        2020 :   const MCInstrDesc *MulMCID = &TII->get(AArch64::FMULv4f32);
     175             : 
     176        1010 :   if (!shouldReplaceInstruction(MF, IndexMulMCID, DupMCID, MulMCID,
     177             :                                 VecInstElemTable))
     178             :     return true;
     179             :   return false;
     180             : }
     181             : 
     182             : /// Check whether an equivalent DUP instruction has already been
     183             : /// created or not.
     184             : /// Return true when the dup instruction already exists. In this case,
     185             : /// DestReg will point to the destination of the already created DUP.
     186          49 : bool AArch64VectorByElementOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
     187             :                                          unsigned SrcReg, unsigned LaneNumber,
     188             :                                          unsigned *DestReg) const {
     189          98 :   for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
     190         215 :        MII != MIE;) {
     191         334 :     MII--;
     192         167 :     MachineInstr *CurrentMI = &*MII;
     193             : 
     194           2 :     if (CurrentMI->getOpcode() == DupOpcode &&
     195           2 :         CurrentMI->getNumOperands() == 3 &&
     196         171 :         CurrentMI->getOperand(1).getReg() == SrcReg &&
     197           2 :         CurrentMI->getOperand(2).getImm() == LaneNumber) {
     198           1 :       *DestReg = CurrentMI->getOperand(0).getReg();
     199           1 :       return true;
     200             :     }
     201             :   }
     202             : 
     203             :   return false;
     204             : }
     205             : 
/// Certain SIMD instructions with vector element operand are not efficient.
/// Rewrite them into SIMD instructions with vector operands. This rewrite
/// is driven by the latency of the instructions.
/// The instructions of concern are for the time being fmla, fmls, fmul,
/// and fmulx and hence they are hardcoded in the switch below.
///
/// Example:
///    fmla v0.4s, v1.4s, v2.s[1]
///    is rewritten into
///    dup v3.4s, v2.s[1]           // dup not necessary if redundant
///    fmla v0.4s, v1.4s, v3.4s
/// Return true if the SIMD instruction is modified.
bool AArch64VectorByElementOpt::optimizeVectElement(
    MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const {
  const MCInstrDesc *MulMCID, *DupMCID;
  // Default register class for the DUP destination; the 2x32 cases below
  // override it with the 64-bit FP register class.
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  // Map each supported indexed opcode to the DUP opcode for its lane width
  // and the full-vector form of the arithmetic instruction.
  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }

  // Consult the scheduling model (with caching) before rewriting.
  if (!shouldReplaceInstruction(MI.getParent()->getParent(),
                                &TII->get(MI.getOpcode()), DupMCID, MulMCID,
                                *VecInstElemTable))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Get the operands of the current SIMD arithmetic instruction.
  unsigned MulDest = MI.getOperand(0).getReg();
  unsigned SrcReg0 = MI.getOperand(1).getReg();
  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  unsigned SrcReg1 = MI.getOperand(2).getReg();
  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;

  // Instructions of interest have either 4 or 5 operands: 5 for the
  // accumulating forms (fmla/fmls: dest, accumulator, src, src, lane) and
  // 4 for the plain multiplies (fmul/fmulx: dest, src, src, lane).
  if (MI.getNumOperands() == 5) {
    unsigned SrcReg2 = MI.getOperand(3).getReg();
    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();

    // Create a new DUP instruction. Note that if an equivalent DUP
    // instruction has already been created before, then reuse its result
    // register instead of creating a new one.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  } else {
    return false;
  }

  // The caller erases the original MI; only count the rewrite here.
  ++NumModifiedInstr;
  return true;
}
     340             : 
/// Entry point: rewrite eligible vector-by-element instructions in MF.
/// Returns true if the function was modified.
bool AArch64VectorByElementOpt::runOnMachineFunction(MachineFunction &MF) {
  // Respect -O0 / optnone.
  if (skipFunction(*MF.getFunction()))
    return false;

  TII = MF.getSubtarget().getInstrInfo();
  MRI = &MF.getRegInfo();
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  const AArch64InstrInfo *AAII =
      static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
  if (!AAII)
    return false;
  SchedModel.init(ST.getSchedModel(), &ST, AAII);
  // Latency comparison requires a per-instruction scheduling model.
  if (!SchedModel.hasInstrSchedModel())
    return false;

  // A simple check to exit this pass early for targets that do not need it.
  if (earlyExitVectElement(&MF))
    return false;

  bool Changed = false;
  // Caches shouldReplaceInstruction() verdicts per opcode for this function.
  std::map<unsigned, bool> VecInstElemTable;
  SmallVector<MachineInstr *, 8> RemoveMIs;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
         MII != MIE;) {
      MachineInstr &MI = *MII;
      if (optimizeVectElement(MI, &VecInstElemTable)) {
        // Add MI to the list of instructions to be removed given that it has
        // been replaced. Deferring the erase keeps the iterator MII valid.
        RemoveMIs.push_back(&MI);
        Changed = true;
      }
      ++MII;
    }
  }

  // Erase the replaced instructions only after iteration is complete.
  for (MachineInstr *MI : RemoveMIs)
    MI->eraseFromParent();

  return Changed;
}
     383             : 
     384             : /// createAArch64VectorByElementOptPass - returns an instance of the
     385             : /// vector by element optimization pass.
     386         914 : FunctionPass *llvm::createAArch64VectorByElementOptPass() {
     387         914 :   return new AArch64VectorByElementOpt();
     388             : }

Generated by: LCOV version 1.13