LCOV - llvm-toolchain.info - lib/Target/AArch64/AArch64SIMDInstrOpt.cpp

LCOV - code coverage report

Current view:	top level - lib/Target/AArch64 - AArch64SIMDInstrOpt.cpp (source / functions)		Hit	Total	Coverage
Test:	llvm-toolchain.info	Lines:	167	237	70.5 %
Date:	2018-10-20 13:21:21	Functions:	8	13	61.5 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : //
       2             : //                     The LLVM Compiler Infrastructure
       3             : //
       4             : // This file is distributed under the University of Illinois Open Source
       5             : // License. See LICENSE.TXT for details.
       6             : //
       7             : //===----------------------------------------------------------------------===//
       8             : //
       9             : // This file contains a pass that performs optimization on SIMD instructions
      10             : // with high latency by splitting them into more efficient series of
      11             : // instructions.
      12             : //
      13             : // 1. Rewrite certain SIMD instructions with vector element due to their
      14             : // inefficiency on some targets.
      15             : //
      16             : // For example:
      17             : //    fmla v0.4s, v1.4s, v2.s[1]
      18             : //
      19             : // Is rewritten into:
      20             : //    dup v3.4s, v2.s[1]
      21             : //    fmla v0.4s, v1.4s, v3.4s
      22             : //
      23             : // 2. Rewrite interleaved memory access instructions due to their
      24             : // inefficiency on some targets.
      25             : //
      26             : // For example:
      27             : //    st2 {v0.4s, v1.4s}, addr
      28             : //
      29             : // Is rewritten into:
      30             : //    zip1 v2.4s, v0.4s, v1.4s
      31             : //    zip2 v3.4s, v0.4s, v1.4s
      32             : //    stp  q2, q3,  addr
      33             : //
      34             : //===----------------------------------------------------------------------===//
      35             : 
      36             : #include "AArch64InstrInfo.h"
      37             : #include "llvm/ADT/SmallVector.h"
      38             : #include "llvm/ADT/Statistic.h"
      39             : #include "llvm/ADT/StringRef.h"
      40             : #include "llvm/CodeGen/MachineBasicBlock.h"
      41             : #include "llvm/CodeGen/MachineFunction.h"
      42             : #include "llvm/CodeGen/MachineFunctionPass.h"
      43             : #include "llvm/CodeGen/MachineInstr.h"
      44             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      45             : #include "llvm/CodeGen/MachineOperand.h"
      46             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      47             : #include "llvm/CodeGen/TargetInstrInfo.h"
      48             : #include "llvm/CodeGen/TargetSchedule.h"
      49             : #include "llvm/CodeGen/TargetSubtargetInfo.h"
      50             : #include "llvm/MC/MCInstrDesc.h"
      51             : #include "llvm/MC/MCSchedule.h"
      52             : #include "llvm/Pass.h"
      53             : #include <unordered_map>
      54             : 
      55             : using namespace llvm;
      56             : 
      57             : #define DEBUG_TYPE "aarch64-simdinstr-opt"
      58             : 
      59             : STATISTIC(NumModifiedInstr,
      60             :           "Number of SIMD instructions modified");
      61             : 
      62             : #define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
      63             :   "AArch64 SIMD instructions optimization pass"
      64             : 
      65             : namespace {
      66             : 
      67             : struct AArch64SIMDInstrOpt : public MachineFunctionPass {
      68             :   static char ID;
      69             : 
      70             :   const TargetInstrInfo *TII;
      71             :   MachineRegisterInfo *MRI;
      72             :   TargetSchedModel SchedModel;
      73             : 
      74             :   // The two maps below cache decisions so that they are not recomputed.
      75             :   // This map caches instruction replacement decisions, keyed by
      76             :   // (opcode, CPU name), within function units and across function units.
      77             :   std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
      78             :   // This map caches, per CPU name, the decision of whether to leave the
      79             :   // interleaved store instructions replacement subpass early or not.
      80             :   std::unordered_map<std::string, bool> InterlEarlyExit;
      81             : 
      82             :   typedef enum {
      83             :     VectorElem, // Rewrite of SIMD instructions with a vector element operand.
      84             :     Interleave  // Rewrite of interleaved store (ST2/ST4) instructions.
      85             :   } Subpass;
      86             : 
      87             :   // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
      88       46914 :   struct InstReplInfo {
      89             :     unsigned OrigOpc; // Opcode of the instruction to be replaced.
      90             :                 std::vector<unsigned> ReplOpc; // Opcodes of the replacement instructions.
      91             :     const TargetRegisterClass RC; // Class of the registers created for ZIP results.
      92             :   };
      93             : 
      94             : #define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
      95             :   {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
      96             : #define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
      97             :                 OpcR7, OpcR8, OpcR9, RC) \
      98             :   {OpcOrg, \
      99             :    {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
     100             : 
     101             :   // The Instruction Replacement Table (one rule per ST2/ST4 opcode):
     102             :   std::vector<InstReplInfo> IRT = {
     103             :     // ST2 instructions: replaced by ZIP1, ZIP2 and one store pair.
     104             :     RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
     105             :           AArch64::STPQi, AArch64::FPR128RegClass),
     106             :     RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
     107             :           AArch64::STPQi, AArch64::FPR128RegClass),
     108             :     RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
     109             :           AArch64::STPDi, AArch64::FPR64RegClass),
     110             :     RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
     111             :           AArch64::STPQi, AArch64::FPR128RegClass),
     112             :     RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
     113             :           AArch64::STPDi, AArch64::FPR64RegClass),
     114             :     RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
     115             :           AArch64::STPQi, AArch64::FPR128RegClass),
     116             :     RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
     117             :           AArch64::STPDi, AArch64::FPR64RegClass),
     118             :     // ST4 instructions: replaced by four ZIP1/ZIP2 pairs and two store pairs.
     119             :     RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
     120             :           AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
     121             :           AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
     122             :           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
     123             :     RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
     124             :           AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
     125             :           AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
     126             :           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
     127             :     RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
     128             :           AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
     129             :           AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
     130             :           AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
     131             :     RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
     132             :           AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
     133             :           AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
     134             :           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
     135             :     RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
     136             :           AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
     137             :           AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
     138             :           AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
     139             :     RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
     140             :           AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
     141             :           AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
     142             :           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
     143             :     RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
     144             :           AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
     145             :           AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
     146             :           AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
     147             :   };
     148             : 
     149             :   // A costly instruction is replaced in this work by N efficient instructions.
     150             :   // The maximum of N is currently 10 and it is for the ST4 case.
     151             :   static const unsigned MaxNumRepl = 10;
     152             : 
     153       16800 :   AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {
     154        1120 :     initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry());
     155        1120 :   }
     156             : 
     157             :   /// Based only on latency of instructions, determine if it is cost efficient
     158             :   /// to replace the instruction InstDesc by the instructions stored in the
     159             :   /// array ReplInstrMCID.
     160             :   /// Return true if replacement is expected to be faster.
     161             :   bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
     162             :                          SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);
     163             : 
     164             :   /// Determine if we need to exit the instruction replacement optimization
     165             :   /// passes early. This makes sure that no compile time is spent in this pass
     166             :   /// for targets with no need for any of these optimizations.
     167             :   /// Return true if early exit of the pass is recommended.
     168             :   bool shouldExitEarly(MachineFunction *MF, Subpass SP);
     169             : 
     170             :   /// Check whether an equivalent DUP instruction has already been
     171             :   /// created or not.
     172             :   /// Return true when the DUP instruction already exists. In this case,
     173             :   /// *DestReg is set to the destination register of the already created DUP.
     174             :   bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
     175             :                 unsigned LaneNumber, unsigned *DestReg) const;
     176             : 
     177             :   /// Certain SIMD instructions with vector element operand are not efficient.
     178             :   /// Rewrite them into SIMD instructions with vector operands. This rewrite
     179             :   /// is driven by the latency of the instructions.
     180             :   /// Return true if the SIMD instruction is modified.
     181             :   bool optimizeVectElement(MachineInstr &MI);
     182             : 
     183             :   /// Process the REG_SEQUENCE instruction, and extract the source
     184             :   /// operands of the ST2/4 instruction from it.
     185             :   /// Example of such instructions.
     186             :   ///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
     187             :   /// Return true when the instruction is processed successfully.
     188             :   bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg,
     189             :                          unsigned* StRegKill, unsigned NumArg) const;
     190             : 
     191             :   /// Load/Store Interleaving instructions are not always beneficial.
     192             :   /// Replace them by ZIP instructions and classical load/store.
     193             :   /// Return true if the SIMD instruction is modified.
     194             :   bool optimizeLdStInterleave(MachineInstr &MI);
     195             : 
     196             :   /// Return the number of useful source registers for this
     197             :   /// instruction (2 for ST2 and 4 for ST4).
     198             :   unsigned determineSrcReg(MachineInstr &MI) const;
     199             : 
     200             :   bool runOnMachineFunction(MachineFunction &Fn) override;
     201             : 
     202        1114 :   StringRef getPassName() const override {
     203        1114 :     return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
     204             :   }
     205             : };
     206             : 
     207             : char AArch64SIMDInstrOpt::ID = 0;
     208             : 
     209             : } // end anonymous namespace
     210             : 
     211      200152 : INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
     212             :                 AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
     213             : 
     214             : /// Based only on latency of instructions, determine if it is cost efficient
     215             : /// to replace the instruction InstDesc by the instructions stored in the
     216             : /// array InstDescRepl. The decision is cached per (opcode, CPU name).
     217             : /// Return true if replacement is expected to be faster. MF is unused here.
     218           0 : bool AArch64SIMDInstrOpt::
     219             : shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
     220             :                   SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
     221             :   // Check if replacement decision is already available in the cached table.
     222             :   // If so, return it.
     223           0 :   std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
     224           0 :   auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
     225           0 :   if (SIMDInstrTable.find(InstID) != SIMDInstrTable.end())
     226           0 :     return SIMDInstrTable[InstID];
     227             : 
     228           0 :   unsigned SCIdx = InstDesc->getSchedClass();
     229             :   const MCSchedClassDesc *SCDesc =
     230           0 :     SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
     231             : 
     232             :   // If the scheduling class of any instruction of interest is invalid or
     233             :   // variant for this target, then return false for no replacement.
     234             :   const MCSchedClassDesc *SCDescRepl;
     235           0 :   if (!SCDesc->isValid() || SCDesc->isVariant())
     236             :   {
     237           0 :     SIMDInstrTable[InstID] = false;
     238           0 :     return false;
     239             :   }
     240           0 :   for (auto IDesc : InstDescRepl)
     241             :   {
     242           0 :     SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
     243             :       IDesc->getSchedClass());
     244           0 :     if (!SCDescRepl->isValid() || SCDescRepl->isVariant())
     245             :     {
     246           0 :       SIMDInstrTable[InstID] = false;
     247           0 :       return false;
     248             :     }
     249             :   }
     250             : 
     251             :   // Replacement cost: sum of the latencies of all replacement instructions.
     252             :   unsigned ReplCost = 0;
     253           0 :   for (auto IDesc :InstDescRepl)
     254           0 :     ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());
     255             : 
     256           0 :   if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) // Replace only if strictly faster.
     257             :   {
     258           0 :     SIMDInstrTable[InstID] = true;
     259           0 :     return true;
     260             :   }
     261             :   else
     262             :   {
     263           0 :     SIMDInstrTable[InstID] = false;
     264           0 :     return false;
     265             :   }
     266             : }
     267             : 
     268             : /// Determine if we need to exit this pass for a kind of instruction replacement
     269             : /// early. This makes sure that no compile time is spent in this pass for
     270             : /// targets with no need for any of these optimizations beyond performing this
     271             : /// check.
     272             : /// Return true if early exit of this pass for a kind of instruction
     273             : /// replacement is recommended for a target.
     274           0 : bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
     275             :   const MCInstrDesc* OriginalMCID;
     276             :   SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
     277             : 
     278           0 :   switch (SP) {
     279             :   // For this optimization, check by comparing the latency of a representative
     280             :   // instruction to that of the replacement instructions.
     281             :   // TODO: check for all concerned instructions.
     282           0 :   case VectorElem:
     283           0 :     OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
     284           0 :     ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
     285           0 :     ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
     286           0 :     if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
     287           0 :       return false; // Replacement is profitable: do not exit early.
     288           0 :     break;
     289             : 
     290             :   // For this optimization, check for all concerned instructions.
     291           0 :   case Interleave:
     292           0 :     std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
     293           0 :     if (InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end())
     294           0 :       return InterlEarlyExit[Subtarget]; // Decision already cached for this CPU.
     295             : 
     296           0 :     for (auto &I : IRT) {
     297           0 :       OriginalMCID = &TII->get(I.OrigOpc);
     298           0 :       for (auto &Repl : I.ReplOpc)
     299           0 :         ReplInstrMCID.push_back(&TII->get(Repl));
     300           0 :       if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
     301           0 :         InterlEarlyExit[Subtarget] = false;
     302           0 :         return false; // At least one rule is profitable: do not exit early.
     303             :       }
     304             :       ReplInstrMCID.clear(); // Start the next rule with an empty list.
     305             :     }
     306           0 :     InterlEarlyExit[Subtarget] = true;
     307             :     break;
     308             :   }
     309             : 
     310             :   return true; // No profitable replacement was found.
     311             : }
     312             : 
     313             : /// Check whether an equivalent DUP instruction already exists before MI
     314             : /// in the basic block that contains MI.
     315             : /// Return true when the DUP instruction already exists. In this case,
     316             : /// *DestReg is set to the destination register of the already created DUP.
     317           0 : bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
     318             :                                          unsigned SrcReg, unsigned LaneNumber,
     319             :                                          unsigned *DestReg) const {
     320           0 :   for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
     321           0 :        MII != MIE;) { // Walk backwards from MI to the start of its block.
     322             :     MII--;
     323             :     MachineInstr *CurrentMI = &*MII;
     324             : 
     325           0 :     if (CurrentMI->getOpcode() == DupOpcode &&
     326           0 :         CurrentMI->getNumOperands() == 3 &&
     327           0 :         CurrentMI->getOperand(1).getReg() == SrcReg &&
     328           0 :         CurrentMI->getOperand(2).getImm() == LaneNumber) {
     329           0 :       *DestReg = CurrentMI->getOperand(0).getReg();
     330           0 :       return true;
     331             :     }
     332             :   }
     333             : 
     334           0 :   return false; // No matching DUP found; *DestReg is left untouched.
     335             : }
     336             : 
     337             : /// Certain SIMD instructions with vector element operand are not efficient.
     338             : /// Rewrite them into SIMD instructions with vector operands. This rewrite
     339             : /// is driven by the latency of the instructions.
     340             : /// The instructions of concern are, for the time being, FMLA, FMLS, FMUL,
     341             : /// and FMULX and hence they are hardcoded.
     342             : ///
     343             : /// For example:
     344             : ///    fmla v0.4s, v1.4s, v2.s[1]
     345             : ///
     346             : /// Is rewritten into:
     347             : ///    dup  v3.4s, v2.s[1]      // DUP not necessary if redundant
     348             : ///    fmla v0.4s, v1.4s, v3.4s
     349             : ///
     350             : /// Return true if the SIMD instruction is modified.
     351        3211 : bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
     352             :   const MCInstrDesc *MulMCID, *DupMCID;
     353             :   const TargetRegisterClass *RC = &AArch64::FPR128RegClass; // Overridden to FPR64 for the 2X32 cases.
     354             : 
     355        6422 :   switch (MI.getOpcode()) {
     356             :   default:
     357             :     return false; // Not one of the hardcoded instructions of concern.
     358             : 
     359             :   // 4X32 instructions
     360           6 :   case AArch64::FMLAv4i32_indexed:
     361          12 :     DupMCID = &TII->get(AArch64::DUPv4i32lane);
     362           6 :     MulMCID = &TII->get(AArch64::FMLAv4f32);
     363           6 :     break;
     364           6 :   case AArch64::FMLSv4i32_indexed:
     365          12 :     DupMCID = &TII->get(AArch64::DUPv4i32lane);
     366           6 :     MulMCID = &TII->get(AArch64::FMLSv4f32);
     367           6 :     break;
     368           4 :   case AArch64::FMULXv4i32_indexed:
     369           8 :     DupMCID = &TII->get(AArch64::DUPv4i32lane);
     370           4 :     MulMCID = &TII->get(AArch64::FMULXv4f32);
     371           4 :     break;
     372           4 :   case AArch64::FMULv4i32_indexed:
     373           8 :     DupMCID = &TII->get(AArch64::DUPv4i32lane);
     374           4 :     MulMCID = &TII->get(AArch64::FMULv4f32);
     375           4 :     break;
     376             : 
     377             :   // 2X64 instructions
     378           3 :   case AArch64::FMLAv2i64_indexed:
     379           6 :     DupMCID = &TII->get(AArch64::DUPv2i64lane);
     380           3 :     MulMCID = &TII->get(AArch64::FMLAv2f64);
     381           3 :     break;
     382           3 :   case AArch64::FMLSv2i64_indexed:
     383           6 :     DupMCID = &TII->get(AArch64::DUPv2i64lane);
     384           3 :     MulMCID = &TII->get(AArch64::FMLSv2f64);
     385           3 :     break;
     386           4 :   case AArch64::FMULXv2i64_indexed:
     387           8 :     DupMCID = &TII->get(AArch64::DUPv2i64lane);
     388           4 :     MulMCID = &TII->get(AArch64::FMULXv2f64);
     389           4 :     break;
     390           3 :   case AArch64::FMULv2i64_indexed:
     391           6 :     DupMCID = &TII->get(AArch64::DUPv2i64lane);
     392           3 :     MulMCID = &TII->get(AArch64::FMULv2f64);
     393           3 :     break;
     394             : 
     395             :   // 2X32 instructions
     396           8 :   case AArch64::FMLAv2i32_indexed:
     397             :     RC = &AArch64::FPR64RegClass;
     398          16 :     DupMCID = &TII->get(AArch64::DUPv2i32lane);
     399           8 :     MulMCID = &TII->get(AArch64::FMLAv2f32);
     400           8 :     break;
     401           4 :   case AArch64::FMLSv2i32_indexed:
     402             :     RC = &AArch64::FPR64RegClass;
     403           8 :     DupMCID = &TII->get(AArch64::DUPv2i32lane);
     404           4 :     MulMCID = &TII->get(AArch64::FMLSv2f32);
     405           4 :     break;
     406           4 :   case AArch64::FMULXv2i32_indexed:
     407             :     RC = &AArch64::FPR64RegClass;
     408           8 :     DupMCID = &TII->get(AArch64::DUPv2i32lane);
     409           4 :     MulMCID = &TII->get(AArch64::FMULXv2f32);
     410           4 :     break;
     411           4 :   case AArch64::FMULv2i32_indexed:
     412             :     RC = &AArch64::FPR64RegClass;
     413           8 :     DupMCID = &TII->get(AArch64::DUPv2i32lane);
     414           4 :     MulMCID = &TII->get(AArch64::FMULv2f32);
     415           4 :     break;
     416             :   }
     417             : 
     418             :   SmallVector<const MCInstrDesc*, 2> ReplInstrMCID;
     419          53 :   ReplInstrMCID.push_back(DupMCID);
     420          53 :   ReplInstrMCID.push_back(MulMCID);
     421         159 :   if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
     422             :                          ReplInstrMCID))
     423             :     return false; // DUP + vector form is not expected to be faster.
     424             : 
     425             :   const DebugLoc &DL = MI.getDebugLoc();
     426          53 :   MachineBasicBlock &MBB = *MI.getParent();
     427          53 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
     428             : 
     429             :   // Get the operands of the current SIMD arithmetic instruction.
     430          53 :   unsigned MulDest = MI.getOperand(0).getReg();
     431          53 :   unsigned SrcReg0 = MI.getOperand(1).getReg();
     432             :   unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
     433          53 :   unsigned SrcReg1 = MI.getOperand(2).getReg();
     434             :   unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
     435             :   unsigned DupDest;
     436             : 
     437             :   // Instructions of interest have either 4 or 5 operands.
     438          53 :   if (MI.getNumOperands() == 5) {
     439          30 :     unsigned SrcReg2 = MI.getOperand(3).getReg();
     440             :     unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
     441          30 :     unsigned LaneNumber = MI.getOperand(4).getImm();
     442             :     // Create a new DUP instruction. Note that if an equivalent DUP instruction
     443             :     // has already been created before, then use that one instead of creating
     444             :     // a new one.
     445          60 :     if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
     446          29 :       DupDest = MRI.createVirtualRegister(RC);
     447          58 :       BuildMI(MBB, MI, DL, *DupMCID, DupDest)
     448          29 :           .addReg(SrcReg2, Src2IsKill)
     449             :           .addImm(LaneNumber);
     450             :     }
     451          60 :     BuildMI(MBB, MI, DL, *MulMCID, MulDest)
     452          30 :         .addReg(SrcReg0, Src0IsKill)
     453          30 :         .addReg(SrcReg1, Src1IsKill)
     454          30 :         .addReg(DupDest, Src2IsKill); // NOTE(review): DupDest gets the lane operand's kill state even when a DUP is reused - confirm.
     455          23 :   } else if (MI.getNumOperands() == 4) {
     456          23 :     unsigned LaneNumber = MI.getOperand(3).getImm();
     457          46 :     if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
     458          23 :       DupDest = MRI.createVirtualRegister(RC);
     459          46 :       BuildMI(MBB, MI, DL, *DupMCID, DupDest)
     460          23 :           .addReg(SrcReg1, Src1IsKill)
     461             :           .addImm(LaneNumber);
     462             :     }
     463          46 :     BuildMI(MBB, MI, DL, *MulMCID, MulDest)
     464          23 :         .addReg(SrcReg0, Src0IsKill)
     465          23 :         .addReg(DupDest, Src1IsKill); // NOTE(review): same kill-state propagation as in the 5-operand case - confirm.
     466             :   } else {
     467             :     return false;
     468             :   }
     469             : 
     470             :   ++NumModifiedInstr; // MI itself is not erased in this function.
     471             :   return true;
     472             : }
     473             : 
     474             : /// Load/Store Interleaving instructions are not always beneficial.
     475             : /// Replace them by ZIP instructions and classical load/store.
     476             : ///
     477             : /// For example:
     478             : ///    st2 {v0.4s, v1.4s}, addr
     479             : ///
     480             : /// Is rewritten into:
     481             : ///    zip1 v2.4s, v0.4s, v1.4s
     482             : ///    zip2 v3.4s, v0.4s, v1.4s
     483             : ///    stp  q2, q3, addr
     484             : ///
     485             : /// For example:
     486             : ///    st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
     487             : ///
     488             : /// Is rewritten into:
     489             : ///    zip1 v4.4s, v0.4s, v2.4s
     490             : ///    zip2 v5.4s, v0.4s, v2.4s
     491             : ///    zip1 v6.4s, v1.4s, v3.4s
     492             : ///    zip2 v7.4s, v1.4s, v3.4s
     493             : ///    zip1 v8.4s, v4.4s, v6.4s
     494             : ///    zip2 v9.4s, v4.4s, v6.4s
     495             : ///    zip1 v10.4s, v5.4s, v7.4s
     496             : ///    zip2 v11.4s, v5.4s, v7.4s
     497             : ///    stp  q8, q9, addr
     498             : ///    stp  q10, q11, addr+32
     499             : ///
     500             : /// Currently only instructions related to ST2 and ST4 are considered.
     501             : /// Others may be added later.
     502             : /// Return true if the SIMD instruction is modified.
     503        5738 : bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
     504             : 
     505             :   unsigned SeqReg, AddrReg;
     506             :   unsigned StReg[4], StRegKill[4];
     507             :   MachineInstr *DefiningMI;
     508             :   const DebugLoc &DL = MI.getDebugLoc();
     509        5738 :   MachineBasicBlock &MBB = *MI.getParent();
     510             :   SmallVector<unsigned, MaxNumRepl> ZipDest;
     511             :   SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
     512             : 
     513             :   // If current instruction matches any of the rewriting rules, then
     514             :   // gather information about parameters of the new instructions.
     515             :   bool Match = false;
     516       85965 :   for (auto &I : IRT) {
     517      160482 :     if (MI.getOpcode() == I.OrigOpc) {
     518          14 :       SeqReg  = MI.getOperand(0).getReg();
     519          14 :       AddrReg = MI.getOperand(1).getReg();
     520          14 :       DefiningMI = MRI->getUniqueVRegDef(SeqReg);
     521          14 :       unsigned NumReg = determineSrcReg(MI);
     522          14 :       if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
     523             :         return false;
     524             : 
     525         105 :       for (auto &Repl : I.ReplOpc) {
     526         182 :         ReplInstrMCID.push_back(&TII->get(Repl));
     527             :         // Generate destination registers but only for non-store instruction.
     528          91 :         if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
     529         140 :           ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
     530             :       }
     531             :       Match = true;
     532             :       break;
     533             :     }
     534             :   }
     535             : 
     536        5738 :   if (!Match)
     537             :     return false;
     538             : 
     539             :   // Determine if it is profitable to replace MI by the series of instructions
     540             :   // represented in ReplInstrMCID.
     541          42 :   if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
     542             :                          ReplInstrMCID))
     543             :     return false;
     544             : 
     545             :   // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at
     546             :   // this point, the code generation is hardcoded and does not rely on the IRT
     547             :   // table used above given that code generation for ST2 replacement is somewhat
     548             :   // different than for ST4 replacement. We could have added more info into the
     549             :   // table related to how we build new instructions but we may be adding more
     550             :   // complexity with that).
     551          28 :   switch (MI.getOpcode()) {
     552             :   default:
     553             :     return false;
     554             : 
     555             :   case AArch64::ST2Twov16b:
     556             :   case AArch64::ST2Twov8b:
     557             :   case AArch64::ST2Twov8h:
     558             :   case AArch64::ST2Twov4h:
     559             :   case AArch64::ST2Twov4s:
     560             :   case AArch64::ST2Twov2s:
     561             :   case AArch64::ST2Twov2d:
     562             :     // ZIP instructions
     563          21 :     BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
     564           7 :         .addReg(StReg[0])
     565           7 :         .addReg(StReg[1]);
     566          21 :     BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
     567           7 :         .addReg(StReg[0], StRegKill[0])
     568           7 :         .addReg(StReg[1], StRegKill[1]);
     569             :     // STP instructions
     570           7 :     BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
     571           7 :         .addReg(ZipDest[0])
     572           7 :         .addReg(ZipDest[1])
     573           7 :         .addReg(AddrReg)
     574             :         .addImm(0);
     575           7 :     break;
     576             : 
     577             :   case AArch64::ST4Fourv16b:
     578             :   case AArch64::ST4Fourv8b:
     579             :   case AArch64::ST4Fourv8h:
     580             :   case AArch64::ST4Fourv4h:
     581             :   case AArch64::ST4Fourv4s:
     582             :   case AArch64::ST4Fourv2s:
     583             :   case AArch64::ST4Fourv2d:
     584             :     // ZIP instructions
     585          21 :     BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
     586           7 :         .addReg(StReg[0])
     587           7 :         .addReg(StReg[2]);
     588          21 :     BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
     589           7 :         .addReg(StReg[0], StRegKill[0])
     590           7 :         .addReg(StReg[2], StRegKill[2]);
     591          21 :     BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
     592           7 :         .addReg(StReg[1])
     593           7 :         .addReg(StReg[3]);
     594          21 :     BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
     595           7 :         .addReg(StReg[1], StRegKill[1])
     596           7 :         .addReg(StReg[3], StRegKill[3]);
     597          21 :     BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
     598           7 :         .addReg(ZipDest[0])
     599           7 :         .addReg(ZipDest[2]);
     600          21 :     BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
     601           7 :         .addReg(ZipDest[0])
     602           7 :         .addReg(ZipDest[2]);
     603          21 :     BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
     604           7 :         .addReg(ZipDest[1])
     605           7 :         .addReg(ZipDest[3]);
     606          21 :     BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
     607           7 :         .addReg(ZipDest[1])
     608           7 :         .addReg(ZipDest[3]);
     609             :     // stp instructions
     610          14 :     BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
     611           7 :         .addReg(ZipDest[4])
     612           7 :         .addReg(ZipDest[5])
     613           7 :         .addReg(AddrReg)
     614             :         .addImm(0);
     615           7 :     BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
     616           7 :         .addReg(ZipDest[6])
     617           7 :         .addReg(ZipDest[7])
     618           7 :         .addReg(AddrReg)
     619             :         .addImm(2);
     620           7 :     break;
     621             :   }
     622             : 
     623             :   ++NumModifiedInstr;
     624             :   return true;
     625             : }
     626             : 
     627             : /// Process The REG_SEQUENCE instruction, and extract the source
     628             : /// operands of the ST2/4 instruction from it.
     629             : /// Example of such instruction.
     630             : ///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
     631             : /// Return true when the instruction is processed successfully.
     632           0 : bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
     633             :      unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const {
     634             :   assert (DefiningMI != NULL);
     635           0 :   if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
     636           0 :     return false;
     637             : 
     638           0 :   for (unsigned i=0; i<NumArg; i++) {
     639           0 :     StReg[i]     = DefiningMI->getOperand(2*i+1).getReg();
     640           0 :     StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());
     641             : 
     642             :     // Sanity check for the other arguments.
     643           0 :     if (DefiningMI->getOperand(2*i+2).isImm()) {
     644           0 :       switch (DefiningMI->getOperand(2*i+2).getImm()) {
     645           0 :       default:
     646           0 :         return false;
     647             : 
     648             :       case AArch64::dsub0:
     649             :       case AArch64::dsub1:
     650             :       case AArch64::dsub2:
     651             :       case AArch64::dsub3:
     652             :       case AArch64::qsub0:
     653             :       case AArch64::qsub1:
     654             :       case AArch64::qsub2:
     655             :       case AArch64::qsub3:
     656             :         break;
     657             :       }
     658             :     }
     659             :     else
     660           0 :       return false;
     661             :   }
     662             :   return true;
     663             : }
     664             : 
     665             : /// Return the number of useful source registers for this instruction
     666             : /// (2 for ST2 and 4 for ST4).
     667           0 : unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
     668           0 :   switch (MI.getOpcode()) {
     669           0 :   default:
     670           0 :     llvm_unreachable("Unsupported instruction for this pass");
     671             : 
     672             :   case AArch64::ST2Twov16b:
     673             :   case AArch64::ST2Twov8b:
     674             :   case AArch64::ST2Twov8h:
     675             :   case AArch64::ST2Twov4h:
     676             :   case AArch64::ST2Twov4s:
     677             :   case AArch64::ST2Twov2s:
     678             :   case AArch64::ST2Twov2d:
     679             :     return 2;
     680             : 
     681           0 :   case AArch64::ST4Fourv16b:
     682             :   case AArch64::ST4Fourv8b:
     683             :   case AArch64::ST4Fourv8h:
     684             :   case AArch64::ST4Fourv4h:
     685             :   case AArch64::ST4Fourv4s:
     686             :   case AArch64::ST4Fourv2s:
     687             :   case AArch64::ST4Fourv2d:
     688           0 :     return 4;
     689             :   }
     690             : }
     691             : 
     692       14089 : bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
     693       14089 :   if (skipFunction(MF.getFunction()))
     694             :     return false;
     695             : 
     696       14083 :   TII = MF.getSubtarget().getInstrInfo();
     697       14083 :   MRI = &MF.getRegInfo();
     698       14083 :   const TargetSubtargetInfo &ST = MF.getSubtarget();
     699             :   const AArch64InstrInfo *AAII =
     700       14083 :       static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
     701       14083 :   if (!AAII)
     702             :     return false;
     703       14083 :   SchedModel.init(&ST);
     704       14083 :   if (!SchedModel.hasInstrSchedModel())
     705             :     return false;
     706             : 
     707             :   bool Changed = false;
     708        6087 :   for (auto OptimizationKind : {VectorElem, Interleave}) {
     709        4058 :     if (!shouldExitEarly(&MF, OptimizationKind)) {
     710             :       SmallVector<MachineInstr *, 8> RemoveMIs;
     711        2561 :       for (MachineBasicBlock &MBB : MF) {
     712             :         for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
     713       10283 :              MII != MIE;) {
     714             :           MachineInstr &MI = *MII;
     715             :           bool InstRewrite;
     716        8949 :           if (OptimizationKind == VectorElem)
     717        3211 :             InstRewrite = optimizeVectElement(MI) ;
     718             :           else
     719        5738 :             InstRewrite = optimizeLdStInterleave(MI);
     720        8949 :           if (InstRewrite) {
     721             :             // Add MI to the list of instructions to be removed given that it
     722             :             // has been replaced.
     723          67 :             RemoveMIs.push_back(&MI);
     724             :             Changed = true;
     725             :           }
     726             :           ++MII;
     727             :         }
     728             :       }
     729        1294 :       for (MachineInstr *MI : RemoveMIs)
     730          67 :         MI->eraseFromParent();
     731             :     }
     732             :   }
     733             : 
     734             :   return Changed;
     735             : }
     736             : 
     737             : /// Returns an instance of the high cost ASIMD instruction replacement
     738             : /// optimization pass.
     739        1120 : FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
     740        1120 :   return new AArch64SIMDInstrOpt();
     741             : }

Generated by: LCOV version 1.13