LCOV - llvm-toolchain.info - lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp

LCOV - code coverage report

Current view:	top level - lib/Target/AArch64 - AArch64AdvSIMDScalarPass.cpp (source / functions)		Hit	Total	Coverage
Test:	llvm-toolchain.info	Lines:	52	125	41.6 %
Date:	2018-10-20 13:21:21	Functions:	12	14	85.7 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : //===-- AArch64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : // When profitable, replace GPR targeting i64 instructions with their
      10             : // AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
      11             : // as minimizing the number of cross-class register copies.
      12             : //===----------------------------------------------------------------------===//
      13             : 
      14             : //===----------------------------------------------------------------------===//
      15             : // TODO: Graph based predicate heuristics.
      16             : // Walking the instruction list linearly will get many, perhaps most, of
      17             : // the cases, but to do a truly thorough job of this, we need a more
      18             : // wholistic approach.
      19             : //
      20             : // This optimization is very similar in spirit to the register allocator's
      21             : // spill placement, only here we're determining where to place cross-class
      22             : // register copies rather than spills. As such, a similar approach is
      23             : // called for.
      24             : //
      25             : // We want to build up a set of graphs of all instructions which are candidates
      26             : // for transformation along with instructions which generate their inputs and
      27             : // consume their outputs. For each edge in the graph, we assign a weight
      28             : // based on whether there is a copy required there (weight zero if not) and
      29             : // the block frequency of the block containing the defining or using
      30             : // instruction, whichever is less. Our optimization is then a graph problem
      31             : // to minimize the total weight of all the graphs, then transform instructions
      32             : // and add or remove copy instructions as called for to implement the
      33             : // solution.
      34             : //===----------------------------------------------------------------------===//
      35             : 
      36             : #include "AArch64.h"
      37             : #include "AArch64InstrInfo.h"
      38             : #include "AArch64RegisterInfo.h"
      39             : #include "llvm/ADT/Statistic.h"
      40             : #include "llvm/CodeGen/MachineFunction.h"
      41             : #include "llvm/CodeGen/MachineFunctionPass.h"
      42             : #include "llvm/CodeGen/MachineInstr.h"
      43             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      44             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      45             : #include "llvm/Support/CommandLine.h"
      46             : #include "llvm/Support/Debug.h"
      47             : #include "llvm/Support/raw_ostream.h"
      48             : using namespace llvm;
      49             : 
      50             : #define DEBUG_TYPE "aarch64-simd-scalar"
      51             : 
      52             : // Allow forcing all i64 operations with equivalent SIMD instructions to use
      53             : // them. For stress-testing the transformation function.
      54             : static cl::opt<bool>
      55             : TransformAll("aarch64-simd-scalar-force-all",
      56             :              cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
      57             :              cl::init(false), cl::Hidden);
      58             : 
      59             : STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
      60             : STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
      61             : STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");
      62             : 
      63             : #define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization"
      64             : 
      65             : namespace {
      66             : class AArch64AdvSIMDScalar : public MachineFunctionPass {
      67             :   MachineRegisterInfo *MRI;
      68             :   const TargetInstrInfo *TII;
      69             : 
      70             : private:
      71             :   // isProfitableToTransform - Predicate function to determine whether an
      72             :   // instruction should be transformed to its equivalent AdvSIMD scalar
      73             :   // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
      74             :   bool isProfitableToTransform(const MachineInstr &MI) const;
      75             : 
      76             :   // transformInstruction - Perform the transformation of an instruction
      77             :   // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
      78             :   // to be the correct register class, minimizing cross-class copies.
      79             :   void transformInstruction(MachineInstr &MI);
      80             : 
      81             :   // processMachineBasicBlock - Main optimzation loop.
      82             :   bool processMachineBasicBlock(MachineBasicBlock *MBB);
      83             : 
      84             : public:
      85             :   static char ID; // Pass identification, replacement for typeid.
      86           5 :   explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {
      87           5 :     initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry());
      88           5 :   }
      89             : 
      90             :   bool runOnMachineFunction(MachineFunction &F) override;
      91             : 
      92           5 :   StringRef getPassName() const override { return AARCH64_ADVSIMD_NAME; }
      93             : 
      94           5 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
      95           5 :     AU.setPreservesCFG();
      96           5 :     MachineFunctionPass::getAnalysisUsage(AU);
      97           5 :   }
      98             : };
      99             : char AArch64AdvSIMDScalar::ID = 0;
     100             : } // end anonymous namespace
     101             : 
     102      199037 : INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar",
     103             :                 AARCH64_ADVSIMD_NAME, false, false)
     104             : 
     105         168 : static bool isGPR64(unsigned Reg, unsigned SubReg,
     106             :                     const MachineRegisterInfo *MRI) {
     107         168 :   if (SubReg)
     108             :     return false;
     109         168 :   if (TargetRegisterInfo::isVirtualRegister(Reg))
     110         168 :     return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
     111           0 :   return AArch64::GPR64RegClass.contains(Reg);
     112             : }
     113             : 
     114         312 : static bool isFPR64(unsigned Reg, unsigned SubReg,
     115             :                     const MachineRegisterInfo *MRI) {
     116         312 :   if (TargetRegisterInfo::isVirtualRegister(Reg))
     117          32 :     return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
     118         304 :             SubReg == 0) ||
     119         128 :            (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
     120             :             SubReg == AArch64::dsub);
     121             :   // Physical register references just check the register class directly.
     122          16 :   return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
     123           8 :          (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub);
     124             : }
     125             : 
     126             : // getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
     127             : // copy instruction. Return zero_reg if the instruction is not a copy.
     128         180 : static MachineOperand *getSrcFromCopy(MachineInstr *MI,
     129             :                                       const MachineRegisterInfo *MRI,
     130             :                                       unsigned &SubReg) {
     131         180 :   SubReg = 0;
     132             :   // The "FMOV Xd, Dn" instruction is the typical form.
     133         360 :   if (MI->getOpcode() == AArch64::FMOVDXr ||
     134             :       MI->getOpcode() == AArch64::FMOVXDr)
     135           0 :     return &MI->getOperand(1);
     136             :   // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
     137             :   // these at this stage, but it's easy to check for.
     138         180 :   if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
     139           0 :     SubReg = AArch64::dsub;
     140           0 :     return &MI->getOperand(1);
     141             :   }
     142             :   // Or just a plain COPY instruction. This can be directly to/from FPR64,
     143             :   // or it can be a dsub subreg reference to an FPR128.
     144         180 :   if (MI->getOpcode() == AArch64::COPY) {
     145         168 :     if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
     146         168 :                 MRI) &&
     147          24 :         isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI))
     148          24 :       return &MI->getOperand(1);
     149         144 :     if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
     150         144 :                 MRI) &&
     151         144 :         isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
     152             :                 MRI)) {
     153         136 :       SubReg = MI->getOperand(1).getSubReg();
     154         136 :       return &MI->getOperand(1);
     155             :     }
     156             :   }
     157             : 
     158             :   // Otherwise, this is some other kind of instruction.
     159             :   return nullptr;
     160             : }
     161             : 
     162             : // getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent
     163             : // that we're considering transforming to, return that AdvSIMD opcode. For all
     164             : // others, return the original opcode.
     165             : static unsigned getTransformOpcode(unsigned Opc) {
     166           0 :   switch (Opc) {
     167             :   default:
     168             :     break;
     169             :   // FIXME: Lots more possibilities.
     170             :   case AArch64::ADDXrr:
     171             :     return AArch64::ADDv1i64;
     172           0 :   case AArch64::SUBXrr:
     173             :     return AArch64::SUBv1i64;
     174           0 :   case AArch64::ANDXrr:
     175             :     return AArch64::ANDv8i8;
     176           0 :   case AArch64::EORXrr:
     177             :     return AArch64::EORv8i8;
     178           0 :   case AArch64::ORRXrr:
     179             :     return AArch64::ORRv8i8;
     180             :   }
     181             :   // No AdvSIMD equivalent, so just return the original opcode.
     182             :   return Opc;
     183             : }
     184             : 
     185             : static bool isTransformable(const MachineInstr &MI) {
     186           0 :   unsigned Opc = MI.getOpcode();
     187             :   return Opc != getTransformOpcode(Opc);
     188             : }
     189             : 
     190             : // isProfitableToTransform - Predicate function to determine whether an
     191             : // instruction should be transformed to its equivalent AdvSIMD scalar
     192             : // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
     193           0 : bool AArch64AdvSIMDScalar::isProfitableToTransform(
     194             :     const MachineInstr &MI) const {
     195             :   // If this instruction isn't eligible to be transformed (no SIMD equivalent),
     196             :   // early exit since that's the common case.
     197           0 :   if (!isTransformable(MI))
     198           0 :     return false;
     199             : 
     200             :   // Count the number of copies we'll need to add and approximate the number
     201             :   // of copies that a transform will enable us to remove.
     202             :   unsigned NumNewCopies = 3;
     203             :   unsigned NumRemovableCopies = 0;
     204             : 
     205           0 :   unsigned OrigSrc0 = MI.getOperand(1).getReg();
     206           0 :   unsigned OrigSrc1 = MI.getOperand(2).getReg();
     207             :   unsigned SubReg0;
     208             :   unsigned SubReg1;
     209           0 :   if (!MRI->def_empty(OrigSrc0)) {
     210             :     MachineRegisterInfo::def_instr_iterator Def =
     211           0 :         MRI->def_instr_begin(OrigSrc0);
     212             :     assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
     213           0 :     MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
     214             :     // If the source was from a copy, we don't need to insert a new copy.
     215           0 :     if (MOSrc0)
     216             :       --NumNewCopies;
     217             :     // If there are no other users of the original source, we can delete
     218             :     // that instruction.
     219           0 :     if (MOSrc0 && MRI->hasOneNonDBGUse(OrigSrc0))
     220             :       ++NumRemovableCopies;
     221             :   }
     222           0 :   if (!MRI->def_empty(OrigSrc1)) {
     223             :     MachineRegisterInfo::def_instr_iterator Def =
     224           0 :         MRI->def_instr_begin(OrigSrc1);
     225             :     assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
     226           0 :     MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
     227           0 :     if (MOSrc1)
     228           0 :       --NumNewCopies;
     229             :     // If there are no other users of the original source, we can delete
     230             :     // that instruction.
     231           0 :     if (MOSrc1 && MRI->hasOneNonDBGUse(OrigSrc1))
     232           0 :       ++NumRemovableCopies;
     233             :   }
     234             : 
     235             :   // If any of the uses of the original instructions is a cross class copy,
     236             :   // that's a copy that will be removable if we transform. Likewise, if
     237             :   // any of the uses is a transformable instruction, it's likely the tranforms
     238             :   // will chain, enabling us to save a copy there, too. This is an aggressive
     239             :   // heuristic that approximates the graph based cost analysis described above.
     240           0 :   unsigned Dst = MI.getOperand(0).getReg();
     241             :   bool AllUsesAreCopies = true;
     242           0 :   for (MachineRegisterInfo::use_instr_nodbg_iterator
     243           0 :            Use = MRI->use_instr_nodbg_begin(Dst),
     244             :            E = MRI->use_instr_nodbg_end();
     245           0 :        Use != E; ++Use) {
     246             :     unsigned SubReg;
     247           0 :     if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(*Use))
     248           0 :       ++NumRemovableCopies;
     249             :     // If the use is an INSERT_SUBREG, that's still something that can
     250             :     // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
     251             :     // preferable to have it use the FPR64 in most cases, as if the source
     252             :     // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
     253             :     // Ditto for a lane insert.
     254           0 :     else if (Use->getOpcode() == AArch64::INSERT_SUBREG ||
     255             :              Use->getOpcode() == AArch64::INSvi64gpr)
     256             :       ;
     257             :     else
     258             :       AllUsesAreCopies = false;
     259             :   }
     260             :   // If all of the uses of the original destination register are copies to
     261             :   // FPR64, then we won't end up having a new copy back to GPR64 either.
     262           0 :   if (AllUsesAreCopies)
     263           0 :     --NumNewCopies;
     264             : 
     265             :   // If a transform will not increase the number of cross-class copies required,
     266             :   // return true.
     267           0 :   if (NumNewCopies <= NumRemovableCopies)
     268           0 :     return true;
     269             : 
     270             :   // Finally, even if we otherwise wouldn't transform, check if we're forcing
     271             :   // transformation of everything.
     272           0 :   return TransformAll;
     273             : }
     274             : 
     275          40 : static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI,
     276             :                                 unsigned Dst, unsigned Src, bool IsKill) {
     277          80 :   MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
     278          80 :                                     TII->get(AArch64::COPY), Dst)
     279          40 :                                 .addReg(Src, getKillRegState(IsKill));
     280             :   LLVM_DEBUG(dbgs() << "    adding copy: " << *MIB);
     281             :   ++NumCopiesInserted;
     282          40 :   return MIB;
     283             : }
     284             : 
     285             : // transformInstruction - Perform the transformation of an instruction
     286             : // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
     287             : // to be the correct register class, minimizing cross-class copies.
     288           0 : void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
     289             :   LLVM_DEBUG(dbgs() << "Scalar transform: " << MI);
     290             : 
     291           0 :   MachineBasicBlock *MBB = MI.getParent();
     292           0 :   unsigned OldOpc = MI.getOpcode();
     293             :   unsigned NewOpc = getTransformOpcode(OldOpc);
     294             :   assert(OldOpc != NewOpc && "transform an instruction to itself?!");
     295             : 
     296             :   // Check if we need a copy for the source registers.
     297           0 :   unsigned OrigSrc0 = MI.getOperand(1).getReg();
     298           0 :   unsigned OrigSrc1 = MI.getOperand(2).getReg();
     299             :   unsigned Src0 = 0, SubReg0;
     300             :   unsigned Src1 = 0, SubReg1;
     301             :   bool KillSrc0 = false, KillSrc1 = false;
     302           0 :   if (!MRI->def_empty(OrigSrc0)) {
     303             :     MachineRegisterInfo::def_instr_iterator Def =
     304           0 :         MRI->def_instr_begin(OrigSrc0);
     305             :     assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
     306           0 :     MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
     307             :     // If there are no other users of the original source, we can delete
     308             :     // that instruction.
     309           0 :     if (MOSrc0) {
     310           0 :       Src0 = MOSrc0->getReg();
     311             :       KillSrc0 = MOSrc0->isKill();
     312             :       // Src0 is going to be reused, thus, it cannot be killed anymore.
     313             :       MOSrc0->setIsKill(false);
     314           0 :       if (MRI->hasOneNonDBGUse(OrigSrc0)) {
     315             :         assert(MOSrc0 && "Can't delete copy w/o a valid original source!");
     316           0 :         Def->eraseFromParent();
     317             :         ++NumCopiesDeleted;
     318             :       }
     319             :     }
     320             :   }
     321           0 :   if (!MRI->def_empty(OrigSrc1)) {
     322             :     MachineRegisterInfo::def_instr_iterator Def =
     323           0 :         MRI->def_instr_begin(OrigSrc1);
     324             :     assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
     325           0 :     MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
     326             :     // If there are no other users of the original source, we can delete
     327             :     // that instruction.
     328           0 :     if (MOSrc1) {
     329           0 :       Src1 = MOSrc1->getReg();
     330             :       KillSrc1 = MOSrc1->isKill();
     331             :       // Src0 is going to be reused, thus, it cannot be killed anymore.
     332             :       MOSrc1->setIsKill(false);
     333           0 :       if (MRI->hasOneNonDBGUse(OrigSrc1)) {
     334             :         assert(MOSrc1 && "Can't delete copy w/o a valid original source!");
     335           0 :         Def->eraseFromParent();
     336             :         ++NumCopiesDeleted;
     337             :       }
     338             :     }
     339             :   }
     340             :   // If we weren't able to reference the original source directly, create a
     341             :   // copy.
     342           0 :   if (!Src0) {
     343           0 :     SubReg0 = 0;
     344           0 :     Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
     345           0 :     insertCopy(TII, MI, Src0, OrigSrc0, KillSrc0);
     346             :     KillSrc0 = true;
     347             :   }
     348           0 :   if (!Src1) {
     349           0 :     SubReg1 = 0;
     350           0 :     Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
     351           0 :     insertCopy(TII, MI, Src1, OrigSrc1, KillSrc1);
     352             :     KillSrc1 = true;
     353             :   }
     354             : 
     355             :   // Create a vreg for the destination.
     356             :   // FIXME: No need to do this if the ultimate user expects an FPR64.
     357             :   // Check for that and avoid the copy if possible.
     358           0 :   unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
     359             : 
     360             :   // For now, all of the new instructions have the same simple three-register
     361             :   // form, so no need to special case based on what instruction we're
     362             :   // building.
     363           0 :   BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), Dst)
     364           0 :       .addReg(Src0, getKillRegState(KillSrc0), SubReg0)
     365           0 :       .addReg(Src1, getKillRegState(KillSrc1), SubReg1);
     366             : 
     367             :   // Now copy the result back out to a GPR.
     368             :   // FIXME: Try to avoid this if all uses could actually just use the FPR64
     369             :   // directly.
     370           0 :   insertCopy(TII, MI, MI.getOperand(0).getReg(), Dst, true);
     371             : 
     372             :   // Erase the old instruction.
     373           0 :   MI.eraseFromParent();
     374             : 
     375             :   ++NumScalarInsnsUsed;
     376           0 : }
     377             : 
     378             : // processMachineBasicBlock - Main optimzation loop.
     379          63 : bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
     380             :   bool Changed = false;
     381         490 :   for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
     382             :     MachineInstr &MI = *I++;
     383         427 :     if (isProfitableToTransform(MI)) {
     384          36 :       transformInstruction(MI);
     385             :       Changed = true;
     386             :     }
     387             :   }
     388          63 :   return Changed;
     389             : }
     390             : 
     391             : // runOnMachineFunction - Pass entry point from PassManager.
     392          63 : bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
     393             :   bool Changed = false;
     394             :   LLVM_DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
     395             : 
     396          63 :   if (skipFunction(mf.getFunction()))
     397             :     return false;
     398             : 
     399          63 :   MRI = &mf.getRegInfo();
     400          63 :   TII = mf.getSubtarget().getInstrInfo();
     401             : 
     402             :   // Just check things on a one-block-at-a-time basis.
     403         126 :   for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
     404          63 :     if (processMachineBasicBlock(&*I))
     405             :       Changed = true;
     406             :   return Changed;
     407             : }
     408             : 
     409             : // createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine
     410             : // to add the pass to the PassManager.
     411           5 : FunctionPass *llvm::createAArch64AdvSIMDScalar() {
     412           5 :   return new AArch64AdvSIMDScalar();
     413             : }

Generated by: LCOV version 1.13