LCOV - code coverage report
Current view: top level - lib/Target/AArch64 - AArch64AdvSIMDScalarPass.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 144 153 94.1 %
Date: 2017-09-14 15:23:50 Functions: 17 18 94.4 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- AArch64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : // When profitable, replace GPR targeting i64 instructions with their
      10             : // AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
      11             : // as minimizing the number of cross-class register copies.
      12             : //===----------------------------------------------------------------------===//
      13             : 
      14             : //===----------------------------------------------------------------------===//
      15             : // TODO: Graph based predicate heuristics.
      16             : // Walking the instruction list linearly will get many, perhaps most, of
      17             : // the cases, but to do a truly thorough job of this, we need a more
      18             : // wholistic approach.
      19             : //
      20             : // This optimization is very similar in spirit to the register allocator's
      21             : // spill placement, only here we're determining where to place cross-class
      22             : // register copies rather than spills. As such, a similar approach is
      23             : // called for.
      24             : //
      25             : // We want to build up a set of graphs of all instructions which are candidates
      26             : // for transformation along with instructions which generate their inputs and
      27             : // consume their outputs. For each edge in the graph, we assign a weight
      28             : // based on whether there is a copy required there (weight zero if not) and
      29             : // the block frequency of the block containing the defining or using
      30             : // instruction, whichever is less. Our optimization is then a graph problem
      31             : // to minimize the total weight of all the graphs, then transform instructions
      32             : // and add or remove copy instructions as called for to implement the
      33             : // solution.
      34             : //===----------------------------------------------------------------------===//
      35             : 
      36             : #include "AArch64.h"
      37             : #include "AArch64InstrInfo.h"
      38             : #include "AArch64RegisterInfo.h"
      39             : #include "AArch64Subtarget.h"
      40             : #include "llvm/ADT/Statistic.h"
      41             : #include "llvm/CodeGen/MachineFunction.h"
      42             : #include "llvm/CodeGen/MachineFunctionPass.h"
      43             : #include "llvm/CodeGen/MachineInstr.h"
      44             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      45             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      46             : #include "llvm/Support/CommandLine.h"
      47             : #include "llvm/Support/Debug.h"
      48             : #include "llvm/Support/raw_ostream.h"
      49             : using namespace llvm;
      50             : 
      51             : #define DEBUG_TYPE "aarch64-simd-scalar"
      52             : 
      53             : // Allow forcing all i64 operations with equivalent SIMD instructions to use
      54             : // them. For stress-testing the transformation function.
      55             : static cl::opt<bool>
      56       72306 : TransformAll("aarch64-simd-scalar-force-all",
      57      216918 :              cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
      58      289224 :              cl::init(false), cl::Hidden);
      59             : 
      60             : STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
      61             : STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
      62             : STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");
      63             : 
      64             : #define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization"
      65             : 
      66             : namespace {
      67           5 : class AArch64AdvSIMDScalar : public MachineFunctionPass {
      68             :   MachineRegisterInfo *MRI;
      69             :   const TargetInstrInfo *TII;
      70             : 
      71             : private:
      72             :   // isProfitableToTransform - Predicate function to determine whether an
      73             :   // instruction should be transformed to its equivalent AdvSIMD scalar
      74             :   // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
      75             :   bool isProfitableToTransform(const MachineInstr &MI) const;
      76             : 
      77             :   // transformInstruction - Perform the transformation of an instruction
      78             :   // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
      79             :   // to be the correct register class, minimizing cross-class copies.
      80             :   void transformInstruction(MachineInstr &MI);
      81             : 
      82             :   // processMachineBasicBlock - Main optimzation loop.
      83             :   bool processMachineBasicBlock(MachineBasicBlock *MBB);
      84             : 
      85             : public:
      86             :   static char ID; // Pass identification, replacement for typeid.
      87           5 :   explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {
      88           5 :     initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry());
      89           5 :   }
      90             : 
      91             :   bool runOnMachineFunction(MachineFunction &F) override;
      92             : 
      93           5 :   StringRef getPassName() const override { return AARCH64_ADVSIMD_NAME; }
      94             : 
      95           5 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
      96           5 :     AU.setPreservesCFG();
      97           5 :     MachineFunctionPass::getAnalysisUsage(AU);
      98           5 :   }
      99             : };
     100             : char AArch64AdvSIMDScalar::ID = 0;
     101             : } // end anonymous namespace
     102             : 
     103      312568 : INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar",
     104             :                 AARCH64_ADVSIMD_NAME, false, false)
     105             : 
     106         168 : static bool isGPR64(unsigned Reg, unsigned SubReg,
     107             :                     const MachineRegisterInfo *MRI) {
     108         168 :   if (SubReg)
     109             :     return false;
     110         168 :   if (TargetRegisterInfo::isVirtualRegister(Reg))
     111         336 :     return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
     112           0 :   return AArch64::GPR64RegClass.contains(Reg);
     113             : }
     114             : 
     115         312 : static bool isFPR64(unsigned Reg, unsigned SubReg,
     116             :                     const MachineRegisterInfo *MRI) {
     117         312 :   if (TargetRegisterInfo::isVirtualRegister(Reg))
     118         640 :     return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
     119         576 :             SubReg == 0) ||
     120         672 :            (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
     121             :             SubReg == AArch64::dsub);
     122             :   // Physical register references just check the register class directly.
     123          24 :   return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
     124          16 :          (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub);
     125             : }
     126             : 
     127             : // getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
     128             : // copy instruction. Return zero_reg if the instruction is not a copy.
     129         180 : static MachineOperand *getSrcFromCopy(MachineInstr *MI,
     130             :                                       const MachineRegisterInfo *MRI,
     131             :                                       unsigned &SubReg) {
     132         180 :   SubReg = 0;
     133             :   // The "FMOV Xd, Dn" instruction is the typical form.
     134         540 :   if (MI->getOpcode() == AArch64::FMOVDXr ||
     135         180 :       MI->getOpcode() == AArch64::FMOVXDr)
     136           0 :     return &MI->getOperand(1);
     137             :   // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
     138             :   // these at this stage, but it's easy to check for.
     139         180 :   if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
     140           0 :     SubReg = AArch64::dsub;
     141           0 :     return &MI->getOperand(1);
     142             :   }
     143             :   // Or just a plain COPY instruction. This can be directly to/from FPR64,
     144             :   // or it can be a dsub subreg reference to an FPR128.
     145         180 :   if (MI->getOpcode() == AArch64::COPY) {
     146         336 :     if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
     147         192 :                 MRI) &&
     148          48 :         isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI))
     149          48 :       return &MI->getOperand(1);
     150         144 :     if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
     151         288 :                 MRI) &&
     152         288 :         isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
     153             :                 MRI)) {
     154         136 :       SubReg = MI->getOperand(1).getSubReg();
     155         272 :       return &MI->getOperand(1);
     156             :     }
     157             :   }
     158             : 
     159             :   // Otherwise, this is some other kind of instruction.
     160             :   return nullptr;
     161             : }
     162             : 
     163             : // getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent
     164             : // that we're considering transforming to, return that AdvSIMD opcode. For all
     165             : // others, return the original opcode.
     166             : static unsigned getTransformOpcode(unsigned Opc) {
     167         475 :   switch (Opc) {
     168             :   default:
     169             :     break;
     170             :   // FIXME: Lots more possibilities.
     171             :   case AArch64::ADDXrr:
     172             :     return AArch64::ADDv1i64;
     173          28 :   case AArch64::SUBXrr:
     174             :     return AArch64::SUBv1i64;
     175           8 :   case AArch64::ANDXrr:
     176             :     return AArch64::ANDv8i8;
     177           8 :   case AArch64::EORXrr:
     178             :     return AArch64::EORv8i8;
     179           8 :   case AArch64::ORRXrr:
     180             :     return AArch64::ORRv8i8;
     181             :   }
     182             :   // No AdvSIMD equivalent, so just return the original opcode.
     183             :   return Opc;
     184             : }
     185             : 
     186             : static bool isTransformable(const MachineInstr &MI) {
     187         866 :   unsigned Opc = MI.getOpcode();
     188         439 :   return Opc != getTransformOpcode(Opc);
     189             : }
     190             : 
     191             : // isProfitableToTransform - Predicate function to determine whether an
     192             : // instruction should be transformed to its equivalent AdvSIMD scalar
     193             : // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
     194         427 : bool AArch64AdvSIMDScalar::isProfitableToTransform(
     195             :     const MachineInstr &MI) const {
     196             :   // If this instruction isn't eligible to be transformed (no SIMD equivalent),
     197             :   // early exit since that's the common case.
     198         427 :   if (!isTransformable(MI))
     199             :     return false;
     200             : 
     201             :   // Count the number of copies we'll need to add and approximate the number
     202             :   // of copies that a transform will enable us to remove.
     203          36 :   unsigned NumNewCopies = 3;
     204          36 :   unsigned NumRemovableCopies = 0;
     205             : 
     206          36 :   unsigned OrigSrc0 = MI.getOperand(1).getReg();
     207          36 :   unsigned OrigSrc1 = MI.getOperand(2).getReg();
     208             :   unsigned SubReg0;
     209             :   unsigned SubReg1;
     210          72 :   if (!MRI->def_empty(OrigSrc0)) {
     211             :     MachineRegisterInfo::def_instr_iterator Def =
     212          36 :         MRI->def_instr_begin(OrigSrc0);
     213             :     assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
     214          72 :     MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
     215             :     // If the source was from a copy, we don't need to insert a new copy.
     216          36 :     if (MOSrc0)
     217          32 :       --NumNewCopies;
     218             :     // If there are no other users of the original source, we can delete
     219             :     // that instruction.
     220          36 :     if (MOSrc0 && MRI->hasOneNonDBGUse(OrigSrc0))
     221             :       ++NumRemovableCopies;
     222             :   }
     223          72 :   if (!MRI->def_empty(OrigSrc1)) {
     224             :     MachineRegisterInfo::def_instr_iterator Def =
     225          36 :         MRI->def_instr_begin(OrigSrc1);
     226             :     assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
     227          72 :     MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
     228          36 :     if (MOSrc1)
     229          36 :       --NumNewCopies;
     230             :     // If there are no other users of the original source, we can delete
     231             :     // that instruction.
     232          36 :     if (MOSrc1 && MRI->hasOneNonDBGUse(OrigSrc1))
     233          32 :       ++NumRemovableCopies;
     234             :   }
     235             : 
     236             :   // If any of the uses of the original instructions is a cross class copy,
     237             :   // that's a copy that will be removable if we transform. Likewise, if
     238             :   // any of the uses is a transformable instruction, it's likely the tranforms
     239             :   // will chain, enabling us to save a copy there, too. This is an aggressive
     240             :   // heuristic that approximates the graph based cost analysis described above.
     241          36 :   unsigned Dst = MI.getOperand(0).getReg();
     242          36 :   bool AllUsesAreCopies = true;
     243          36 :   for (MachineRegisterInfo::use_instr_nodbg_iterator
     244          36 :            Use = MRI->use_instr_nodbg_begin(Dst),
     245          36 :            E = MRI->use_instr_nodbg_end();
     246          72 :        Use != E; ++Use) {
     247             :     unsigned SubReg;
     248          96 :     if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(*Use))
     249          28 :       ++NumRemovableCopies;
     250             :     // If the use is an INSERT_SUBREG, that's still something that can
     251             :     // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
     252             :     // preferable to have it use the FPR64 in most cases, as if the source
     253             :     // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
     254             :     // Ditto for a lane insert.
     255          12 :     else if (Use->getOpcode() == AArch64::INSERT_SUBREG ||
     256           4 :              Use->getOpcode() == AArch64::INSvi64gpr)
     257             :       ;
     258             :     else
     259             :       AllUsesAreCopies = false;
     260             :   }
     261             :   // If all of the uses of the original destination register are copies to
     262             :   // FPR64, then we won't end up having a new copy back to GPR64 either.
     263          36 :   if (AllUsesAreCopies)
     264          36 :     --NumNewCopies;
     265             : 
     266             :   // If a transform will not increase the number of cross-class copies required,
     267             :   // return true.
     268          36 :   if (NumNewCopies <= NumRemovableCopies)
     269             :     return true;
     270             : 
     271             :   // Finally, even if we otherwise wouldn't transform, check if we're forcing
     272             :   // transformation of everything.
     273           0 :   return TransformAll;
     274             : }
     275             : 
     276          40 : static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI,
     277             :                                 unsigned Dst, unsigned Src, bool IsKill) {
     278          80 :   MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
     279         120 :                                     TII->get(AArch64::COPY), Dst)
     280          40 :                                 .addReg(Src, getKillRegState(IsKill));
     281             :   DEBUG(dbgs() << "    adding copy: " << *MIB);
     282          40 :   ++NumCopiesInserted;
     283          40 :   return MIB;
     284             : }
     285             : 
     286             : // transformInstruction - Perform the transformation of an instruction
     287             : // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
     288             : // to be the correct register class, minimizing cross-class copies.
     289          36 : void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
     290             :   DEBUG(dbgs() << "Scalar transform: " << MI);
     291             : 
     292          36 :   MachineBasicBlock *MBB = MI.getParent();
     293          72 :   unsigned OldOpc = MI.getOpcode();
     294          36 :   unsigned NewOpc = getTransformOpcode(OldOpc);
     295             :   assert(OldOpc != NewOpc && "transform an instruction to itself?!");
     296             : 
     297             :   // Check if we need a copy for the source registers.
     298          36 :   unsigned OrigSrc0 = MI.getOperand(1).getReg();
     299          36 :   unsigned OrigSrc1 = MI.getOperand(2).getReg();
     300          36 :   unsigned Src0 = 0, SubReg0;
     301          36 :   unsigned Src1 = 0, SubReg1;
     302          36 :   bool KillSrc0 = false, KillSrc1 = false;
     303          72 :   if (!MRI->def_empty(OrigSrc0)) {
     304             :     MachineRegisterInfo::def_instr_iterator Def =
     305          36 :         MRI->def_instr_begin(OrigSrc0);
     306             :     assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
     307          72 :     MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
     308             :     // If there are no other users of the original source, we can delete
     309             :     // that instruction.
     310          36 :     if (MOSrc0) {
     311          32 :       Src0 = MOSrc0->getReg();
     312          32 :       KillSrc0 = MOSrc0->isKill();
     313             :       // Src0 is going to be reused, thus, it cannot be killed anymore.
     314          32 :       MOSrc0->setIsKill(false);
     315          32 :       if (MRI->hasOneNonDBGUse(OrigSrc0)) {
     316             :         assert(MOSrc0 && "Can't delete copy w/o a valid original source!");
     317          28 :         Def->eraseFromParent();
     318             :         ++NumCopiesDeleted;
     319             :       }
     320             :     }
     321             :   }
     322          72 :   if (!MRI->def_empty(OrigSrc1)) {
     323             :     MachineRegisterInfo::def_instr_iterator Def =
     324          36 :         MRI->def_instr_begin(OrigSrc1);
     325             :     assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
     326          72 :     MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
     327             :     // If there are no other users of the original source, we can delete
     328             :     // that instruction.
     329          36 :     if (MOSrc1) {
     330          36 :       Src1 = MOSrc1->getReg();
     331          36 :       KillSrc1 = MOSrc1->isKill();
     332             :       // Src0 is going to be reused, thus, it cannot be killed anymore.
     333          36 :       MOSrc1->setIsKill(false);
     334          36 :       if (MRI->hasOneNonDBGUse(OrigSrc1)) {
     335             :         assert(MOSrc1 && "Can't delete copy w/o a valid original source!");
     336          32 :         Def->eraseFromParent();
     337             :         ++NumCopiesDeleted;
     338             :       }
     339             :     }
     340             :   }
     341             :   // If we weren't able to reference the original source directly, create a
     342             :   // copy.
     343          36 :   if (!Src0) {
     344           4 :     SubReg0 = 0;
     345           4 :     Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
     346           4 :     insertCopy(TII, MI, Src0, OrigSrc0, KillSrc0);
     347           4 :     KillSrc0 = true;
     348             :   }
     349          36 :   if (!Src1) {
     350           0 :     SubReg1 = 0;
     351           0 :     Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
     352           0 :     insertCopy(TII, MI, Src1, OrigSrc1, KillSrc1);
     353           0 :     KillSrc1 = true;
     354             :   }
     355             : 
     356             :   // Create a vreg for the destination.
     357             :   // FIXME: No need to do this if the ultimate user expects an FPR64.
     358             :   // Check for that and avoid the copy if possible.
     359          36 :   unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
     360             : 
     361             :   // For now, all of the new instructions have the same simple three-register
     362             :   // form, so no need to special case based on what instruction we're
     363             :   // building.
     364         144 :   BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), Dst)
     365          36 :       .addReg(Src0, getKillRegState(KillSrc0), SubReg0)
     366         108 :       .addReg(Src1, getKillRegState(KillSrc1), SubReg1);
     367             : 
     368             :   // Now copy the result back out to a GPR.
     369             :   // FIXME: Try to avoid this if all uses could actually just use the FPR64
     370             :   // directly.
     371          36 :   insertCopy(TII, MI, MI.getOperand(0).getReg(), Dst, true);
     372             : 
     373             :   // Erase the old instruction.
     374          36 :   MI.eraseFromParent();
     375             : 
     376          36 :   ++NumScalarInsnsUsed;
     377          36 : }
     378             : 
     379             : // processMachineBasicBlock - Main optimzation loop.
     380          63 : bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
     381          63 :   bool Changed = false;
     382         616 :   for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
     383        1281 :     MachineInstr &MI = *I++;
     384         427 :     if (isProfitableToTransform(MI)) {
     385          36 :       transformInstruction(MI);
     386          36 :       Changed = true;
     387             :     }
     388             :   }
     389          63 :   return Changed;
     390             : }
     391             : 
     392             : // runOnMachineFunction - Pass entry point from PassManager.
     393          63 : bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
     394          63 :   bool Changed = false;
     395             :   DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
     396             : 
     397          63 :   if (skipFunction(*mf.getFunction()))
     398             :     return false;
     399             : 
     400          63 :   MRI = &mf.getRegInfo();
     401          63 :   TII = mf.getSubtarget().getInstrInfo();
     402             : 
     403             :   // Just check things on a one-block-at-a-time basis.
     404         126 :   for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
     405          63 :     if (processMachineBasicBlock(&*I))
     406          28 :       Changed = true;
     407             :   return Changed;
     408             : }
     409             : 
     410             : // createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine
     411             : // to add the pass to the PassManager.
     412           5 : FunctionPass *llvm::createAArch64AdvSIMDScalar() {
     413           5 :   return new AArch64AdvSIMDScalar();
     414      216918 : }

Generated by: LCOV version 1.13