LCOV - code coverage report
Current view: top level - lib/Target/AArch64 - AArch64LoadStoreOptimizer.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2018-02-18 03:11:45

              Hit    Total    Coverage
Lines:        521    570     91.4 %
Functions:    39     39      100.0 %

          Line data    Source code
       1             : //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // This file contains a pass that performs load / store related peephole
      11             : // optimizations. This pass should be run after register allocation.
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #include "AArch64InstrInfo.h"
      16             : #include "AArch64Subtarget.h"
      17             : #include "MCTargetDesc/AArch64AddressingModes.h"
      18             : #include "llvm/ADT/BitVector.h"
      19             : #include "llvm/ADT/SmallVector.h"
      20             : #include "llvm/ADT/Statistic.h"
      21             : #include "llvm/ADT/StringRef.h"
      22             : #include "llvm/ADT/iterator_range.h"
      23             : #include "llvm/Analysis/AliasAnalysis.h"
      24             : #include "llvm/CodeGen/MachineBasicBlock.h"
      25             : #include "llvm/CodeGen/MachineFunction.h"
      26             : #include "llvm/CodeGen/MachineFunctionPass.h"
      27             : #include "llvm/CodeGen/MachineInstr.h"
      28             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      29             : #include "llvm/CodeGen/MachineOperand.h"
      30             : #include "llvm/CodeGen/TargetRegisterInfo.h"
      31             : #include "llvm/IR/DebugLoc.h"
      32             : #include "llvm/MC/MCRegisterInfo.h"
      33             : #include "llvm/Pass.h"
      34             : #include "llvm/Support/CommandLine.h"
      35             : #include "llvm/Support/Debug.h"
      36             : #include "llvm/Support/ErrorHandling.h"
      37             : #include "llvm/Support/raw_ostream.h"
      38             : #include <cassert>
      39             : #include <cstdint>
      40             : #include <iterator>
      41             : #include <limits>
      42             : 
      43             : using namespace llvm;
      44             : 
      45             : #define DEBUG_TYPE "aarch64-ldst-opt"
      46             : 
      47             : STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
      48             : STATISTIC(NumPostFolded, "Number of post-index updates folded");
      49             : STATISTIC(NumPreFolded, "Number of pre-index updates folded");
      50             : STATISTIC(NumUnscaledPairCreated,
       51             :           "Number of load/store pairs generated from unscaled");
      52             : STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
      53             : STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
      54             : 
      55             : // The LdStLimit limits how far we search for load/store pairs.
      56       97317 : static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
      57       97317 :                                    cl::init(20), cl::Hidden);
      58             : 
      59             : // The UpdateLimit limits how far we search for update instructions when we form
      60             : // pre-/post-index instructions.
      61      291951 : static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
      62      291951 :                                      cl::Hidden);
      63             : 
      64             : #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
      65             : 
      66             : namespace {
      67             : 
      68             : using LdStPairFlags = struct LdStPairFlags {
      69             :   // If a matching instruction is found, MergeForward is set to true if the
      70             :   // merge is to remove the first instruction and replace the second with
      71             :   // a pair-wise insn, and false if the reverse is true.
      72             :   bool MergeForward = false;
      73             : 
      74             :   // SExtIdx gives the index of the result of the load pair that must be
      75             :   // extended. The value of SExtIdx assumes that the paired load produces the
      76             :   // value in this order: (I, returned iterator), i.e., -1 means no value has
      77             :   // to be extended, 0 means I, and 1 means the returned iterator.
      78             :   int SExtIdx = -1;
      79             : 
      80             :   LdStPairFlags() = default;
      81             : 
      82         866 :   void setMergeForward(bool V = true) { MergeForward = V; }
      83             :   bool getMergeForward() const { return MergeForward; }
      84             : 
      85       16653 :   void setSExtIdx(int V) { SExtIdx = V; }
      86             :   int getSExtIdx() const { return SExtIdx; }
      87             : };
      88             : 
      89        2862 : struct AArch64LoadStoreOpt : public MachineFunctionPass {
      90             :   static char ID;
      91             : 
      92         963 :   AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
      93         963 :     initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
      94         963 :   }
      95             : 
      96             :   AliasAnalysis *AA;
      97             :   const AArch64InstrInfo *TII;
      98             :   const TargetRegisterInfo *TRI;
      99             :   const AArch64Subtarget *Subtarget;
     100             : 
     101             :   // Track which registers have been modified and used.
     102             :   BitVector ModifiedRegs, UsedRegs;
     103             : 
     104         951 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     105             :     AU.addRequired<AAResultsWrapperPass>();
     106         951 :     MachineFunctionPass::getAnalysisUsage(AU);
     107         951 :   }
     108             : 
     109             :   // Scan the instructions looking for a load/store that can be combined
     110             :   // with the current instruction into a load/store pair.
     111             :   // Return the matching instruction if one is found, else MBB->end().
     112             :   MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
     113             :                                                LdStPairFlags &Flags,
     114             :                                                unsigned Limit,
     115             :                                                bool FindNarrowMerge);
     116             : 
     117             :   // Scan the instructions looking for a store that writes to the address from
     118             :   // which the current load instruction reads. Return true if one is found.
     119             :   bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
     120             :                          MachineBasicBlock::iterator &StoreI);
     121             : 
      122             :   // Merge the two narrow zero-store instructions indicated into one wider store.
     123             :   MachineBasicBlock::iterator
     124             :   mergeNarrowZeroStores(MachineBasicBlock::iterator I,
     125             :                         MachineBasicBlock::iterator MergeMI,
     126             :                         const LdStPairFlags &Flags);
     127             : 
     128             :   // Merge the two instructions indicated into a single pair-wise instruction.
     129             :   MachineBasicBlock::iterator
     130             :   mergePairedInsns(MachineBasicBlock::iterator I,
     131             :                    MachineBasicBlock::iterator Paired,
     132             :                    const LdStPairFlags &Flags);
     133             : 
     134             :   // Promote the load that reads directly from the address stored to.
     135             :   MachineBasicBlock::iterator
     136             :   promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
     137             :                        MachineBasicBlock::iterator StoreI);
     138             : 
     139             :   // Scan the instruction list to find a base register update that can
     140             :   // be combined with the current instruction (a load or store) using
      141             :   // pre- or post-indexed addressing with writeback. Scan forwards.
     142             :   MachineBasicBlock::iterator
     143             :   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
     144             :                                 int UnscaledOffset, unsigned Limit);
     145             : 
     146             :   // Scan the instruction list to find a base register update that can
     147             :   // be combined with the current instruction (a load or store) using
      148             :   // pre- or post-indexed addressing with writeback. Scan backwards.
     149             :   MachineBasicBlock::iterator
     150             :   findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
     151             : 
     152             :   // Find an instruction that updates the base register of the ld/st
     153             :   // instruction.
     154             :   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
     155             :                             unsigned BaseReg, int Offset);
     156             : 
     157             :   // Merge a pre- or post-index base register update into a ld/st instruction.
     158             :   MachineBasicBlock::iterator
     159             :   mergeUpdateInsn(MachineBasicBlock::iterator I,
     160             :                   MachineBasicBlock::iterator Update, bool IsPreIdx);
     161             : 
     162             :   // Find and merge zero store instructions.
     163             :   bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
     164             : 
     165             :   // Find and pair ldr/str instructions.
     166             :   bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
     167             : 
      168             :   // Find and promote load instructions which read directly from a store.
     169             :   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
     170             : 
      171             :   // Find and merge base register updates before or after a ld/st instruction.
     172             :   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
     173             : 
     174             :   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
     175             : 
     176             :   bool runOnMachineFunction(MachineFunction &Fn) override;
     177             : 
     178         951 :   MachineFunctionProperties getRequiredProperties() const override {
     179        1902 :     return MachineFunctionProperties().set(
     180         951 :         MachineFunctionProperties::Property::NoVRegs);
     181             :   }
     182             : 
     183         958 :   StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
     184             : };
     185             : 
     186             : char AArch64LoadStoreOpt::ID = 0;
     187             : 
     188             : } // end anonymous namespace
     189             : 
     190      357416 : INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
     191             :                 AARCH64_LOAD_STORE_OPT_NAME, false, false)
     192             : 
     193             : static bool isNarrowStore(unsigned Opc) {
     194             :   switch (Opc) {
     195             :   default:
     196             :     return false;
     197             :   case AArch64::STRBBui:
     198             :   case AArch64::STURBBi:
     199             :   case AArch64::STRHHui:
     200             :   case AArch64::STURHHi:
     201             :     return true;
     202             :   }
     203             : }
     204             : 
      205             : // Return the memory access size in bytes; it is the scaling factor for unscaled offsets.
     206       24840 : static int getMemScale(MachineInstr &MI) {
     207       24840 :   switch (MI.getOpcode()) {
     208           0 :   default:
     209           0 :     llvm_unreachable("Opcode has unknown scale!");
     210             :   case AArch64::LDRBBui:
     211             :   case AArch64::LDURBBi:
     212             :   case AArch64::LDRSBWui:
     213             :   case AArch64::LDURSBWi:
     214             :   case AArch64::STRBBui:
     215             :   case AArch64::STURBBi:
     216             :     return 1;
     217         313 :   case AArch64::LDRHHui:
     218             :   case AArch64::LDURHHi:
     219             :   case AArch64::LDRSHWui:
     220             :   case AArch64::LDURSHWi:
     221             :   case AArch64::STRHHui:
     222             :   case AArch64::STURHHi:
     223         313 :     return 2;
     224        3256 :   case AArch64::LDRSui:
     225             :   case AArch64::LDURSi:
     226             :   case AArch64::LDRSWui:
     227             :   case AArch64::LDURSWi:
     228             :   case AArch64::LDRWui:
     229             :   case AArch64::LDURWi:
     230             :   case AArch64::STRSui:
     231             :   case AArch64::STURSi:
     232             :   case AArch64::STRWui:
     233             :   case AArch64::STURWi:
     234             :   case AArch64::LDPSi:
     235             :   case AArch64::LDPSWi:
     236             :   case AArch64::LDPWi:
     237             :   case AArch64::STPSi:
     238             :   case AArch64::STPWi:
     239        3256 :     return 4;
     240       15142 :   case AArch64::LDRDui:
     241             :   case AArch64::LDURDi:
     242             :   case AArch64::LDRXui:
     243             :   case AArch64::LDURXi:
     244             :   case AArch64::STRDui:
     245             :   case AArch64::STURDi:
     246             :   case AArch64::STRXui:
     247             :   case AArch64::STURXi:
     248             :   case AArch64::LDPDi:
     249             :   case AArch64::LDPXi:
     250             :   case AArch64::STPDi:
     251             :   case AArch64::STPXi:
     252       15142 :     return 8;
     253        5608 :   case AArch64::LDRQui:
     254             :   case AArch64::LDURQi:
     255             :   case AArch64::STRQui:
     256             :   case AArch64::STURQi:
     257             :   case AArch64::LDPQi:
     258             :   case AArch64::STPQi:
     259        5608 :     return 16;
     260             :   }
     261             : }
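                      : // Illustrative note (not in the original source): the scale is the access
                      : // size in bytes, so an unscaled LDURWi at byte offset 8 corresponds to a
                      : // scaled LDRWui with immediate 8 / 4 == 2.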
     262             : 
     263       20142 : static unsigned getMatchingNonSExtOpcode(unsigned Opc,
     264             :                                          bool *IsValidLdStrOpc = nullptr) {
     265       20142 :   if (IsValidLdStrOpc)
     266       20136 :     *IsValidLdStrOpc = true;
     267       20142 :   switch (Opc) {
     268        7884 :   default:
     269        7884 :     if (IsValidLdStrOpc)
     270        7884 :       *IsValidLdStrOpc = false;
     271             :     return std::numeric_limits<unsigned>::max();
     272             :   case AArch64::STRDui:
     273             :   case AArch64::STURDi:
     274             :   case AArch64::STRQui:
     275             :   case AArch64::STURQi:
     276             :   case AArch64::STRBBui:
     277             :   case AArch64::STURBBi:
     278             :   case AArch64::STRHHui:
     279             :   case AArch64::STURHHi:
     280             :   case AArch64::STRWui:
     281             :   case AArch64::STURWi:
     282             :   case AArch64::STRXui:
     283             :   case AArch64::STURXi:
     284             :   case AArch64::LDRDui:
     285             :   case AArch64::LDURDi:
     286             :   case AArch64::LDRQui:
     287             :   case AArch64::LDURQi:
     288             :   case AArch64::LDRWui:
     289             :   case AArch64::LDURWi:
     290             :   case AArch64::LDRXui:
     291             :   case AArch64::LDURXi:
     292             :   case AArch64::STRSui:
     293             :   case AArch64::STURSi:
     294             :   case AArch64::LDRSui:
     295             :   case AArch64::LDURSi:
     296             :     return Opc;
     297          14 :   case AArch64::LDRSWui:
     298          14 :     return AArch64::LDRWui;
     299           9 :   case AArch64::LDURSWi:
     300           9 :     return AArch64::LDURWi;
     301             :   }
     302             : }
     303             : 
     304             : static unsigned getMatchingWideOpcode(unsigned Opc) {
     305          14 :   switch (Opc) {
     306           0 :   default:
     307           0 :     llvm_unreachable("Opcode has no wide equivalent!");
     308             :   case AArch64::STRBBui:
     309             :     return AArch64::STRHHui;
     310           0 :   case AArch64::STRHHui:
     311             :     return AArch64::STRWui;
     312           0 :   case AArch64::STURBBi:
     313             :     return AArch64::STURHHi;
     314           0 :   case AArch64::STURHHi:
     315             :     return AArch64::STURWi;
     316           2 :   case AArch64::STURWi:
     317             :     return AArch64::STURXi;
     318          12 :   case AArch64::STRWui:
     319             :     return AArch64::STRXui;
     320             :   }
     321             : }
     322             : 
     323        1110 : static unsigned getMatchingPairOpcode(unsigned Opc) {
     324        1110 :   switch (Opc) {
     325           0 :   default:
     326           0 :     llvm_unreachable("Opcode has no pairwise equivalent!");
     327             :   case AArch64::STRSui:
     328             :   case AArch64::STURSi:
     329             :     return AArch64::STPSi;
     330          63 :   case AArch64::STRDui:
     331             :   case AArch64::STURDi:
     332          63 :     return AArch64::STPDi;
     333         173 :   case AArch64::STRQui:
     334             :   case AArch64::STURQi:
     335         173 :     return AArch64::STPQi;
     336         110 :   case AArch64::STRWui:
     337             :   case AArch64::STURWi:
     338         110 :     return AArch64::STPWi;
     339         211 :   case AArch64::STRXui:
     340             :   case AArch64::STURXi:
     341         211 :     return AArch64::STPXi;
     342          57 :   case AArch64::LDRSui:
     343             :   case AArch64::LDURSi:
     344          57 :     return AArch64::LDPSi;
     345         153 :   case AArch64::LDRDui:
     346             :   case AArch64::LDURDi:
     347         153 :     return AArch64::LDPDi;
     348         126 :   case AArch64::LDRQui:
     349             :   case AArch64::LDURQi:
     350         126 :     return AArch64::LDPQi;
     351          42 :   case AArch64::LDRWui:
     352             :   case AArch64::LDURWi:
     353          42 :     return AArch64::LDPWi;
     354         147 :   case AArch64::LDRXui:
     355             :   case AArch64::LDURXi:
     356         147 :     return AArch64::LDPXi;
     357          10 :   case AArch64::LDRSWui:
     358             :   case AArch64::LDURSWi:
     359          10 :     return AArch64::LDPSWi;
     360             :   }
     361             : }
     362             : 
      363         411 : static bool isMatchingStore(MachineInstr &LoadInst,
     364             :                                 MachineInstr &StoreInst) {
     365             :   unsigned LdOpc = LoadInst.getOpcode();
     366             :   unsigned StOpc = StoreInst.getOpcode();
     367         411 :   switch (LdOpc) {
     368           0 :   default:
     369           0 :     llvm_unreachable("Unsupported load instruction!");
     370          18 :   case AArch64::LDRBBui:
     371          18 :     return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
     372          18 :            StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
     373          15 :   case AArch64::LDURBBi:
     374          15 :     return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
     375          15 :            StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
     376          15 :   case AArch64::LDRHHui:
     377          15 :     return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
     378          15 :            StOpc == AArch64::STRXui;
     379           7 :   case AArch64::LDURHHi:
     380           7 :     return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
     381           7 :            StOpc == AArch64::STURXi;
     382          69 :   case AArch64::LDRWui:
     383          69 :     return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
     384           4 :   case AArch64::LDURWi:
     385           4 :     return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
     386         281 :   case AArch64::LDRXui:
     387         281 :     return StOpc == AArch64::STRXui;
     388           2 :   case AArch64::LDURXi:
     389           2 :     return StOpc == AArch64::STURXi;
     390             :   }
     391             : }
     392             : 
     393          89 : static unsigned getPreIndexedOpcode(unsigned Opc) {
     394             :   // FIXME: We don't currently support creating pre-indexed loads/stores when
     395             :   // the load or store is the unscaled version.  If we decide to perform such an
     396             :   // optimization in the future the cases for the unscaled loads/stores will
     397             :   // need to be added here.
     398          89 :   switch (Opc) {
     399           0 :   default:
     400           0 :     llvm_unreachable("Opcode has no pre-indexed equivalent!");
     401             :   case AArch64::STRSui:
     402             :     return AArch64::STRSpre;
     403           6 :   case AArch64::STRDui:
     404           6 :     return AArch64::STRDpre;
     405           8 :   case AArch64::STRQui:
     406           8 :     return AArch64::STRQpre;
     407           2 :   case AArch64::STRBBui:
     408           2 :     return AArch64::STRBBpre;
     409           2 :   case AArch64::STRHHui:
     410           2 :     return AArch64::STRHHpre;
     411           8 :   case AArch64::STRWui:
     412           8 :     return AArch64::STRWpre;
     413           9 :   case AArch64::STRXui:
     414           9 :     return AArch64::STRXpre;
     415           6 :   case AArch64::LDRSui:
     416           6 :     return AArch64::LDRSpre;
     417           6 :   case AArch64::LDRDui:
     418           6 :     return AArch64::LDRDpre;
     419           6 :   case AArch64::LDRQui:
     420           6 :     return AArch64::LDRQpre;
     421           2 :   case AArch64::LDRBBui:
     422           2 :     return AArch64::LDRBBpre;
     423           2 :   case AArch64::LDRHHui:
     424           2 :     return AArch64::LDRHHpre;
     425           6 :   case AArch64::LDRWui:
     426           6 :     return AArch64::LDRWpre;
     427           6 :   case AArch64::LDRXui:
     428           6 :     return AArch64::LDRXpre;
     429           0 :   case AArch64::LDRSWui:
     430           0 :     return AArch64::LDRSWpre;
     431           0 :   case AArch64::LDPSi:
     432           0 :     return AArch64::LDPSpre;
     433           0 :   case AArch64::LDPSWi:
     434           0 :     return AArch64::LDPSWpre;
     435           0 :   case AArch64::LDPDi:
     436           0 :     return AArch64::LDPDpre;
     437           0 :   case AArch64::LDPQi:
     438           0 :     return AArch64::LDPQpre;
     439           2 :   case AArch64::LDPWi:
     440           2 :     return AArch64::LDPWpre;
     441           0 :   case AArch64::LDPXi:
     442           0 :     return AArch64::LDPXpre;
     443           0 :   case AArch64::STPSi:
     444           0 :     return AArch64::STPSpre;
     445           0 :   case AArch64::STPDi:
     446           0 :     return AArch64::STPDpre;
     447           1 :   case AArch64::STPQi:
     448           1 :     return AArch64::STPQpre;
     449           2 :   case AArch64::STPWi:
     450           2 :     return AArch64::STPWpre;
     451           9 :   case AArch64::STPXi:
     452           9 :     return AArch64::STPXpre;
     453             :   }
     454             : }
     455             : 
     456         109 : static unsigned getPostIndexedOpcode(unsigned Opc) {
     457         109 :   switch (Opc) {
     458           0 :   default:
      459           0 :     llvm_unreachable("Opcode has no post-indexed equivalent!");
     460             :   case AArch64::STRSui:
     461             :   case AArch64::STURSi:
     462             :     return AArch64::STRSpost;
     463           5 :   case AArch64::STRDui:
     464             :   case AArch64::STURDi:
     465           5 :     return AArch64::STRDpost;
     466          18 :   case AArch64::STRQui:
     467             :   case AArch64::STURQi:
     468          18 :     return AArch64::STRQpost;
     469           2 :   case AArch64::STRBBui:
     470           2 :     return AArch64::STRBBpost;
     471           2 :   case AArch64::STRHHui:
     472           2 :     return AArch64::STRHHpost;
     473          15 :   case AArch64::STRWui:
     474             :   case AArch64::STURWi:
     475          15 :     return AArch64::STRWpost;
     476           7 :   case AArch64::STRXui:
     477             :   case AArch64::STURXi:
     478           7 :     return AArch64::STRXpost;
     479           5 :   case AArch64::LDRSui:
     480             :   case AArch64::LDURSi:
     481           5 :     return AArch64::LDRSpost;
     482           6 :   case AArch64::LDRDui:
     483             :   case AArch64::LDURDi:
     484           6 :     return AArch64::LDRDpost;
     485           9 :   case AArch64::LDRQui:
     486             :   case AArch64::LDURQi:
     487           9 :     return AArch64::LDRQpost;
     488           2 :   case AArch64::LDRBBui:
     489           2 :     return AArch64::LDRBBpost;
     490           2 :   case AArch64::LDRHHui:
     491           2 :     return AArch64::LDRHHpost;
     492           5 :   case AArch64::LDRWui:
     493             :   case AArch64::LDURWi:
     494           5 :     return AArch64::LDRWpost;
     495           8 :   case AArch64::LDRXui:
     496             :   case AArch64::LDURXi:
     497           8 :     return AArch64::LDRXpost;
     498           0 :   case AArch64::LDRSWui:
     499           0 :     return AArch64::LDRSWpost;
     500           0 :   case AArch64::LDPSi:
     501           0 :     return AArch64::LDPSpost;
     502           1 :   case AArch64::LDPSWi:
     503           1 :     return AArch64::LDPSWpost;
     504           0 :   case AArch64::LDPDi:
     505           0 :     return AArch64::LDPDpost;
     506           0 :   case AArch64::LDPQi:
     507           0 :     return AArch64::LDPQpost;
     508           0 :   case AArch64::LDPWi:
     509           0 :     return AArch64::LDPWpost;
     510           2 :   case AArch64::LDPXi:
     511           2 :     return AArch64::LDPXpost;
     512           2 :   case AArch64::STPSi:
     513           2 :     return AArch64::STPSpost;
     514           4 :   case AArch64::STPDi:
     515           4 :     return AArch64::STPDpost;
     516           2 :   case AArch64::STPQi:
     517           2 :     return AArch64::STPQpost;
     518           2 :   case AArch64::STPWi:
     519           2 :     return AArch64::STPWpost;
     520           5 :   case AArch64::STPXi:
     521           5 :     return AArch64::STPXpost;
     522             :   }
     523             : }
     524             : 
     525       80446 : static bool isPairedLdSt(const MachineInstr &MI) {
     526       80446 :   switch (MI.getOpcode()) {
     527             :   default:
     528             :     return false;
     529       17221 :   case AArch64::LDPSi:
     530             :   case AArch64::LDPSWi:
     531             :   case AArch64::LDPDi:
     532             :   case AArch64::LDPQi:
     533             :   case AArch64::LDPWi:
     534             :   case AArch64::LDPXi:
     535             :   case AArch64::STPSi:
     536             :   case AArch64::STPDi:
     537             :   case AArch64::STPQi:
     538             :   case AArch64::STPWi:
     539             :   case AArch64::STPXi:
     540       17221 :     return true;
     541             :   }
     542             : }
     543             : 
     544             : static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
     545             :                                           unsigned PairedRegOp = 0) {
     546             :   assert(PairedRegOp < 2 && "Unexpected register operand idx.");
     547       22989 :   unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
     548             :   return MI.getOperand(Idx);
     549             : }
     550             : 
     551             : static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
     552       34750 :   unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
     553             :   return MI.getOperand(Idx);
     554             : }
     555             : 
     556             : static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
     557       59772 :   unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
     558             :   return MI.getOperand(Idx);
     559             : }
     560             : 
     561          96 : static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
     562             :                                   MachineInstr &StoreInst,
     563             :                                   const AArch64InstrInfo *TII) {
     564             :   assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
     565          96 :   int LoadSize = getMemScale(LoadInst);
     566          96 :   int StoreSize = getMemScale(StoreInst);
     567             :   int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
     568             :                              ? getLdStOffsetOp(StoreInst).getImm()
     569         192 :                              : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
     570             :   int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
     571             :                              ? getLdStOffsetOp(LoadInst).getImm()
     572         192 :                              : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
     573         187 :   return (UnscaledStOffset <= UnscaledLdOffset) &&
     574         187 :          (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
     575             : }
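                      : // Worked example: a 2-byte load at byte offset 4 lies entirely within an
                      : // 8-byte store at byte offset 0, since 0 <= 4 and 4 + 2 <= 0 + 8.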
     576             : 
     577             : static bool isPromotableZeroStoreInst(MachineInstr &MI) {
     578             :   unsigned Opc = MI.getOpcode();
     579       70294 :   return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
     580       70294 :           isNarrowStore(Opc)) &&
     581             :          getLdStRegOp(MI).getReg() == AArch64::WZR;
     582             : }
     583             : 
     584       65374 : static bool isPromotableLoadFromStore(MachineInstr &MI) {
     585       65374 :   switch (MI.getOpcode()) {
     586             :   default:
     587             :     return false;
     588             :   // Scaled instructions.
     589        2205 :   case AArch64::LDRBBui:
     590             :   case AArch64::LDRHHui:
     591             :   case AArch64::LDRWui:
     592             :   case AArch64::LDRXui:
     593             :   // Unscaled instructions.
     594             :   case AArch64::LDURBBi:
     595             :   case AArch64::LDURHHi:
     596             :   case AArch64::LDURWi:
     597             :   case AArch64::LDURXi:
     598        2205 :     return true;
     599             :   }
     600             : }
     601             : 
     602       64372 : static bool isMergeableLdStUpdate(MachineInstr &MI) {
     603             :   unsigned Opc = MI.getOpcode();
     604       64372 :   switch (Opc) {
     605             :   default:
     606             :     return false;
     607             :   // Scaled instructions.
     608             :   case AArch64::STRSui:
     609             :   case AArch64::STRDui:
     610             :   case AArch64::STRQui:
     611             :   case AArch64::STRXui:
     612             :   case AArch64::STRWui:
     613             :   case AArch64::STRHHui:
     614             :   case AArch64::STRBBui:
     615             :   case AArch64::LDRSui:
     616             :   case AArch64::LDRDui:
     617             :   case AArch64::LDRQui:
     618             :   case AArch64::LDRXui:
     619             :   case AArch64::LDRWui:
     620             :   case AArch64::LDRHHui:
     621             :   case AArch64::LDRBBui:
     622             :   // Unscaled instructions.
     623             :   case AArch64::STURSi:
     624             :   case AArch64::STURDi:
     625             :   case AArch64::STURQi:
     626             :   case AArch64::STURWi:
     627             :   case AArch64::STURXi:
     628             :   case AArch64::LDURSi:
     629             :   case AArch64::LDURDi:
     630             :   case AArch64::LDURQi:
     631             :   case AArch64::LDURWi:
     632             :   case AArch64::LDURXi:
     633             :   // Paired instructions.
     634             :   case AArch64::LDPSi:
     635             :   case AArch64::LDPSWi:
     636             :   case AArch64::LDPDi:
     637             :   case AArch64::LDPQi:
     638             :   case AArch64::LDPWi:
     639             :   case AArch64::LDPXi:
     640             :   case AArch64::STPSi:
     641             :   case AArch64::STPDi:
     642             :   case AArch64::STPQi:
     643             :   case AArch64::STPWi:
     644             :   case AArch64::STPXi:
     645             :     // Make sure this is a reg+imm (as opposed to an address reloc).
     646        9726 :     if (!getLdStOffsetOp(MI).isImm())
     647             :       return false;
     648             : 
     649        8201 :     return true;
     650             :   }
     651             : }
     652             : 
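                      : // A minimal sketch of the transformation below (editor's illustration, not
                      : // taken from the source):
                      : //   strh wzr, [x0]       ; STRHHui, scaled offset 0
                      : //   strh wzr, [x0, #2]   ; STRHHui, scaled offset 1, adjacent
                      : // merge into the single wider zero store
                      : //   str  wzr, [x0]       ; STRWui, lower offset rescaled: 0 / 2 == 0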
     653             : MachineBasicBlock::iterator
     654          14 : AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
     655             :                                            MachineBasicBlock::iterator MergeMI,
     656             :                                            const LdStPairFlags &Flags) {
     657             :   assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
     658             :          "Expected promotable zero stores.");
     659             : 
     660          14 :   MachineBasicBlock::iterator NextI = I;
     661             :   ++NextI;
     662             :   // If NextI is the second of the two instructions to be merged, we need
      663             :   // to skip one further. Either way, the merge will invalidate the iterator,
     664             :   // and we don't need to scan the new instruction, as it's a pairwise
     665             :   // instruction, which we're not considering for further action anyway.
     666          14 :   if (NextI == MergeMI)
     667             :     ++NextI;
     668             : 
     669             :   unsigned Opc = I->getOpcode();
     670          14 :   bool IsScaled = !TII->isUnscaledLdSt(Opc);
     671          14 :   int OffsetStride = IsScaled ? 1 : getMemScale(*I);
     672             : 
     673             :   bool MergeForward = Flags.getMergeForward();
     674             :   // Insert our new paired instruction after whichever of the paired
     675             :   // instructions MergeForward indicates.
     676          14 :   MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
      677             :   // MergeForward also determines from which instruction we copy the base
      678             :   // register operand, so that the flags stay compatible with the input code.
     679             :   const MachineOperand &BaseRegOp =
     680          14 :       MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
     681             : 
     682             :   // Which register is Rt and which is Rt2 depends on the offset order.
     683             :   MachineInstr *RtMI;
     684          14 :   if (getLdStOffsetOp(*I).getImm() ==
     685          14 :       getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
     686             :     RtMI = &*MergeMI;
     687             :   else
     688             :     RtMI = &*I;
     689             : 
     690          14 :   int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
      691             :   // Rescale the offset, since the wider store uses a scale twice as large.
     692          14 :   if (IsScaled) {
     693             :     assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
     694          12 :     OffsetImm /= 2;
     695             :   }
     696             : 
     697             :   // Construct the new instruction.
     698             :   DebugLoc DL = I->getDebugLoc();
     699             :   MachineBasicBlock *MBB = I->getParent();
     700             :   MachineInstrBuilder MIB;
     701          42 :   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
     702          14 :             .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
     703             :             .add(BaseRegOp)
     704          14 :             .addImm(OffsetImm)
     705          14 :             .setMemRefs(I->mergeMemRefsWith(*MergeMI));
     706             :   (void)MIB;
     707             : 
     708             :   DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
     709             :   DEBUG(I->print(dbgs()));
     710             :   DEBUG(dbgs() << "    ");
     711             :   DEBUG(MergeMI->print(dbgs()));
     712             :   DEBUG(dbgs() << "  with instruction:\n    ");
     713             :   DEBUG(((MachineInstr *)MIB)->print(dbgs()));
     714             :   DEBUG(dbgs() << "\n");
     715             : 
     716             :   // Erase the old instructions.
     717          14 :   I->eraseFromParent();
     718          14 :   MergeMI->eraseFromParent();
     719          28 :   return NextI;
     720             : }
     721             : 
     722             : MachineBasicBlock::iterator
     723         852 : AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
     724             :                                       MachineBasicBlock::iterator Paired,
     725             :                                       const LdStPairFlags &Flags) {
     726         852 :   MachineBasicBlock::iterator NextI = I;
     727             :   ++NextI;
     728             :   // If NextI is the second of the two instructions to be merged, we need
      729             :   // to skip one further. Either way, the merge will invalidate the iterator,
     730             :   // and we don't need to scan the new instruction, as it's a pairwise
     731             :   // instruction, which we're not considering for further action anyway.
     732         852 :   if (NextI == Paired)
     733             :     ++NextI;
     734             : 
     735             :   int SExtIdx = Flags.getSExtIdx();
     736             :   unsigned Opc =
     737         858 :       SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
     738         852 :   bool IsUnscaled = TII->isUnscaledLdSt(Opc);
     739         891 :   int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;
     740             : 
     741             :   bool MergeForward = Flags.getMergeForward();
     742             :   // Insert our new paired instruction after whichever of the paired
     743             :   // instructions MergeForward indicates.
     744         852 :   MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
      745             :   // MergeForward also determines from which instruction we copy the base
      746             :   // register operand, so that the flags stay compatible with the input code.
     747             :   const MachineOperand &BaseRegOp =
     748         852 :       MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
     749             : 
     750         852 :   int Offset = getLdStOffsetOp(*I).getImm();
     751         852 :   int PairedOffset = getLdStOffsetOp(*Paired).getImm();
     752         852 :   bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
     753         852 :   if (IsUnscaled != PairedIsUnscaled) {
     754             :     // We're trying to pair instructions that differ in how they are scaled.  If
     755             :     // I is scaled then scale the offset of Paired accordingly.  Otherwise, do
     756             :     // the opposite (i.e., make Paired's offset unscaled).
     757           9 :     int MemSize = getMemScale(*Paired);
     758           9 :     if (PairedIsUnscaled) {
     759             :       // If the unscaled offset isn't a multiple of the MemSize, we can't
     760             :       // pair the operations together.
     761             :       assert(!(PairedOffset % getMemScale(*Paired)) &&
     762             :              "Offset should be a multiple of the stride!");
     763           5 :       PairedOffset /= MemSize;
     764             :     } else {
     765           4 :       PairedOffset *= MemSize;
     766             :     }
     767             :   }
     768             : 
     769             :   // Which register is Rt and which is Rt2 depends on the offset order.
     770             :   MachineInstr *RtMI, *Rt2MI;
     771         852 :   if (Offset == PairedOffset + OffsetStride) {
     772             :     RtMI = &*Paired;
     773             :     Rt2MI = &*I;
     774             :     // Here we swapped the assumption made for SExtIdx.
     775             :     // I.e., we turn ldp I, Paired into ldp Paired, I.
     776             :     // Update the index accordingly.
     777         251 :     if (SExtIdx != -1)
     778           0 :       SExtIdx = (SExtIdx + 1) % 2;
     779             :   } else {
     780             :     RtMI = &*I;
     781             :     Rt2MI = &*Paired;
     782             :   }
     783         852 :   int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
     784             :   // Scale the immediate offset, if necessary.
     785         852 :   if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
     786             :     assert(!(OffsetImm % getMemScale(*RtMI)) &&
     787             :            "Unscaled offset cannot be scaled.");
     788          44 :     OffsetImm /= getMemScale(*RtMI);
     789             :   }
     790             : 
     791             :   // Construct the new instruction.
     792             :   MachineInstrBuilder MIB;
     793             :   DebugLoc DL = I->getDebugLoc();
     794             :   MachineBasicBlock *MBB = I->getParent();
     795         852 :   MachineOperand RegOp0 = getLdStRegOp(*RtMI);
     796         852 :   MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
     797             :   // Kill flags may become invalid when moving stores for pairing.
     798         852 :   if (RegOp0.isUse()) {
     799         418 :     if (!MergeForward) {
     800             :       // Clear kill flags on store if moving upwards. Example:
     801             :       //   STRWui %w0, ...
     802             :       //   USE %w1
     803             :       //   STRWui kill %w1  ; need to clear kill flag when moving STRWui upwards
     804             :       RegOp0.setIsKill(false);
     805             :       RegOp1.setIsKill(false);
     806             :     } else {
     807             :       // Clear kill flags of the first stores register. Example:
     808             :       //   STRWui %w1, ...
     809             :       //   USE kill %w1   ; need to clear kill flag when moving STRWui downwards
     810             :       //   STRW %w0
     811             :       unsigned Reg = getLdStRegOp(*I).getReg();
     812         146 :       for (MachineInstr &MI : make_range(std::next(I), Paired))
     813          76 :         MI.clearRegisterKills(Reg, TRI);
     814             :     }
     815             :   }
     816        1704 :   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
     817             :             .add(RegOp0)
     818             :             .add(RegOp1)
     819             :             .add(BaseRegOp)
     820         852 :             .addImm(OffsetImm)
     821         852 :             .setMemRefs(I->mergeMemRefsWith(*Paired));
     822             : 
     823             :   (void)MIB;
     824             : 
     825             :   DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
     826             :   DEBUG(I->print(dbgs()));
     827             :   DEBUG(dbgs() << "    ");
     828             :   DEBUG(Paired->print(dbgs()));
     829             :   DEBUG(dbgs() << "  with instruction:\n    ");
     830         852 :   if (SExtIdx != -1) {
     831             :     // Generate the sign extension for the proper result of the ldp.
     832             :     // I.e., with X1, that would be:
     833             :     // %w1 = KILL %w1, implicit-def %x1
     834             :     // %x1 = SBFMXri killed %x1, 0, 31
     835           6 :     MachineOperand &DstMO = MIB->getOperand(SExtIdx);
     836             :     // Right now, DstMO has the extended register, since it comes from an
     837             :     // extended opcode.
     838             :     unsigned DstRegX = DstMO.getReg();
     839             :     // Get the W variant of that register.
     840           6 :     unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
     841             :     // Update the result of LDP to use the W instead of the X variant.
     842           6 :     DstMO.setReg(DstRegW);
     843             :     DEBUG(((MachineInstr *)MIB)->print(dbgs()));
     844             :     DEBUG(dbgs() << "\n");
     845             :     // Make the machine verifier happy by providing a definition for
     846             :     // the X register.
     847             :     // Insert this definition right after the generated LDP, i.e., before
     848             :     // InsertionPoint.
     849             :     MachineInstrBuilder MIBKill =
     850          18 :         BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
     851           6 :             .addReg(DstRegW)
     852           6 :             .addReg(DstRegX, RegState::Define);
     853             :     MIBKill->getOperand(2).setImplicit();
     854             :     // Create the sign extension.
     855             :     MachineInstrBuilder MIBSXTW =
     856          18 :         BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
     857           6 :             .addReg(DstRegX)
     858             :             .addImm(0)
     859             :             .addImm(31);
     860             :     (void)MIBSXTW;
     861             :     DEBUG(dbgs() << "  Extend operand:\n    ");
     862             :     DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
     863             :   } else {
     864             :     DEBUG(((MachineInstr *)MIB)->print(dbgs()));
     865             :   }
     866             :   DEBUG(dbgs() << "\n");
     867             : 
     868             :   // Erase the old instructions.
     869         852 :   I->eraseFromParent();
     870         852 :   Paired->eraseFromParent();
     871             : 
     872        1704 :   return NextI;
     873             : }
     874             : 
     875             : MachineBasicBlock::iterator
     876          62 : AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
     877             :                                           MachineBasicBlock::iterator StoreI) {
     878          62 :   MachineBasicBlock::iterator NextI = LoadI;
     879             :   ++NextI;
     880             : 
     881          62 :   int LoadSize = getMemScale(*LoadI);
     882          62 :   int StoreSize = getMemScale(*StoreI);
     883             :   unsigned LdRt = getLdStRegOp(*LoadI).getReg();
     884             :   const MachineOperand &StMO = getLdStRegOp(*StoreI);
     885             :   unsigned StRt = getLdStRegOp(*StoreI).getReg();
     886          62 :   bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
     887             : 
     888             :   assert((IsStoreXReg ||
     889             :           TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
     890             :          "Unexpected RegClass");
     891             : 
     892             :   MachineInstr *BitExtMI;
     893          62 :   if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
      894             :     // Remove the load entirely if its destination register is the same as
      895             :     // the register holding the stored value.
     896          12 :     if (StRt == LdRt && LoadSize == 8) {
     897             :       for (MachineInstr &MI : make_range(StoreI->getIterator(),
     898           2 :                                          LoadI->getIterator())) {
     899           6 :         if (MI.killsRegister(StRt, TRI)) {
     900           2 :           MI.clearRegisterKills(StRt, TRI);
     901           2 :           break;
     902             :         }
     903             :       }
     904             :       DEBUG(dbgs() << "Remove load instruction:\n    ");
     905             :       DEBUG(LoadI->print(dbgs()));
     906             :       DEBUG(dbgs() << "\n");
     907           2 :       LoadI->eraseFromParent();
     908           2 :       return NextI;
     909             :     }
      910             :     // Replace the load with a mov if the load and store are the same size.
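                      :     // Illustrative example: "str w1, [x0]; ldr w2, [x0]" becomes
                      :     // "str w1, [x0]; mov w2, w1", with the mov emitted as an ORRWrs of
                      :     // WZR and the stored register.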
     911             :     BitExtMI =
     912          20 :         BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
     913          10 :                 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
     914          10 :             .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
     915             :             .add(StMO)
     916          10 :             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
     917             :   } else {
     918             :     // FIXME: Currently we disable this transformation in big-endian targets as
     919             :     // performance and correctness are verified only in little-endian.
     920          50 :     if (!Subtarget->isLittleEndian())
     921           0 :       return NextI;
     922             :     bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
     923             :     assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
     924             :            "Unsupported ld/st match");
     925             :     assert(LoadSize <= StoreSize && "Invalid load size");
     926             :     int UnscaledLdOffset = IsUnscaled
     927             :                                ? getLdStOffsetOp(*LoadI).getImm()
     928         100 :                                : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
     929             :     int UnscaledStOffset = IsUnscaled
     930             :                                ? getLdStOffsetOp(*StoreI).getImm()
     931         100 :                                : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
     932          50 :     int Width = LoadSize * 8;
     933          50 :     int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
     934          50 :     int Imms = Immr + Width - 1;
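                      :     // Worked example (editor's illustration): a 4-byte load at byte offset 4
                      :     // out of an 8-byte store gives Width = 32, Immr = 32 and Imms = 63, i.e.
                      :     // a UBFMXri that extracts bits [63:32] of the stored value.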
     935             :     unsigned DestReg = IsStoreXReg
     936          50 :                            ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
     937             :                                                       &AArch64::GPR64RegClass)
     938             :                            : LdRt;
     939             : 
     940             :     assert((UnscaledLdOffset >= UnscaledStOffset &&
     941             :             (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
     942             :            "Invalid offset");
     943             : 
     946          50 :     if (UnscaledLdOffset == UnscaledStOffset) {
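                      :       // When the load reads from the very start of the stored value, keeping
                      :       // the low Width bits with an AND suffices. The value built below packs
                      :       // the N:immr:imms fields of the logical-immediate encoding; with
                      :       // Immr == 0 and Imms == Width - 1 the mask is Width low ones (e.g.
                      :       // 0xFFFFFFFF for a 4-byte load).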
     947          32 :       uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
     948          16 :                                 | ((Immr) << 6)               // immr
     949          16 :                                 | ((Imms) << 0)               // imms
     950             :           ;
     951             : 
     952             :       BitExtMI =
     953          16 :           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
     954          16 :                   TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
     955          16 :                   DestReg)
     956             :               .add(StMO)
     957          16 :               .addImm(AndMaskEncoded);
     958             :     } else {
     959             :       BitExtMI =
     960          34 :           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
     961          34 :                   TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
     962          34 :                   DestReg)
     963             :               .add(StMO)
     964          34 :               .addImm(Immr)
     965          34 :               .addImm(Imms);
     966             :     }
     967             :   }
     968             : 
     969             :   // Clear kill flags between store and load.
     970             :   for (MachineInstr &MI : make_range(StoreI->getIterator(),
     971          60 :                                      BitExtMI->getIterator()))
     972         124 :     if (MI.killsRegister(StRt, TRI)) {
     973          58 :       MI.clearRegisterKills(StRt, TRI);
     974          58 :       break;
     975             :     }
     976             : 
     977             :   DEBUG(dbgs() << "Promoting load by replacing :\n    ");
     978             :   DEBUG(StoreI->print(dbgs()));
     979             :   DEBUG(dbgs() << "    ");
     980             :   DEBUG(LoadI->print(dbgs()));
     981             :   DEBUG(dbgs() << "  with instructions:\n    ");
     982             :   DEBUG(StoreI->print(dbgs()));
     983             :   DEBUG(dbgs() << "    ");
     984             :   DEBUG((BitExtMI)->print(dbgs()));
     985             :   DEBUG(dbgs() << "\n");
     986             : 
     987             :   // Erase the old instructions.
     988          60 :   LoadI->eraseFromParent();
     989          60 :   return NextI;
     990             : }
     991             : 
     992             : /// trackRegDefsUses - Remember what registers the specified instruction uses
     993             : /// and modifies.
     994       49476 : static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs,
     995             :                              BitVector &UsedRegs,
     996             :                              const TargetRegisterInfo *TRI) {
     997      382664 :   for (const MachineOperand &MO : MI.operands()) {
     998      166594 :     if (MO.isRegMask())
     999             :       ModifiedRegs.setBitsNotInMask(MO.getRegMask());
    1000             : 
    1001      166594 :     if (!MO.isReg())
    1002       43150 :       continue;
    1003             :     unsigned Reg = MO.getReg();
    1004      123444 :     if (!Reg)
    1005          20 :       continue;
    1006      123424 :     if (MO.isDef()) {
    1007             :       // WZR/XZR are not modified even when used as a destination register.
    1008       45489 :       if (Reg != AArch64::WZR && Reg != AArch64::XZR)
    1009      761429 :         for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
    1010             :           ModifiedRegs.set(*AI);
    1011             :     } else {
    1012             :       assert(MO.isUse() && "Reg operand not a def and not a use?!?");
    1013     1160397 :       for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
    1014             :         UsedRegs.set(*AI);
    1015             :     }
    1016             :   }
    1017       49476 : }
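                     : // For example (illustrative): a BL carries a register-mask operand, so
                     : // setBitsNotInMask marks every register not preserved by the callee as
                     : // modified, while "add x0, x1, x2" sets x0 (and aliases such as w0) in
                     : // ModifiedRegs and x1/x2 (and their aliases) in UsedRegs.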
    1018             : 
    1019             : static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
     1020             :   // Convert the byte offset used by unscaled load/store instructions into
     1021             :   // the "element" offset used by the scaled pair load/store instructions.
    1022        6699 :   if (IsUnscaled) {
    1023             :     // If the byte-offset isn't a multiple of the stride, there's no point
    1024             :     // trying to match it.
    1025         275 :     if (Offset % OffsetStride)
    1026             :       return false;
    1027         197 :     Offset /= OffsetStride;
    1028             :   }
    1029        6621 :   return Offset <= 63 && Offset >= -64;
    1030             : }
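                     : // For example (illustrative): two LDURXi accesses (stride 8) at byte offset
                     : // 24 give element offset 24 / 8 = 3, inside the signed 7-bit range [-64, 63];
                     : // a byte offset of 20 is rejected outright because 20 % 8 != 0.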
    1031             : 
    1032             : // Do alignment, specialized to power of 2 and for signed ints,
     1033             : // avoiding having to do a C-style cast from uint64_t to int when
    1034             : // using alignTo from include/llvm/Support/MathExtras.h.
    1035             : // FIXME: Move this function to include/MathExtras.h?
    1036             : static int alignTo(int Num, int PowOf2) {
    1037          89 :   return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
    1038             : }
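                     : // For example, alignTo(5, 4) == (5 + 3) & ~3 == 8 and alignTo(-6, 4) == -4,
                     : // i.e. Num is rounded up to the next multiple of PowOf2.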
    1039             : 
    1040         464 : static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
    1041             :                      AliasAnalysis *AA) {
    1042             :   // One of the instructions must modify memory.
    1043         464 :   if (!MIa.mayStore() && !MIb.mayStore())
    1044             :     return false;
    1045             : 
    1046             :   // Both instructions must be memory operations.
    1047         436 :   if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
    1048             :     return false;
    1049             : 
    1050         436 :   return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
    1051             : }
    1052             : 
    1053             : static bool mayAlias(MachineInstr &MIa,
    1054             :                      SmallVectorImpl<MachineInstr *> &MemInsns,
    1055             :                      AliasAnalysis *AA) {
    1056        1117 :   for (MachineInstr *MIb : MemInsns)
    1057         141 :     if (mayAlias(MIa, *MIb, AA))
    1058             :       return true;
    1059             : 
    1060             :   return false;
    1061             : }
    1062             : 
    1063        1262 : bool AArch64LoadStoreOpt::findMatchingStore(
    1064             :     MachineBasicBlock::iterator I, unsigned Limit,
    1065             :     MachineBasicBlock::iterator &StoreI) {
    1066             :   MachineBasicBlock::iterator B = I->getParent()->begin();
    1067        1262 :   MachineBasicBlock::iterator MBBI = I;
    1068             :   MachineInstr &LoadMI = *I;
    1069             :   unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();
    1070             : 
    1071             :   // If the load is the first instruction in the block, there's obviously
    1072             :   // not any matching store.
    1073        1262 :   if (MBBI == B)
    1074             :     return false;
    1075             : 
    1076             :   // Track which registers have been modified and used between the first insn
    1077             :   // and the second insn.
    1078             :   ModifiedRegs.reset();
    1079             :   UsedRegs.reset();
    1080             : 
    1081             :   unsigned Count = 0;
    1082             :   do {
    1083             :     --MBBI;
    1084             :     MachineInstr &MI = *MBBI;
    1085             : 
    1086             :     // Don't count transient instructions towards the search limit since there
    1087             :     // may be different numbers of them if e.g. debug information is present.
    1088             :     if (!MI.isTransient())
    1089        3021 :       ++Count;
    1090             : 
    1091             :     // If the load instruction reads directly from the address to which the
    1092             :     // store instruction writes and the stored value is not modified, we can
    1093             :     // promote the load. Since we do not handle stores with pre-/post-index,
    1094             :     // it's unnecessary to check if BaseReg is modified by the store itself.
    1095        4008 :     if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
    1096          96 :         BaseReg == getLdStBaseOp(MI).getReg() &&
    1097        3477 :         isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
    1098             :         !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
    1099          62 :       StoreI = MBBI;
    1100          62 :       return true;
    1101             :     }
    1102             : 
    1103        3246 :     if (MI.isCall())
    1104             :       return false;
    1105             : 
    1106             :     // Update modified / uses register lists.
    1107        3015 :     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1108             : 
    1109             :     // Otherwise, if the base register is modified, we have no match, so
    1110             :     // return early.
    1111        3015 :     if (ModifiedRegs[BaseReg])
    1112             :       return false;
    1113             : 
    1114             :     // If we encounter a store aliased with the load, return early.
    1115        2677 :     if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
    1116             :       return false;
    1117        2529 :   } while (MBBI != B && Count < Limit);
    1118             :   return false;
    1119             : }
    1120             : 
    1121             : // Returns true if FirstMI and MI are candidates for merging or pairing.
    1122             : // Otherwise, returns false.
    1123       16647 : static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
    1124             :                                        LdStPairFlags &Flags,
    1125             :                                        const AArch64InstrInfo *TII) {
    1126             :   // If this is volatile or if pairing is suppressed, not a candidate.
    1127       16647 :   if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    1128             :     return false;
    1129             : 
    1130             :   // We should have already checked FirstMI for pair suppression and volatility.
    1131             :   assert(!FirstMI.hasOrderedMemoryRef() &&
    1132             :          !TII->isLdStPairSuppressed(FirstMI) &&
     1133             :          "FirstMI shouldn't get here if either of these checks is true.");
    1134             : 
    1135             :   unsigned OpcA = FirstMI.getOpcode();
    1136             :   unsigned OpcB = MI.getOpcode();
    1137             : 
    1138             :   // Opcodes match: nothing more to check.
    1139       12629 :   if (OpcA == OpcB)
    1140             :     return true;
    1141             : 
    1142             :   // Try to match a sign-extended load/store with a zero-extended load/store.
    1143             :   bool IsValidLdStrOpc, PairIsValidLdStrOpc;
    1144       10068 :   unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
    1145             :   assert(IsValidLdStrOpc &&
    1146             :          "Given Opc should be a Load or Store with an immediate");
    1147             :   // OpcA will be the first instruction in the pair.
    1148       10068 :   if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
    1149           6 :     Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
    1150             :     return true;
    1151             :   }
    1152             : 
    1153             :   // If the second instruction isn't even a mergable/pairable load/store, bail
    1154             :   // out.
    1155       10062 :   if (!PairIsValidLdStrOpc)
    1156             :     return false;
    1157             : 
    1158             :   // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
    1159             :   // offsets.
    1160             :   if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
    1161             :     return false;
    1162             : 
    1163             :   // Try to match an unscaled load/store with a scaled load/store.
    1164        2249 :   return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
    1165         129 :          getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
    1166             : 
    1167             :   // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
    1168             : }
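                     : // For example (illustrative): LDRXui and LDURXi differ in opcode but share
                     : // the pair opcode LDPXi, so a scaled and an unscaled 64-bit load may still
                     : // pair; an LDRSWui/LDRWui pair instead takes the sign-extend path above,
                     : // with setSExtIdx recording which operand needs the extension.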
    1169             : 
    1170             : /// Scan the instructions looking for a load/store that can be combined with the
    1171             : /// current instruction into a wider equivalent or a load/store pair.
    1172             : MachineBasicBlock::iterator
    1173        5659 : AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
    1174             :                                       LdStPairFlags &Flags, unsigned Limit,
    1175             :                                       bool FindNarrowMerge) {
    1176             :   MachineBasicBlock::iterator E = I->getParent()->end();
    1177        5659 :   MachineBasicBlock::iterator MBBI = I;
    1178             :   MachineInstr &FirstMI = *I;
    1179             :   ++MBBI;
    1180             : 
    1181        5659 :   bool MayLoad = FirstMI.mayLoad();
    1182             :   bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
    1183             :   unsigned Reg = getLdStRegOp(FirstMI).getReg();
    1184             :   unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
    1185        5659 :   int Offset = getLdStOffsetOp(FirstMI).getImm();
    1186        5659 :   int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
    1187             :   bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
    1188             : 
    1189             :   // Track which registers have been modified and used between the first insn
    1190             :   // (inclusive) and the second insn.
    1191             :   ModifiedRegs.reset();
    1192             :   UsedRegs.reset();
    1193             : 
    1194             :   // Remember any instructions that read/write memory between FirstMI and MI.
    1195             :   SmallVector<MachineInstr *, 4> MemInsns;
    1196             : 
    1197       19775 :   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    1198             :     MachineInstr &MI = *MBBI;
    1199             : 
    1200             :     // Don't count transient instructions towards the search limit since there
    1201             :     // may be different numbers of them if e.g. debug information is present.
    1202             :     if (!MI.isTransient())
    1203       15877 :       ++Count;
    1204             : 
    1205             :     Flags.setSExtIdx(-1);
    1206       19262 :     if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
    1207             :         getLdStOffsetOp(MI).isImm()) {
    1208             :       assert(MI.mayLoadOrStore() && "Expected memory operation.");
    1209             :       // If we've found another instruction with the same opcode, check to see
    1210             :       // if the base and offset are compatible with our starting instruction.
    1211             :       // These instructions all have scaled immediate operands, so we just
    1212             :       // check for +1/-1. Make sure to check the new instruction offset is
    1213             :       // actually an immediate and not a symbolic reference destined for
    1214             :       // a relocation.
    1215             :       unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
    1216        2607 :       int MIOffset = getLdStOffsetOp(MI).getImm();
    1217             :       bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
    1218        2607 :       if (IsUnscaled != MIIsUnscaled) {
    1219             :         // We're trying to pair instructions that differ in how they are scaled.
    1220             :         // If FirstMI is scaled then scale the offset of MI accordingly.
    1221             :         // Otherwise, do the opposite (i.e., make MI's offset unscaled).
    1222          48 :         int MemSize = getMemScale(MI);
    1223          48 :         if (MIIsUnscaled) {
    1224             :           // If the unscaled offset isn't a multiple of the MemSize, we can't
    1225             :           // pair the operations together: bail and keep looking.
    1226          11 :           if (MIOffset % MemSize) {
    1227           2 :             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1228           2 :             MemInsns.push_back(&MI);
    1229           2 :             continue;
    1230             :           }
    1231           7 :           MIOffset /= MemSize;
    1232             :         } else {
    1233          39 :           MIOffset *= MemSize;
    1234             :         }
    1235             :       }
    1236             : 
    1237        3936 :       if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
    1238        1331 :                                    (Offset + OffsetStride == MIOffset))) {
    1239        1056 :         int MinOffset = Offset < MIOffset ? Offset : MIOffset;
    1240        1056 :         if (FindNarrowMerge) {
    1241             :           // If the alignment requirements of the scaled wide load/store
    1242             :           // instruction can't express the offset of the scaled narrow input,
    1243             :           // bail and keep looking. For promotable zero stores, allow only when
    1244             :           // the stored value is the same (i.e., WZR).
    1245          77 :           if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
    1246          30 :               (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
    1247          17 :             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1248          17 :             MemInsns.push_back(&MI);
    1249          17 :             continue;
    1250             :           }
    1251             :         } else {
    1252             :           // Pairwise instructions have a 7-bit signed offset field. Single
    1253             :           // insns have a 12-bit unsigned offset field.  If the resultant
    1254             :           // immediate offset of merging these instructions is out of range for
    1255             :           // a pairwise instruction, bail and keep looking.
    1256        1025 :           if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
    1257           0 :             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1258           0 :             MemInsns.push_back(&MI);
    1259           0 :             continue;
    1260             :           }
    1261             :           // If the alignment requirements of the paired (scaled) instruction
    1262             :           // can't express the offset of the unscaled input, bail and keep
    1263             :           // looking.
    1264        1085 :           if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
    1265           0 :             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1266           0 :             MemInsns.push_back(&MI);
    1267           0 :             continue;
    1268             :           }
    1269             :         }
    1270             :         // If the destination register of the loads is the same register, bail
    1271             :         // and keep looking. A load-pair instruction with both destination
    1272             :         // registers the same is UNPREDICTABLE and will result in an exception.
    1273        1075 :         if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
    1274          36 :           trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1275          36 :           MemInsns.push_back(&MI);
    1276          36 :           continue;
    1277             :         }
    1278             : 
    1279             :         // If the Rt of the second instruction was not modified or used between
    1280             :         // the two instructions and none of the instructions between the second
    1281             :         // and first alias with the second, we can combine the second into the
    1282             :         // first.
    1283         876 :         if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
    1284        3190 :             !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
    1285         844 :             !mayAlias(MI, MemInsns, AA)) {
    1286             :           Flags.setMergeForward(false);
    1287         827 :           return MBBI;
    1288             :         }
    1289             : 
    1290             :         // Likewise, if the Rt of the first instruction is not modified or used
    1291             :         // between the two instructions and none of the instructions between the
    1292             :         // first and the second alias with the first, we can combine the first
    1293             :         // into the second.
    1294          86 :         if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
    1295         266 :             !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
    1296          53 :             !mayAlias(FirstMI, MemInsns, AA)) {
    1297             :           Flags.setMergeForward(true);
    1298          39 :           return MBBI;
    1299             :         }
    1300             :         // Unable to combine these instructions due to interference in between.
    1301             :         // Keep looking.
    1302             :       }
    1303             :     }
    1304             : 
     1305             :     // The instruction wasn't a matching load or store. Stop searching if we
     1306             :     // encounter a call instruction that might modify memory.
    1307       15726 :     if (MI.isCall())
    1308         865 :       return E;
    1309             : 
    1310             :     // Update modified / uses register lists.
    1311       14861 :     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1312             : 
    1313             :     // Otherwise, if the base register is modified, we have no match, so
    1314             :     // return early.
    1315       14861 :     if (ModifiedRegs[BaseReg])
    1316         800 :       return E;
    1317             : 
    1318             :     // Update list of instructions that read/write memory.
    1319       14061 :     if (MI.mayLoadOrStore())
    1320        4419 :       MemInsns.push_back(&MI);
    1321             :   }
    1322        3128 :   return E;
    1323             : }
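                     : // For example (illustrative): for "ldr x0, [x2]" followed a few instructions
                     : // later by "ldr x1, [x2, #8]", this returns the iterator to the second load
                     : // with MergeForward == false, and mergePairedInsns then forms
                     : // "ldp x0, x1, [x2]".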
    1324             : 
    1325             : MachineBasicBlock::iterator
    1326         198 : AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
    1327             :                                      MachineBasicBlock::iterator Update,
    1328             :                                      bool IsPreIdx) {
    1329             :   assert((Update->getOpcode() == AArch64::ADDXri ||
    1330             :           Update->getOpcode() == AArch64::SUBXri) &&
    1331             :          "Unexpected base register update instruction to merge!");
    1332         198 :   MachineBasicBlock::iterator NextI = I;
    1333             :   // Return the instruction following the merged instruction, which is
    1334             :   // the instruction following our unmerged load. Unless that's the add/sub
    1335             :   // instruction we're merging, in which case it's the one after that.
    1336         198 :   if (++NextI == Update)
    1337             :     ++NextI;
    1338             : 
    1339         198 :   int Value = Update->getOperand(2).getImm();
    1340             :   assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
    1341             :          "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
    1342         198 :   if (Update->getOpcode() == AArch64::SUBXri)
    1343          50 :     Value = -Value;
    1344             : 
    1345         396 :   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
    1346             :                              : getPostIndexedOpcode(I->getOpcode());
    1347             :   MachineInstrBuilder MIB;
    1348         198 :   if (!isPairedLdSt(*I)) {
    1349             :     // Non-paired instruction.
    1350         332 :     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
    1351             :               .add(getLdStRegOp(*Update))
    1352             :               .add(getLdStRegOp(*I))
    1353             :               .add(getLdStBaseOp(*I))
    1354         166 :               .addImm(Value)
    1355             :               .setMemRefs(I->memoperands_begin(), I->memoperands_end());
    1356             :   } else {
    1357             :     // Paired instruction.
    1358          32 :     int Scale = getMemScale(*I);
    1359          64 :     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
    1360             :               .add(getLdStRegOp(*Update))
    1361             :               .add(getLdStRegOp(*I, 0))
    1362             :               .add(getLdStRegOp(*I, 1))
    1363             :               .add(getLdStBaseOp(*I))
    1364          32 :               .addImm(Value / Scale)
    1365             :               .setMemRefs(I->memoperands_begin(), I->memoperands_end());
    1366             :   }
    1367             :   (void)MIB;
    1368             : 
    1369             :   if (IsPreIdx) {
    1370             :     ++NumPreFolded;
    1371             :     DEBUG(dbgs() << "Creating pre-indexed load/store.");
    1372             :   } else {
    1373             :     ++NumPostFolded;
    1374             :     DEBUG(dbgs() << "Creating post-indexed load/store.");
    1375             :   }
    1376             :   DEBUG(dbgs() << "    Replacing instructions:\n    ");
    1377             :   DEBUG(I->print(dbgs()));
    1378             :   DEBUG(dbgs() << "    ");
    1379             :   DEBUG(Update->print(dbgs()));
    1380             :   DEBUG(dbgs() << "  with instruction:\n    ");
    1381             :   DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    1382             :   DEBUG(dbgs() << "\n");
    1383             : 
    1384             :   // Erase the old instructions for the block.
    1385         198 :   I->eraseFromParent();
    1386         198 :   Update->eraseFromParent();
    1387             : 
    1388         198 :   return NextI;
    1389             : }
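                     : // For example (illustrative): merging "ldr x0, [x1]" with a following
                     : // "sub x1, x1, #16" negates the immediate and emits the post-indexed
                     : // "ldr x0, [x1], #-16"; for a paired instruction the written immediate is
                     : // divided by the memory scale.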
    1390             : 
    1391       31743 : bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
    1392             :                                                MachineInstr &MI,
    1393             :                                                unsigned BaseReg, int Offset) {
    1394       31743 :   switch (MI.getOpcode()) {
    1395             :   default:
    1396             :     break;
    1397        1471 :   case AArch64::SUBXri:
    1398             :   case AArch64::ADDXri:
    1399             :     // Make sure it's a vanilla immediate operand, not a relocation or
    1400             :     // anything else we can't handle.
    1401        1471 :     if (!MI.getOperand(2).isImm())
    1402             :       break;
    1403             :     // Watch out for 1 << 12 shifted value.
    1404        2704 :     if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
    1405             :       break;
    1406             : 
    1407             :     // The update instruction source and destination register must be the
    1408             :     // same as the load/store base register.
    1409        1336 :     if (MI.getOperand(0).getReg() != BaseReg ||
    1410             :         MI.getOperand(1).getReg() != BaseReg)
    1411             :       break;
    1412             : 
    1413         531 :     bool IsPairedInsn = isPairedLdSt(MemMI);
    1414         531 :     int UpdateOffset = MI.getOperand(2).getImm();
    1415         531 :     if (MI.getOpcode() == AArch64::SUBXri)
    1416          52 :       UpdateOffset = -UpdateOffset;
    1417             : 
    1418             :     // For non-paired load/store instructions, the immediate must fit in a
    1419             :     // signed 9-bit integer.
    1420         531 :     if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
    1421             :       break;
    1422             : 
    1423             :     // For paired load/store instructions, the immediate must be a multiple of
    1424             :     // the scaling factor.  The scaled offset must also fit into a signed 7-bit
    1425             :     // integer.
    1426         522 :     if (IsPairedInsn) {
    1427         172 :       int Scale = getMemScale(MemMI);
    1428         172 :       if (UpdateOffset % Scale != 0)
    1429             :         break;
    1430             : 
    1431         172 :       int ScaledOffset = UpdateOffset / Scale;
    1432         172 :       if (ScaledOffset > 63 || ScaledOffset < -64)
    1433             :         break;
    1434             :     }
    1435             : 
    1436             :     // If we have a non-zero Offset, we check that it matches the amount
    1437             :     // we're adding to the register.
    1438         514 :     if (!Offset || Offset == UpdateOffset)
    1439             :       return true;
    1440             :     break;
    1441             :   }
    1442             :   return false;
    1443             : }
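                     : // For example (illustrative): for "ldp x0, x1, [x2]" (scale 8), a candidate
                     : // "add x2, x2, #16" matches (16 % 8 == 0, scaled offset 2 fits in 7 signed
                     : // bits), whereas "add x2, x2, #12" is rejected since 12 is not a multiple of
                     : // the scale.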
    1444             : 
    1445       16009 : MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    1446             :     MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
    1447             :   MachineBasicBlock::iterator E = I->getParent()->end();
    1448             :   MachineInstr &MemMI = *I;
    1449       16009 :   MachineBasicBlock::iterator MBBI = I;
    1450             : 
    1451             :   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
    1452       16009 :   int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
    1453             : 
    1454             :   // Scan forward looking for post-index opportunities.  Updating instructions
    1455             :   // can't be formed if the memory instruction doesn't have the offset we're
    1456             :   // looking for.
    1457       16009 :   if (MIUnscaledOffset != UnscaledOffset)
    1458        4141 :     return E;
    1459             : 
    1460             :   // If the base register overlaps a destination register, we can't
    1461             :   // merge the update.
    1462             :   bool IsPairedInsn = isPairedLdSt(MemMI);
    1463       40294 :   for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    1464             :     unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    1465       28805 :     if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
    1466         256 :       return E;
    1467             :   }
    1468             : 
    1469             :   // Track which registers have been modified and used between the first insn
    1470             :   // (inclusive) and the second insn.
    1471             :   ModifiedRegs.reset();
    1472             :   UsedRegs.reset();
    1473             :   ++MBBI;
    1474       32155 :   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    1475             :     MachineInstr &MI = *MBBI;
    1476             : 
    1477             :     // Don't count transient instructions towards the search limit since there
    1478             :     // may be different numbers of them if e.g. debug information is present.
    1479             :     if (!MI.isTransient())
    1480       24806 :       ++Count;
    1481             : 
    1482             :     // If we found a match, return it.
    1483       26088 :     if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
    1484         146 :       return MBBI;
    1485             : 
    1486             :     // Update the status of what the instruction clobbered and used.
    1487       25942 :     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1488             : 
    1489             :     // Otherwise, if the base register is used or modified, we have no match, so
    1490             :     // return early.
    1491       50175 :     if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
    1492        5399 :       return E;
    1493             :   }
    1494        6067 :   return E;
    1495             : }
    1496             : 
    1497        7860 : MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    1498             :     MachineBasicBlock::iterator I, unsigned Limit) {
    1499             :   MachineBasicBlock::iterator B = I->getParent()->begin();
    1500             :   MachineBasicBlock::iterator E = I->getParent()->end();
    1501             :   MachineInstr &MemMI = *I;
    1502        7860 :   MachineBasicBlock::iterator MBBI = I;
    1503             : 
    1504             :   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
    1505        7860 :   int Offset = getLdStOffsetOp(MemMI).getImm();
    1506             : 
    1507             :   // If the load/store is the first instruction in the block, there's obviously
    1508             :   // not any matching update. Ditto if the memory offset isn't zero.
    1509        7860 :   if (MBBI == B || Offset != 0)
    1510        5264 :     return E;
    1511             :   // If the base register overlaps a destination register, we can't
    1512             :   // merge the update.
    1513             :   bool IsPairedInsn = isPairedLdSt(MemMI);
    1514        8104 :   for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    1515             :     unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    1516        5611 :     if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
    1517          75 :       return E;
    1518             :   }
    1519             : 
    1520             :   // Track which registers have been modified and used between the first insn
    1521             :   // (inclusive) and the second insn.
    1522             :   ModifiedRegs.reset();
    1523             :   UsedRegs.reset();
    1524             :   unsigned Count = 0;
    1525             :   do {
    1526             :     --MBBI;
    1527             :     MachineInstr &MI = *MBBI;
    1528             : 
    1529             :     // Don't count transient instructions towards the search limit since there
    1530             :     // may be different numbers of them if e.g. debug information is present.
    1531             :     if (!MI.isTransient())
    1532        4945 :       ++Count;
    1533             : 
    1534             :     // If we found a match, return it.
    1535        5655 :     if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
    1536          52 :       return MBBI;
    1537             : 
    1538             :     // Update the status of what the instruction clobbered and used.
    1539        5603 :     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1540             : 
    1541             :     // Otherwise, if the base register is used or modified, we have no match, so
    1542             :     // return early.
    1543       10738 :     if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
    1544        1019 :       return E;
    1545        7718 :   } while (MBBI != B && Count < Limit);
    1546        1450 :   return E;
    1547             : }
    1548             : 
    1549        2205 : bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    1550             :     MachineBasicBlock::iterator &MBBI) {
    1551             :   MachineInstr &MI = *MBBI;
    1552             :   // If this is a volatile load, don't mess with it.
    1553        2205 :   if (MI.hasOrderedMemoryRef())
    1554             :     return false;
    1555             : 
    1556             :   // Make sure this is a reg+imm.
    1557             :   // FIXME: It is possible to extend it to handle reg+reg cases.
    1558        1531 :   if (!getLdStOffsetOp(MI).isImm())
    1559             :     return false;
    1560             : 
    1561             :   // Look backward up to LdStLimit instructions.
    1562             :   MachineBasicBlock::iterator StoreI;
    1563        1262 :   if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
    1564             :     ++NumLoadsFromStoresPromoted;
    1565             :     // Promote the load. Keeping the iterator straight is a
    1566             :     // pain, so we let the merge routine tell us what the next instruction
    1567             :     // is after it's done mucking about.
    1568          62 :     MBBI = promoteLoadFromStore(MBBI, StoreI);
    1569          62 :     return true;
    1570             :   }
    1571             :   return false;
    1572             : }
    1573             : 
    1574             : // Merge adjacent zero stores into a wider store.
    1575         105 : bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
    1576             :     MachineBasicBlock::iterator &MBBI) {
    1577             :   assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
    1578             :   MachineInstr &MI = *MBBI;
    1579             :   MachineBasicBlock::iterator E = MI.getParent()->end();
    1580             : 
    1581         105 :   if (!TII->isCandidateToMergeOrPair(MI))
    1582             :     return false;
    1583             : 
    1584             :   // Look ahead up to LdStLimit instructions for a mergable instruction.
    1585          96 :   LdStPairFlags Flags;
    1586             :   MachineBasicBlock::iterator MergeMI =
    1587          96 :       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
    1588          96 :   if (MergeMI != E) {
    1589             :     ++NumZeroStoresPromoted;
    1590             : 
    1591             :     // Keeping the iterator straight is a pain, so we let the merge routine tell
    1592             :     // us what the next instruction is after it's done mucking about.
    1593          14 :     MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
    1594          14 :     return true;
    1595             :   }
    1596             :   return false;
    1597             : }
    1598             : 
    1599             : // Find loads and stores that can be merged into a single load or store pair
    1600             : // instruction.
    1601        7974 : bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
    1602             :   MachineInstr &MI = *MBBI;
    1603             :   MachineBasicBlock::iterator E = MI.getParent()->end();
    1604             : 
    1605        7974 :   if (!TII->isCandidateToMergeOrPair(MI))
    1606             :     return false;
    1607             : 
    1608             :   // Early exit if the offset is not possible to match. (6 bits of positive
    1609             :   // range, plus allow an extra one in case we find a later insn that matches
    1610             :   // with Offset-1)
    1611             :   bool IsUnscaled = TII->isUnscaledLdSt(MI);
    1612        5674 :   int Offset = getLdStOffsetOp(MI).getImm();
    1613        5674 :   int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
    1614             :   // Allow one more for offset.
    1615        5674 :   if (Offset > 0)
    1616        2024 :     Offset -= OffsetStride;
    1617        5596 :   if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    1618             :     return false;
    1619             : 
    1620             :   // Look ahead up to LdStLimit instructions for a pairable instruction.
    1621        5563 :   LdStPairFlags Flags;
    1622             :   MachineBasicBlock::iterator Paired =
    1623        5563 :       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
    1624        5563 :   if (Paired != E) {
    1625             :     ++NumPairCreated;
    1626             :     if (TII->isUnscaledLdSt(MI))
    1627             :       ++NumUnscaledPairCreated;
    1628             :     // Keeping the iterator straight is a pain, so we let the merge routine tell
    1629             :     // us what the next instruction is after it's done mucking about.
    1630         852 :     MBBI = mergePairedInsns(MBBI, Paired, Flags);
    1631         852 :     return true;
    1632             :   }
    1633             :   return false;
    1634             : }
    1635             : 
    1636        8201 : bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
    1637             :     (MachineBasicBlock::iterator &MBBI) {
    1638             :   MachineInstr &MI = *MBBI;
    1639             :   MachineBasicBlock::iterator E = MI.getParent()->end();
    1640             :   MachineBasicBlock::iterator Update;
    1641             : 
    1642             :   // Look forward to try to form a post-index instruction. For example,
    1643             :   // ldr x0, [x20]
    1644             :   // add x20, x20, #32
    1645             :   //   merged into:
    1646             :   // ldr x0, [x20], #32
    1647        8201 :   Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
    1648        8201 :   if (Update != E) {
    1649             :     // Merge the update into the ld/st.
    1650         109 :     MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
    1651         109 :     return true;
    1652             :   }
    1653             : 
    1654             :   // Don't know how to handle unscaled pre/post-index versions below, so bail.
    1655        8092 :   if (TII->isUnscaledLdSt(MI.getOpcode()))
    1656             :     return false;
    1657             : 
    1658             :   // Look back to try to find a pre-index instruction. For example,
    1659             :   // add x0, x0, #8
    1660             :   // ldr x1, [x0]
    1661             :   //   merged into:
    1662             :   // ldr x1, [x0, #8]!
    1663        7860 :   Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
    1664        7860 :   if (Update != E) {
    1665             :     // Merge the update into the ld/st.
    1666          52 :     MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    1667          52 :     return true;
    1668             :   }
    1669             : 
    1670             :   // The immediate in the load/store is scaled by the size of the memory
    1671             :   // operation. The immediate in the add we're looking for,
    1672             :   // however, is not, so adjust here.
    1673        7808 :   int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
    1674             : 
     1675             :   // Look forward to try to find a pre-index instruction. For example,
    1676             :   // ldr x1, [x0, #64]
    1677             :   // add x0, x0, #64
    1678             :   //   merged into:
    1679             :   // ldr x1, [x0, #64]!
    1680        7808 :   Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
    1681        7808 :   if (Update != E) {
    1682             :     // Merge the update into the ld/st.
    1683          37 :     MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    1684          37 :     return true;
    1685             :   }
    1686             : 
    1687             :   return false;
    1688             : }
    1689             : 
    1690       14322 : bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
    1691             :                                         bool EnableNarrowZeroStOpt) {
    1692             :   bool Modified = false;
     1693             :   // Four transformations to do here:
    1694             :   // 1) Find loads that directly read from stores and promote them by
    1695             :   //    replacing with mov instructions. If the store is wider than the load,
    1696             :   //    the load will be replaced with a bitfield extract.
    1697             :   //      e.g.,
    1698             :   //        str w1, [x0, #4]
    1699             :   //        ldrh w2, [x0, #6]
    1700             :   //        ; becomes
    1701             :   //        str w1, [x0, #4]
    1702             :   //        lsr w2, w1, #16
    1703       14322 :   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
    1704       79696 :        MBBI != E;) {
    1705       65374 :     if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
    1706             :       Modified = true;
    1707             :     else
    1708             :       ++MBBI;
    1709             :   }
    1710             :   // 2) Merge adjacent zero stores into a wider store.
    1711             :   //      e.g.,
    1712             :   //        strh wzr, [x0]
    1713             :   //        strh wzr, [x0, #2]
    1714             :   //        ; becomes
    1715             :   //        str wzr, [x0]
    1716             :   //      e.g.,
    1717             :   //        str wzr, [x0]
    1718             :   //        str wzr, [x0, #4]
    1719             :   //        ; becomes
    1720             :   //        str xzr, [x0]
    1721       14322 :   if (EnableNarrowZeroStOpt)
    1722       14139 :     for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
    1723       78774 :          MBBI != E;) {
    1724         105 :       if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
    1725             :         Modified = true;
    1726             :       else
    1727             :         ++MBBI;
    1728             :     }
    1729             :   // 3) Find loads and stores that can be merged into a single load or store
    1730             :   //    pair instruction.
    1731             :   //      e.g.,
    1732             :   //        ldr x0, [x2]
    1733             :   //        ldr x1, [x2, #8]
    1734             :   //        ; becomes
    1735             :   //        ldp x0, x1, [x2]
    1736       14322 :   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
    1737       78867 :        MBBI != E;) {
    1738       64545 :     if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
    1739             :       Modified = true;
    1740             :     else
    1741             :       ++MBBI;
    1742             :   }
    1743             :   // 4) Find base register updates that can be merged into the load or store
    1744             :   //    as a base-reg writeback.
    1745             :   //      e.g.,
    1746             :   //        ldr x0, [x2]
    1747             :   //        add x2, x2, #4
    1748             :   //        ; becomes
    1749             :   //        ldr x0, [x2], #4
    1750       14322 :   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
    1751       78694 :        MBBI != E;) {
    1752       64372 :     if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
    1753             :       Modified = true;
    1754             :     else
    1755             :       ++MBBI;
    1756             :   }
    1757             : 
    1758       14322 :   return Modified;
    1759             : }
    1760             : 
    1761       12295 : bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
    1762       12295 :   if (skipFunction(Fn.getFunction()))
    1763             :     return false;
    1764             : 
    1765       12294 :   Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
    1766       12294 :   TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
    1767       12294 :   TRI = Subtarget->getRegisterInfo();
    1768       24588 :   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    1769             : 
    1770             :   // Resize the modified and used register bitfield trackers.  We do this once
    1771             :   // per function and then clear the bitfield each time we optimize a load or
    1772             :   // store.
    1773       12294 :   ModifiedRegs.resize(TRI->getNumRegs());
    1774       12294 :   UsedRegs.resize(TRI->getNumRegs());
    1775             : 
    1776             :   bool Modified = false;
    1777       12294 :   bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
    1778       26616 :   for (auto &MBB : Fn)
    1779       14322 :     Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt);
    1780             : 
    1781             :   return Modified;
    1782             : }
    1783             : 
    1784             : // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
    1785             : // stores near one another?  Note: The pre-RA instruction scheduler already has
    1786             : // hooks to try and schedule pairable loads/stores together to improve pairing
    1787             : // opportunities.  Thus, pre-RA pairing pass may not be worth the effort.
    1788             : 
    1789             : // FIXME: When pairing store instructions it's very possible for this pass to
    1790             : // hoist a store with a KILL marker above another use (without a KILL marker).
    1791             : // The resulting IR is invalid, but nothing uses the KILL markers after this
    1792             : // pass, so it's never caused a problem in practice.
    1793             : 
    1794             : /// createAArch64LoadStoreOptimizationPass - returns an instance of the
    1795             : /// load / store optimization pass.
    1796         959 : FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
    1797         959 :   return new AArch64LoadStoreOpt();
    1798      291951 : }

Generated by: LCOV version 1.13