LCOV - code coverage report
Current view: top level - lib/Target/AArch64 - AArch64LoadStoreOptimizer.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 566 614 92.2 %
Date: 2018-06-17 00:07:59 Functions: 38 38 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // This file contains a pass that performs load / store related peephole
      11             : // optimizations. This pass should be run after register allocation.
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #include "AArch64InstrInfo.h"
      16             : #include "AArch64Subtarget.h"
      17             : #include "MCTargetDesc/AArch64AddressingModes.h"
      18             : #include "llvm/ADT/BitVector.h"
      19             : #include "llvm/ADT/SmallVector.h"
      20             : #include "llvm/ADT/Statistic.h"
      21             : #include "llvm/ADT/StringRef.h"
      22             : #include "llvm/ADT/iterator_range.h"
      23             : #include "llvm/Analysis/AliasAnalysis.h"
      24             : #include "llvm/CodeGen/MachineBasicBlock.h"
      25             : #include "llvm/CodeGen/MachineFunction.h"
      26             : #include "llvm/CodeGen/MachineFunctionPass.h"
      27             : #include "llvm/CodeGen/MachineInstr.h"
      28             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      29             : #include "llvm/CodeGen/MachineOperand.h"
      30             : #include "llvm/CodeGen/TargetRegisterInfo.h"
      31             : #include "llvm/IR/DebugLoc.h"
      32             : #include "llvm/MC/MCRegisterInfo.h"
      33             : #include "llvm/Pass.h"
      34             : #include "llvm/Support/CommandLine.h"
      35             : #include "llvm/Support/Debug.h"
      36             : #include "llvm/Support/ErrorHandling.h"
      37             : #include "llvm/Support/raw_ostream.h"
      38             : #include <cassert>
      39             : #include <cstdint>
      40             : #include <iterator>
      41             : #include <limits>
      42             : 
      43             : using namespace llvm;
      44             : 
      45             : #define DEBUG_TYPE "aarch64-ldst-opt"
      46             : 
      47             : STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
      48             : STATISTIC(NumPostFolded, "Number of post-index updates folded");
      49             : STATISTIC(NumPreFolded, "Number of pre-index updates folded");
      50             : STATISTIC(NumUnscaledPairCreated,
      51             :           "Number of load/store from unscaled generated");
      52             : STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
      53             : STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
      54             : 
      55             : // The LdStLimit limits how far we search for load/store pairs.
      56      101169 : static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
      57      101169 :                                    cl::init(20), cl::Hidden);
      58             : 
      59             : // The UpdateLimit limits how far we search for update instructions when we form
      60             : // pre-/post-index instructions.
      61      303507 : static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
      62      303507 :                                      cl::Hidden);
      63             : 
      64             : #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
      65             : 
      66             : namespace {
      67             : 
      68             : using LdStPairFlags = struct LdStPairFlags {
      69             :   // If a matching instruction is found, MergeForward is set to true if the
      70             :   // merge is to remove the first instruction and replace the second with
      71             :   // a pair-wise insn, and false if the reverse is true.
      72             :   bool MergeForward = false;
      73             : 
      74             :   // SExtIdx gives the index of the result of the load pair that must be
      75             :   // extended. The value of SExtIdx assumes that the paired load produces the
      76             :   // value in this order: (I, returned iterator), i.e., -1 means no value has
      77             :   // to be extended, 0 means I, and 1 means the returned iterator.
      78             :   int SExtIdx = -1;
      79             : 
      80             :   LdStPairFlags() = default;
      81             : 
      82         889 :   void setMergeForward(bool V = true) { MergeForward = V; }
      83             :   bool getMergeForward() const { return MergeForward; }
      84             : 
      85       17464 :   void setSExtIdx(int V) { SExtIdx = V; }
      86             :   int getSExtIdx() const { return SExtIdx; }
      87             : };
      88             : 
      89        3063 : struct AArch64LoadStoreOpt : public MachineFunctionPass {
      90             :   static char ID;
      91             : 
      92        1028 :   AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
      93        1028 :     initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
      94        1028 :   }
      95             : 
      96             :   AliasAnalysis *AA;
      97             :   const AArch64InstrInfo *TII;
      98             :   const TargetRegisterInfo *TRI;
      99             :   const AArch64Subtarget *Subtarget;
     100             : 
     101             :   // Track which register units have been modified and used.
     102             :   LiveRegUnits ModifiedRegUnits, UsedRegUnits;
     103             : 
     104        1015 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     105             :     AU.addRequired<AAResultsWrapperPass>();
     106        1015 :     MachineFunctionPass::getAnalysisUsage(AU);
     107        1015 :   }
     108             : 
     109             :   // Scan the instructions looking for a load/store that can be combined
     110             :   // with the current instruction into a load/store pair.
     111             :   // Return the matching instruction if one is found, else MBB->end().
     112             :   MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
     113             :                                                LdStPairFlags &Flags,
     114             :                                                unsigned Limit,
     115             :                                                bool FindNarrowMerge);
     116             : 
     117             :   // Scan the instructions looking for a store that writes to the address from
     118             :   // which the current load instruction reads. Return true if one is found.
     119             :   bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
     120             :                          MachineBasicBlock::iterator &StoreI);
     121             : 
     122             :   // Merge the two instructions indicated into a wider narrow store instruction.
     123             :   MachineBasicBlock::iterator
     124             :   mergeNarrowZeroStores(MachineBasicBlock::iterator I,
     125             :                         MachineBasicBlock::iterator MergeMI,
     126             :                         const LdStPairFlags &Flags);
     127             : 
     128             :   // Merge the two instructions indicated into a single pair-wise instruction.
     129             :   MachineBasicBlock::iterator
     130             :   mergePairedInsns(MachineBasicBlock::iterator I,
     131             :                    MachineBasicBlock::iterator Paired,
     132             :                    const LdStPairFlags &Flags);
     133             : 
     134             :   // Promote the load that reads directly from the address stored to.
     135             :   MachineBasicBlock::iterator
     136             :   promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
     137             :                        MachineBasicBlock::iterator StoreI);
     138             : 
     139             :   // Scan the instruction list to find a base register update that can
     140             :   // be combined with the current instruction (a load or store) using
     141             :   // pre or post indexed addressing with writeback. Scan forwards.
     142             :   MachineBasicBlock::iterator
     143             :   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
     144             :                                 int UnscaledOffset, unsigned Limit);
     145             : 
     146             :   // Scan the instruction list to find a base register update that can
     147             :   // be combined with the current instruction (a load or store) using
     148             :   // pre or post indexed addressing with writeback. Scan backwards.
     149             :   MachineBasicBlock::iterator
     150             :   findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
     151             : 
     152             :   // Find an instruction that updates the base register of the ld/st
     153             :   // instruction.
     154             :   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
     155             :                             unsigned BaseReg, int Offset);
     156             : 
     157             :   // Merge a pre- or post-index base register update into a ld/st instruction.
     158             :   MachineBasicBlock::iterator
     159             :   mergeUpdateInsn(MachineBasicBlock::iterator I,
     160             :                   MachineBasicBlock::iterator Update, bool IsPreIdx);
     161             : 
     162             :   // Find and merge zero store instructions.
     163             :   bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
     164             : 
     165             :   // Find and pair ldr/str instructions.
     166             :   bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
     167             : 
     168             :   // Find and promote load instructions which read directly from store.
     169             :   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
     170             : 
     171             :   // Find and merge a base register updates before or after a ld/st instruction.
     172             :   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
     173             : 
     174             :   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
     175             : 
     176             :   bool runOnMachineFunction(MachineFunction &Fn) override;
     177             : 
     178        1015 :   MachineFunctionProperties getRequiredProperties() const override {
     179        2030 :     return MachineFunctionProperties().set(
     180        1015 :         MachineFunctionProperties::Property::NoVRegs);
     181             :   }
     182             : 
     183        1022 :   StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
     184             : };
     185             : 
     186             : char AArch64LoadStoreOpt::ID = 0;
     187             : 
     188             : } // end anonymous namespace
     189             : 
     190      359152 : INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
     191             :                 AARCH64_LOAD_STORE_OPT_NAME, false, false)
     192             : 
     193             : static bool isNarrowStore(unsigned Opc) {
     194             :   switch (Opc) {
     195             :   default:
     196             :     return false;
     197             :   case AArch64::STRBBui:
     198             :   case AArch64::STURBBi:
     199             :   case AArch64::STRHHui:
     200             :   case AArch64::STURHHi:
     201             :     return true;
     202             :   }
     203             : }
     204             : 
     205             : // Scaling factor for unscaled load or store.
     206       26308 : static int getMemScale(MachineInstr &MI) {
     207       52616 :   switch (MI.getOpcode()) {
     208           0 :   default:
     209           0 :     llvm_unreachable("Opcode has unknown scale!");
     210             :   case AArch64::LDRBBui:
     211             :   case AArch64::LDURBBi:
     212             :   case AArch64::LDRSBWui:
     213             :   case AArch64::LDURSBWi:
     214             :   case AArch64::STRBBui:
     215             :   case AArch64::STURBBi:
     216             :     return 1;
     217         317 :   case AArch64::LDRHHui:
     218             :   case AArch64::LDURHHi:
     219             :   case AArch64::LDRSHWui:
     220             :   case AArch64::LDURSHWi:
     221             :   case AArch64::STRHHui:
     222             :   case AArch64::STURHHi:
     223         317 :     return 2;
     224        3638 :   case AArch64::LDRSui:
     225             :   case AArch64::LDURSi:
     226             :   case AArch64::LDRSWui:
     227             :   case AArch64::LDURSWi:
     228             :   case AArch64::LDRWui:
     229             :   case AArch64::LDURWi:
     230             :   case AArch64::STRSui:
     231             :   case AArch64::STURSi:
     232             :   case AArch64::STRWui:
     233             :   case AArch64::STURWi:
     234             :   case AArch64::LDPSi:
     235             :   case AArch64::LDPSWi:
     236             :   case AArch64::LDPWi:
     237             :   case AArch64::STPSi:
     238             :   case AArch64::STPWi:
     239        3638 :     return 4;
     240       16116 :   case AArch64::LDRDui:
     241             :   case AArch64::LDURDi:
     242             :   case AArch64::LDRXui:
     243             :   case AArch64::LDURXi:
     244             :   case AArch64::STRDui:
     245             :   case AArch64::STURDi:
     246             :   case AArch64::STRXui:
     247             :   case AArch64::STURXi:
     248             :   case AArch64::LDPDi:
     249             :   case AArch64::LDPXi:
     250             :   case AArch64::STPDi:
     251             :   case AArch64::STPXi:
     252       16116 :     return 8;
     253        5689 :   case AArch64::LDRQui:
     254             :   case AArch64::LDURQi:
     255             :   case AArch64::STRQui:
     256             :   case AArch64::STURQi:
     257             :   case AArch64::LDPQi:
     258             :   case AArch64::STPQi:
     259        5689 :     return 16;
     260             :   }
     261             : }
     262             : 
     263       21130 : static unsigned getMatchingNonSExtOpcode(unsigned Opc,
     264             :                                          bool *IsValidLdStrOpc = nullptr) {
     265       21130 :   if (IsValidLdStrOpc)
     266       21124 :     *IsValidLdStrOpc = true;
     267       21130 :   switch (Opc) {
     268        8373 :   default:
     269        8373 :     if (IsValidLdStrOpc)
     270        8373 :       *IsValidLdStrOpc = false;
     271             :     return std::numeric_limits<unsigned>::max();
     272             :   case AArch64::STRDui:
     273             :   case AArch64::STURDi:
     274             :   case AArch64::STRQui:
     275             :   case AArch64::STURQi:
     276             :   case AArch64::STRBBui:
     277             :   case AArch64::STURBBi:
     278             :   case AArch64::STRHHui:
     279             :   case AArch64::STURHHi:
     280             :   case AArch64::STRWui:
     281             :   case AArch64::STURWi:
     282             :   case AArch64::STRXui:
     283             :   case AArch64::STURXi:
     284             :   case AArch64::LDRDui:
     285             :   case AArch64::LDURDi:
     286             :   case AArch64::LDRQui:
     287             :   case AArch64::LDURQi:
     288             :   case AArch64::LDRWui:
     289             :   case AArch64::LDURWi:
     290             :   case AArch64::LDRXui:
     291             :   case AArch64::LDURXi:
     292             :   case AArch64::STRSui:
     293             :   case AArch64::STURSi:
     294             :   case AArch64::LDRSui:
     295             :   case AArch64::LDURSi:
     296             :     return Opc;
     297          16 :   case AArch64::LDRSWui:
     298          16 :     return AArch64::LDRWui;
     299          10 :   case AArch64::LDURSWi:
     300          10 :     return AArch64::LDURWi;
     301             :   }
     302             : }
     303             : 
     304             : static unsigned getMatchingWideOpcode(unsigned Opc) {
     305          15 :   switch (Opc) {
     306           0 :   default:
     307           0 :     llvm_unreachable("Opcode has no wide equivalent!");
     308             :   case AArch64::STRBBui:
     309             :     return AArch64::STRHHui;
     310           1 :   case AArch64::STRHHui:
     311             :     return AArch64::STRWui;
     312           0 :   case AArch64::STURBBi:
     313             :     return AArch64::STURHHi;
     314           0 :   case AArch64::STURHHi:
     315             :     return AArch64::STURWi;
     316           2 :   case AArch64::STURWi:
     317             :     return AArch64::STURXi;
     318          12 :   case AArch64::STRWui:
     319             :     return AArch64::STRXui;
     320             :   }
     321             : }
     322             : 
     323        1156 : static unsigned getMatchingPairOpcode(unsigned Opc) {
     324        1156 :   switch (Opc) {
     325           0 :   default:
     326           0 :     llvm_unreachable("Opcode has no pairwise equivalent!");
     327             :   case AArch64::STRSui:
     328             :   case AArch64::STURSi:
     329             :     return AArch64::STPSi;
     330          65 :   case AArch64::STRDui:
     331             :   case AArch64::STURDi:
     332          65 :     return AArch64::STPDi;
     333         183 :   case AArch64::STRQui:
     334             :   case AArch64::STURQi:
     335         183 :     return AArch64::STPQi;
     336         114 :   case AArch64::STRWui:
     337             :   case AArch64::STURWi:
     338         114 :     return AArch64::STPWi;
     339         220 :   case AArch64::STRXui:
     340             :   case AArch64::STURXi:
     341         220 :     return AArch64::STPXi;
     342          57 :   case AArch64::LDRSui:
     343             :   case AArch64::LDURSi:
     344          57 :     return AArch64::LDPSi;
     345         153 :   case AArch64::LDRDui:
     346             :   case AArch64::LDURDi:
     347         153 :     return AArch64::LDPDi;
     348         136 :   case AArch64::LDRQui:
     349             :   case AArch64::LDURQi:
     350         136 :     return AArch64::LDPQi;
     351          44 :   case AArch64::LDRWui:
     352             :   case AArch64::LDURWi:
     353          44 :     return AArch64::LDPWi;
     354         155 :   case AArch64::LDRXui:
     355             :   case AArch64::LDURXi:
     356         155 :     return AArch64::LDPXi;
     357          11 :   case AArch64::LDRSWui:
     358             :   case AArch64::LDURSWi:
     359          11 :     return AArch64::LDPSWi;
     360             :   }
     361             : }
     362             : 
     363         519 : static unsigned isMatchingStore(MachineInstr &LoadInst,
     364             :                                 MachineInstr &StoreInst) {
     365         519 :   unsigned LdOpc = LoadInst.getOpcode();
     366         519 :   unsigned StOpc = StoreInst.getOpcode();
     367         519 :   switch (LdOpc) {
     368           0 :   default:
     369           0 :     llvm_unreachable("Unsupported load instruction!");
     370          17 :   case AArch64::LDRBBui:
     371          17 :     return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
     372          17 :            StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
     373          15 :   case AArch64::LDURBBi:
     374          15 :     return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
     375          15 :            StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
     376          16 :   case AArch64::LDRHHui:
     377          16 :     return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
     378          16 :            StOpc == AArch64::STRXui;
     379           7 :   case AArch64::LDURHHi:
     380           7 :     return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
     381           7 :            StOpc == AArch64::STURXi;
     382          67 :   case AArch64::LDRWui:
     383          67 :     return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
     384           6 :   case AArch64::LDURWi:
     385           6 :     return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
     386         389 :   case AArch64::LDRXui:
     387         389 :     return StOpc == AArch64::STRXui;
     388           2 :   case AArch64::LDURXi:
     389           2 :     return StOpc == AArch64::STURXi;
     390             :   }
     391             : }
     392             : 
     393          89 : static unsigned getPreIndexedOpcode(unsigned Opc) {
     394             :   // FIXME: We don't currently support creating pre-indexed loads/stores when
     395             :   // the load or store is the unscaled version.  If we decide to perform such an
     396             :   // optimization in the future the cases for the unscaled loads/stores will
     397             :   // need to be added here.
     398          89 :   switch (Opc) {
     399           0 :   default:
     400           0 :     llvm_unreachable("Opcode has no pre-indexed equivalent!");
     401             :   case AArch64::STRSui:
     402             :     return AArch64::STRSpre;
     403           6 :   case AArch64::STRDui:
     404           6 :     return AArch64::STRDpre;
     405           8 :   case AArch64::STRQui:
     406           8 :     return AArch64::STRQpre;
     407           2 :   case AArch64::STRBBui:
     408           2 :     return AArch64::STRBBpre;
     409           2 :   case AArch64::STRHHui:
     410           2 :     return AArch64::STRHHpre;
     411           8 :   case AArch64::STRWui:
     412           8 :     return AArch64::STRWpre;
     413           9 :   case AArch64::STRXui:
     414           9 :     return AArch64::STRXpre;
     415           6 :   case AArch64::LDRSui:
     416           6 :     return AArch64::LDRSpre;
     417           6 :   case AArch64::LDRDui:
     418           6 :     return AArch64::LDRDpre;
     419           6 :   case AArch64::LDRQui:
     420           6 :     return AArch64::LDRQpre;
     421           2 :   case AArch64::LDRBBui:
     422           2 :     return AArch64::LDRBBpre;
     423           2 :   case AArch64::LDRHHui:
     424           2 :     return AArch64::LDRHHpre;
     425           6 :   case AArch64::LDRWui:
     426           6 :     return AArch64::LDRWpre;
     427           6 :   case AArch64::LDRXui:
     428           6 :     return AArch64::LDRXpre;
     429           0 :   case AArch64::LDRSWui:
     430           0 :     return AArch64::LDRSWpre;
     431           0 :   case AArch64::LDPSi:
     432           0 :     return AArch64::LDPSpre;
     433           0 :   case AArch64::LDPSWi:
     434           0 :     return AArch64::LDPSWpre;
     435           0 :   case AArch64::LDPDi:
     436           0 :     return AArch64::LDPDpre;
     437           0 :   case AArch64::LDPQi:
     438           0 :     return AArch64::LDPQpre;
     439           2 :   case AArch64::LDPWi:
     440           2 :     return AArch64::LDPWpre;
     441           0 :   case AArch64::LDPXi:
     442           0 :     return AArch64::LDPXpre;
     443           0 :   case AArch64::STPSi:
     444           0 :     return AArch64::STPSpre;
     445           0 :   case AArch64::STPDi:
     446           0 :     return AArch64::STPDpre;
     447           1 :   case AArch64::STPQi:
     448           1 :     return AArch64::STPQpre;
     449           2 :   case AArch64::STPWi:
     450           2 :     return AArch64::STPWpre;
     451           9 :   case AArch64::STPXi:
     452           9 :     return AArch64::STPXpre;
     453             :   }
     454             : }
     455             : 
     456         108 : static unsigned getPostIndexedOpcode(unsigned Opc) {
     457         108 :   switch (Opc) {
     458           0 :   default:
     459           0 :     llvm_unreachable("Opcode has no post-indexed wise equivalent!");
     460             :   case AArch64::STRSui:
     461             :   case AArch64::STURSi:
     462             :     return AArch64::STRSpost;
     463           5 :   case AArch64::STRDui:
     464             :   case AArch64::STURDi:
     465           5 :     return AArch64::STRDpost;
     466          19 :   case AArch64::STRQui:
     467             :   case AArch64::STURQi:
     468          19 :     return AArch64::STRQpost;
     469           2 :   case AArch64::STRBBui:
     470           2 :     return AArch64::STRBBpost;
     471           2 :   case AArch64::STRHHui:
     472           2 :     return AArch64::STRHHpost;
     473           5 :   case AArch64::STRWui:
     474             :   case AArch64::STURWi:
     475           5 :     return AArch64::STRWpost;
     476           7 :   case AArch64::STRXui:
     477             :   case AArch64::STURXi:
     478           7 :     return AArch64::STRXpost;
     479           5 :   case AArch64::LDRSui:
     480             :   case AArch64::LDURSi:
     481           5 :     return AArch64::LDRSpost;
     482           6 :   case AArch64::LDRDui:
     483             :   case AArch64::LDURDi:
     484           6 :     return AArch64::LDRDpost;
     485          11 :   case AArch64::LDRQui:
     486             :   case AArch64::LDURQi:
     487          11 :     return AArch64::LDRQpost;
     488           2 :   case AArch64::LDRBBui:
     489           2 :     return AArch64::LDRBBpost;
     490           2 :   case AArch64::LDRHHui:
     491           2 :     return AArch64::LDRHHpost;
     492           5 :   case AArch64::LDRWui:
     493             :   case AArch64::LDURWi:
     494           5 :     return AArch64::LDRWpost;
     495          11 :   case AArch64::LDRXui:
     496             :   case AArch64::LDURXi:
     497          11 :     return AArch64::LDRXpost;
     498           0 :   case AArch64::LDRSWui:
     499           0 :     return AArch64::LDRSWpost;
     500           0 :   case AArch64::LDPSi:
     501           0 :     return AArch64::LDPSpost;
     502           1 :   case AArch64::LDPSWi:
     503           1 :     return AArch64::LDPSWpost;
     504           0 :   case AArch64::LDPDi:
     505           0 :     return AArch64::LDPDpost;
     506           0 :   case AArch64::LDPQi:
     507           0 :     return AArch64::LDPQpost;
     508           0 :   case AArch64::LDPWi:
     509           0 :     return AArch64::LDPWpost;
     510           5 :   case AArch64::LDPXi:
     511           5 :     return AArch64::LDPXpost;
     512           2 :   case AArch64::STPSi:
     513           2 :     return AArch64::STPSpost;
     514           4 :   case AArch64::STPDi:
     515           4 :     return AArch64::STPDpost;
     516           2 :   case AArch64::STPQi:
     517           2 :     return AArch64::STPQpost;
     518           2 :   case AArch64::STPWi:
     519           2 :     return AArch64::STPWpost;
     520           5 :   case AArch64::STPXi:
     521           5 :     return AArch64::STPXpost;
     522             :   }
     523             : }
     524             : 
     525       84639 : static bool isPairedLdSt(const MachineInstr &MI) {
     526      169278 :   switch (MI.getOpcode()) {
     527             :   default:
     528             :     return false;
     529       17959 :   case AArch64::LDPSi:
     530             :   case AArch64::LDPSWi:
     531             :   case AArch64::LDPDi:
     532             :   case AArch64::LDPQi:
     533             :   case AArch64::LDPWi:
     534             :   case AArch64::LDPXi:
     535             :   case AArch64::STPSi:
     536             :   case AArch64::STPDi:
     537             :   case AArch64::STPQi:
     538             :   case AArch64::STPWi:
     539             :   case AArch64::STPXi:
     540       17959 :     return true;
     541             :   }
     542             : }
     543             : 
     544             : static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
     545             :                                           unsigned PairedRegOp = 0) {
     546             :   assert(PairedRegOp < 2 && "Unexpected register operand idx.");
     547       23983 :   unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
     548       29777 :   return MI.getOperand(Idx);
     549             : }
     550             : 
     551             : static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
     552       36760 :   unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
     553       28094 :   return MI.getOperand(Idx);
     554             : }
     555             : 
     556             : static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
     557       62945 :   unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
     558       31816 :   return MI.getOperand(Idx);
     559             : }
     560             : 
     561          99 : static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
     562             :                                   MachineInstr &StoreInst,
     563             :                                   const AArch64InstrInfo *TII) {
     564             :   assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
     565          99 :   int LoadSize = getMemScale(LoadInst);
     566          99 :   int StoreSize = getMemScale(StoreInst);
     567             :   int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
     568          26 :                              ? getLdStOffsetOp(StoreInst).getImm()
     569         198 :                              : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
     570             :   int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
     571          26 :                              ? getLdStOffsetOp(LoadInst).getImm()
     572         198 :                              : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
     573         193 :   return (UnscaledStOffset <= UnscaledLdOffset) &&
     574         193 :          (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
     575             : }
     576             : 
     577             : static bool isPromotableZeroStoreInst(MachineInstr &MI) {
     578       73559 :   unsigned Opc = MI.getOpcode();
     579       73559 :   return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
     580       74593 :           isNarrowStore(Opc)) &&
     581        1034 :          getLdStRegOp(MI).getReg() == AArch64::WZR;
     582             : }
     583             : 
     584       68452 : static bool isPromotableLoadFromStore(MachineInstr &MI) {
     585      136904 :   switch (MI.getOpcode()) {
     586             :   default:
     587             :     return false;
     588             :   // Scaled instructions.
     589        2396 :   case AArch64::LDRBBui:
     590             :   case AArch64::LDRHHui:
     591             :   case AArch64::LDRWui:
     592             :   case AArch64::LDRXui:
     593             :   // Unscaled instructions.
     594             :   case AArch64::LDURBBi:
     595             :   case AArch64::LDURHHi:
     596             :   case AArch64::LDURWi:
     597             :   case AArch64::LDURXi:
     598        2396 :     return true;
     599             :   }
     600             : }
     601             : 
     602       67428 : static bool isMergeableLdStUpdate(MachineInstr &MI) {
     603       67428 :   unsigned Opc = MI.getOpcode();
     604       67428 :   switch (Opc) {
     605             :   default:
     606             :     return false;
     607             :   // Scaled instructions.
     608             :   case AArch64::STRSui:
     609             :   case AArch64::STRDui:
     610             :   case AArch64::STRQui:
     611             :   case AArch64::STRXui:
     612             :   case AArch64::STRWui:
     613             :   case AArch64::STRHHui:
     614             :   case AArch64::STRBBui:
     615             :   case AArch64::LDRSui:
     616             :   case AArch64::LDRDui:
     617             :   case AArch64::LDRQui:
     618             :   case AArch64::LDRXui:
     619             :   case AArch64::LDRWui:
     620             :   case AArch64::LDRHHui:
     621             :   case AArch64::LDRBBui:
     622             :   // Unscaled instructions.
     623             :   case AArch64::STURSi:
     624             :   case AArch64::STURDi:
     625             :   case AArch64::STURQi:
     626             :   case AArch64::STURWi:
     627             :   case AArch64::STURXi:
     628             :   case AArch64::LDURSi:
     629             :   case AArch64::LDURDi:
     630             :   case AArch64::LDURQi:
     631             :   case AArch64::LDURWi:
     632             :   case AArch64::LDURXi:
     633             :   // Paired instructions.
     634             :   case AArch64::LDPSi:
     635             :   case AArch64::LDPSWi:
     636             :   case AArch64::LDPDi:
     637             :   case AArch64::LDPQi:
     638             :   case AArch64::LDPWi:
     639             :   case AArch64::LDPXi:
     640             :   case AArch64::STPSi:
     641             :   case AArch64::STPDi:
     642             :   case AArch64::STPQi:
     643             :   case AArch64::STPWi:
     644             :   case AArch64::STPXi:
     645             :     // Make sure this is a reg+imm (as opposed to an address reloc).
     646       10267 :     if (!getLdStOffsetOp(MI).isImm())
     647             :       return false;
     648             : 
     649        8682 :     return true;
     650             :   }
     651             : }
     652             : 
     653             : MachineBasicBlock::iterator
     654          15 : AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
     655             :                                            MachineBasicBlock::iterator MergeMI,
     656             :                                            const LdStPairFlags &Flags) {
     657             :   assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
     658             :          "Expected promotable zero stores.");
     659             : 
     660          15 :   MachineBasicBlock::iterator NextI = I;
     661             :   ++NextI;
     662             :   // If NextI is the second of the two instructions to be merged, we need
     663             :   // to skip one further. Either way we merge will invalidate the iterator,
     664             :   // and we don't need to scan the new instruction, as it's a pairwise
     665             :   // instruction, which we're not considering for further action anyway.
     666          15 :   if (NextI == MergeMI)
     667             :     ++NextI;
     668             : 
     669          15 :   unsigned Opc = I->getOpcode();
     670          15 :   bool IsScaled = !TII->isUnscaledLdSt(Opc);
     671          15 :   int OffsetStride = IsScaled ? 1 : getMemScale(*I);
     672             : 
     673          15 :   bool MergeForward = Flags.getMergeForward();
     674             :   // Insert our new paired instruction after whichever of the paired
     675             :   // instructions MergeForward indicates.
     676          15 :   MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
     677             :   // Also based on MergeForward is from where we copy the base register operand
     678             :   // so we get the flags compatible with the input code.
     679             :   const MachineOperand &BaseRegOp =
     680          15 :       MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
     681             : 
     682             :   // Which register is Rt and which is Rt2 depends on the offset order.
     683             :   MachineInstr *RtMI;
     684          30 :   if (getLdStOffsetOp(*I).getImm() ==
     685          15 :       getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
     686             :     RtMI = &*MergeMI;
     687             :   else
     688             :     RtMI = &*I;
     689             : 
     690          15 :   int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
     691             :   // Change the scaled offset from small to large type.
     692          15 :   if (IsScaled) {
     693             :     assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
     694          13 :     OffsetImm /= 2;
     695             :   }
     696             : 
     697             :   // Construct the new instruction.
     698             :   DebugLoc DL = I->getDebugLoc();
     699          15 :   MachineBasicBlock *MBB = I->getParent();
     700             :   MachineInstrBuilder MIB;
     701          60 :   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
     702          15 :             .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
     703             :             .add(BaseRegOp)
     704          15 :             .addImm(OffsetImm)
     705          15 :             .setMemRefs(I->mergeMemRefsWith(*MergeMI))
     706          15 :             .setMIFlags(I->mergeFlagsWith(*MergeMI));
     707             :   (void)MIB;
     708             : 
     709             :   LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
     710             :   LLVM_DEBUG(I->print(dbgs()));
     711             :   LLVM_DEBUG(dbgs() << "    ");
     712             :   LLVM_DEBUG(MergeMI->print(dbgs()));
     713             :   LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
     714             :   LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
     715             :   LLVM_DEBUG(dbgs() << "\n");
     716             : 
     717             :   // Erase the old instructions.
     718          15 :   I->eraseFromParent();
     719          15 :   MergeMI->eraseFromParent();
     720          30 :   return NextI;
     721             : }
     722             : 
     723             : MachineBasicBlock::iterator
     724         874 : AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
     725             :                                       MachineBasicBlock::iterator Paired,
     726             :                                       const LdStPairFlags &Flags) {
     727         874 :   MachineBasicBlock::iterator NextI = I;
     728             :   ++NextI;
     729             :   // If NextI is the second of the two instructions to be merged, we need
     730             :   // to skip one further. Either way we merge will invalidate the iterator,
     731             :   // and we don't need to scan the new instruction, as it's a pairwise
     732             :   // instruction, which we're not considering for further action anyway.
     733         874 :   if (NextI == Paired)
     734             :     ++NextI;
     735             : 
     736         874 :   int SExtIdx = Flags.getSExtIdx();
     737             :   unsigned Opc =
     738        1754 :       SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
     739         874 :   bool IsUnscaled = TII->isUnscaledLdSt(Opc);
     740         913 :   int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;
     741             : 
     742         874 :   bool MergeForward = Flags.getMergeForward();
     743             :   // Insert our new paired instruction after whichever of the paired
     744             :   // instructions MergeForward indicates.
     745         874 :   MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
     746             :   // Also based on MergeForward is from where we copy the base register operand
     747             :   // so we get the flags compatible with the input code.
     748             :   const MachineOperand &BaseRegOp =
     749         874 :       MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
     750             : 
     751         874 :   int Offset = getLdStOffsetOp(*I).getImm();
     752         874 :   int PairedOffset = getLdStOffsetOp(*Paired).getImm();
     753        1748 :   bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
     754         874 :   if (IsUnscaled != PairedIsUnscaled) {
     755             :     // We're trying to pair instructions that differ in how they are scaled.  If
     756             :     // I is scaled then scale the offset of Paired accordingly.  Otherwise, do
     757             :     // the opposite (i.e., make Paired's offset unscaled).
     758           9 :     int MemSize = getMemScale(*Paired);
     759           9 :     if (PairedIsUnscaled) {
     760             :       // If the unscaled offset isn't a multiple of the MemSize, we can't
     761             :       // pair the operations together.
     762             :       assert(!(PairedOffset % getMemScale(*Paired)) &&
     763             :              "Offset should be a multiple of the stride!");
     764           5 :       PairedOffset /= MemSize;
     765             :     } else {
     766           4 :       PairedOffset *= MemSize;
     767             :     }
     768             :   }
     769             : 
     770             :   // Which register is Rt and which is Rt2 depends on the offset order.
     771             :   MachineInstr *RtMI, *Rt2MI;
     772         874 :   if (Offset == PairedOffset + OffsetStride) {
     773             :     RtMI = &*Paired;
     774             :     Rt2MI = &*I;
     775             :     // Here we swapped the assumption made for SExtIdx.
     776             :     // I.e., we turn ldp I, Paired into ldp Paired, I.
     777             :     // Update the index accordingly.
     778         247 :     if (SExtIdx != -1)
     779           0 :       SExtIdx = (SExtIdx + 1) % 2;
     780             :   } else {
     781             :     RtMI = &*I;
     782             :     Rt2MI = &*Paired;
     783             :   }
     784         874 :   int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
     785             :   // Scale the immediate offset, if necessary.
     786        1748 :   if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
     787             :     assert(!(OffsetImm % getMemScale(*RtMI)) &&
     788             :            "Unscaled offset cannot be scaled.");
     789          44 :     OffsetImm /= getMemScale(*RtMI);
     790             :   }
     791             : 
     792             :   // Construct the new instruction.
     793             :   MachineInstrBuilder MIB;
     794             :   DebugLoc DL = I->getDebugLoc();
     795         874 :   MachineBasicBlock *MBB = I->getParent();
     796         874 :   MachineOperand RegOp0 = getLdStRegOp(*RtMI);
     797         874 :   MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
     798             :   // Kill flags may become invalid when moving stores for pairing.
     799         874 :   if (RegOp0.isUse()) {
     800         425 :     if (!MergeForward) {
     801             :       // Clear kill flags on store if moving upwards. Example:
     802             :       //   STRWui %w0, ...
     803             :       //   USE %w1
     804             :       //   STRWui kill %w1  ; need to clear kill flag when moving STRWui upwards
     805             :       RegOp0.setIsKill(false);
     806             :       RegOp1.setIsKill(false);
     807             :     } else {
     808             :       // Clear kill flags of the first stores register. Example:
     809             :       //   STRWui %w1, ...
     810             :       //   USE kill %w1   ; need to clear kill flag when moving STRWui downwards
     811             :       //   STRW %w0
     812          27 :       unsigned Reg = getLdStRegOp(*I).getReg();
     813         160 :       for (MachineInstr &MI : make_range(std::next(I), Paired))
     814         106 :         MI.clearRegisterKills(Reg, TRI);
     815             :     }
     816             :   }
     817        1748 :   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
     818             :             .add(RegOp0)
     819             :             .add(RegOp1)
     820             :             .add(BaseRegOp)
     821         874 :             .addImm(OffsetImm)
     822         874 :             .setMemRefs(I->mergeMemRefsWith(*Paired))
     823         874 :             .setMIFlags(I->mergeFlagsWith(*Paired));
     824             : 
     825             :   (void)MIB;
     826             : 
     827             :   LLVM_DEBUG(
     828             :       dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
     829             :   LLVM_DEBUG(I->print(dbgs()));
     830             :   LLVM_DEBUG(dbgs() << "    ");
     831             :   LLVM_DEBUG(Paired->print(dbgs()));
     832             :   LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
     833         874 :   if (SExtIdx != -1) {
     834             :     // Generate the sign extension for the proper result of the ldp.
     835             :     // I.e., with X1, that would be:
     836             :     // %w1 = KILL %w1, implicit-def %x1
     837             :     // %x1 = SBFMXri killed %x1, 0, 31
     838           6 :     MachineOperand &DstMO = MIB->getOperand(SExtIdx);
     839             :     // Right now, DstMO has the extended register, since it comes from an
     840             :     // extended opcode.
     841           6 :     unsigned DstRegX = DstMO.getReg();
     842             :     // Get the W variant of that register.
     843           6 :     unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
     844             :     // Update the result of LDP to use the W instead of the X variant.
     845           6 :     DstMO.setReg(DstRegW);
     846             :     LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
     847             :     LLVM_DEBUG(dbgs() << "\n");
     848             :     // Make the machine verifier happy by providing a definition for
     849             :     // the X register.
     850             :     // Insert this definition right after the generated LDP, i.e., before
     851             :     // InsertionPoint.
     852             :     MachineInstrBuilder MIBKill =
     853          18 :         BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
     854           6 :             .addReg(DstRegW)
     855           6 :             .addReg(DstRegX, RegState::Define);
     856           6 :     MIBKill->getOperand(2).setImplicit();
     857             :     // Create the sign extension.
     858             :     MachineInstrBuilder MIBSXTW =
     859          18 :         BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
     860           6 :             .addReg(DstRegX)
     861             :             .addImm(0)
     862             :             .addImm(31);
     863             :     (void)MIBSXTW;
     864             :     LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
     865             :     LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
     866             :   } else {
     867             :     LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
     868             :   }
     869             :   LLVM_DEBUG(dbgs() << "\n");
     870             : 
     871             :   // Erase the old instructions.
     872         874 :   I->eraseFromParent();
     873         874 :   Paired->eraseFromParent();
     874             : 
     875        1748 :   return NextI;
     876             : }
     877             : 
     878             : MachineBasicBlock::iterator
     879          65 : AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
     880             :                                           MachineBasicBlock::iterator StoreI) {
     881          65 :   MachineBasicBlock::iterator NextI = LoadI;
     882             :   ++NextI;
     883             : 
     884          65 :   int LoadSize = getMemScale(*LoadI);
     885          65 :   int StoreSize = getMemScale(*StoreI);
     886          65 :   unsigned LdRt = getLdStRegOp(*LoadI).getReg();
     887             :   const MachineOperand &StMO = getLdStRegOp(*StoreI);
     888          65 :   unsigned StRt = getLdStRegOp(*StoreI).getReg();
     889         130 :   bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
     890             : 
     891             :   assert((IsStoreXReg ||
     892             :           TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
     893             :          "Unexpected RegClass");
     894             : 
     895             :   MachineInstr *BitExtMI;
     896          65 :   if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
     897             :     // Remove the load, if the destination register of the loads is the same
     898             :     // register for stored value.
     899          13 :     if (StRt == LdRt && LoadSize == 8) {
     900             :       for (MachineInstr &MI : make_range(StoreI->getIterator(),
     901           2 :                                          LoadI->getIterator())) {
     902           6 :         if (MI.killsRegister(StRt, TRI)) {
     903           2 :           MI.clearRegisterKills(StRt, TRI);
     904           2 :           break;
     905             :         }
     906             :       }
     907             :       LLVM_DEBUG(dbgs() << "Remove load instruction:\n    ");
     908             :       LLVM_DEBUG(LoadI->print(dbgs()));
     909             :       LLVM_DEBUG(dbgs() << "\n");
     910           2 :       LoadI->eraseFromParent();
     911           2 :       return NextI;
     912             :     }
     913             :     // Replace the load with a mov if the load and store are in the same size.
     914          11 :     BitExtMI =
     915          22 :         BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
     916          11 :                 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
     917          11 :             .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
     918             :             .add(StMO)
     919             :             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
     920          22 :             .setMIFlags(LoadI->getFlags());
     921             :   } else {
     922             :     // FIXME: Currently we disable this transformation in big-endian targets as
     923             :     // performance and correctness are verified only in little-endian.
     924          52 :     if (!Subtarget->isLittleEndian())
     925           0 :       return NextI;
     926             :     bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
     927             :     assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
     928             :            "Unsupported ld/st match");
     929             :     assert(LoadSize <= StoreSize && "Invalid load size");
     930             :     int UnscaledLdOffset = IsUnscaled
     931          24 :                                ? getLdStOffsetOp(*LoadI).getImm()
     932         104 :                                : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
     933             :     int UnscaledStOffset = IsUnscaled
     934          24 :                                ? getLdStOffsetOp(*StoreI).getImm()
     935         104 :                                : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
     936          52 :     int Width = LoadSize * 8;
     937          52 :     int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
     938          52 :     int Imms = Immr + Width - 1;
     939             :     unsigned DestReg = IsStoreXReg
     940          52 :                            ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
     941             :                                                       &AArch64::GPR64RegClass)
     942             :                            : LdRt;
     943             : 
     944             :     assert((UnscaledLdOffset >= UnscaledStOffset &&
     945             :             (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
     946             :            "Invalid offset");
     947             : 
     948             :     Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
     949             :     Imms = Immr + Width - 1;
     950          52 :     if (UnscaledLdOffset == UnscaledStOffset) {
     951          34 :       uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
     952          17 :                                 | ((Immr) << 6)               // immr
     953          17 :                                 | ((Imms) << 0)               // imms
     954             :           ;
     955             : 
     956             :       BitExtMI =
     957          17 :           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
     958          17 :                   TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
     959          17 :                   DestReg)
     960             :               .add(StMO)
     961          17 :               .addImm(AndMaskEncoded)
     962          17 :               .setMIFlags(LoadI->getFlags());
     963             :     } else {
     964             :       BitExtMI =
     965          35 :           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
     966          35 :                   TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
     967          35 :                   DestReg)
     968             :               .add(StMO)
     969          35 :               .addImm(Immr)
     970          35 :               .addImm(Imms)
     971          35 :               .setMIFlags(LoadI->getFlags());
     972             :     }
     973             :   }
     974             : 
     975             :   // Clear kill flags between store and load.
     976             :   for (MachineInstr &MI : make_range(StoreI->getIterator(),
     977          63 :                                      BitExtMI->getIterator()))
     978         130 :     if (MI.killsRegister(StRt, TRI)) {
     979          58 :       MI.clearRegisterKills(StRt, TRI);
     980          58 :       break;
     981             :     }
     982             : 
     983             :   LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n    ");
     984             :   LLVM_DEBUG(StoreI->print(dbgs()));
     985             :   LLVM_DEBUG(dbgs() << "    ");
     986             :   LLVM_DEBUG(LoadI->print(dbgs()));
     987             :   LLVM_DEBUG(dbgs() << "  with instructions:\n    ");
     988             :   LLVM_DEBUG(StoreI->print(dbgs()));
     989             :   LLVM_DEBUG(dbgs() << "    ");
     990             :   LLVM_DEBUG((BitExtMI)->print(dbgs()));
     991             :   LLVM_DEBUG(dbgs() << "\n");
     992             : 
     993             :   // Erase the old instructions.
     994          63 :   LoadI->eraseFromParent();
     995          63 :   return NextI;
     996             : }
     997             : 
     998             : static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
     999             :   // Convert the byte-offset used by unscaled into an "element" offset used
    1000             :   // by the scaled pair load/store instructions.
    1001        6989 :   if (IsUnscaled) {
    1002             :     // If the byte-offset isn't a multiple of the stride, there's no point
    1003             :     // trying to match it.
    1004         299 :     if (Offset % OffsetStride)
    1005             :       return false;
    1006         217 :     Offset /= OffsetStride;
    1007             :   }
    1008        6907 :   return Offset <= 63 && Offset >= -64;
    1009             : }
    1010             : 
    1011             : // Do alignment, specialized to power of 2 and for signed ints,
    1012             : // avoiding having to do a C-style cast from uint_64t to int when
    1013             : // using alignTo from include/llvm/Support/MathExtras.h.
    1014             : // FIXME: Move this function to include/MathExtras.h?
    1015             : static int alignTo(int Num, int PowOf2) {
    1016          81 :   return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
    1017             : }
    1018             : 
    1019         593 : static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
    1020             :                      AliasAnalysis *AA) {
    1021             :   // One of the instructions must modify memory.
    1022         593 :   if (!MIa.mayStore() && !MIb.mayStore())
    1023             :     return false;
    1024             : 
    1025             :   // Both instructions must be memory operations.
    1026         553 :   if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
    1027             :     return false;
    1028             : 
    1029         553 :   return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
    1030             : }
    1031             : 
    1032             : static bool mayAlias(MachineInstr &MIa,
    1033             :                      SmallVectorImpl<MachineInstr *> &MemInsns,
    1034             :                      AliasAnalysis *AA) {
    1035        1192 :   for (MachineInstr *MIb : MemInsns)
    1036         165 :     if (mayAlias(MIa, *MIb, AA))
    1037             :       return true;
    1038             : 
    1039             :   return false;
    1040             : }
    1041             : 
    1042        1331 : bool AArch64LoadStoreOpt::findMatchingStore(
    1043             :     MachineBasicBlock::iterator I, unsigned Limit,
    1044             :     MachineBasicBlock::iterator &StoreI) {
    1045        1331 :   MachineBasicBlock::iterator B = I->getParent()->begin();
    1046        1331 :   MachineBasicBlock::iterator MBBI = I;
    1047             :   MachineInstr &LoadMI = *I;
    1048        1331 :   unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();
    1049             : 
    1050             :   // If the load is the first instruction in the block, there's obviously
    1051             :   // not any matching store.
    1052        1331 :   if (MBBI == B)
    1053             :     return false;
    1054             : 
    1055             :   // Track which register units have been modified and used between the first
    1056             :   // insn and the second insn.
    1057             :   ModifiedRegUnits.clear();
    1058             :   UsedRegUnits.clear();
    1059             : 
    1060             :   unsigned Count = 0;
    1061             :   do {
    1062             :     --MBBI;
    1063             :     MachineInstr &MI = *MBBI;
    1064             : 
    1065             :     // Don't count transient instructions towards the search limit since there
    1066             :     // may be different numbers of them if e.g. debug information is present.
    1067             :     if (!MI.isTransient())
    1068        3226 :       ++Count;
    1069             : 
    1070             :     // If the load instruction reads directly from the address to which the
    1071             :     // store instruction writes and the stored value is not modified, we can
    1072             :     // promote the load. Since we do not handle stores with pre-/post-index,
    1073             :     // it's unnecessary to check if BaseReg is modified by the store itself.
    1074        4449 :     if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
    1075         508 :         BaseReg == getLdStBaseOp(MI).getReg() &&
    1076        3696 :         isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
    1077         152 :         ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
    1078          65 :       StoreI = MBBI;
    1079          65 :       return true;
    1080             :     }
    1081             : 
    1082        3456 :     if (MI.isCall())
    1083             :       return false;
    1084             : 
    1085             :     // Update modified / uses register units.
    1086        3217 :     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
    1087             : 
    1088             :     // Otherwise, if the base register is modified, we have no match, so
    1089             :     // return early.
    1090        3217 :     if (!ModifiedRegUnits.available(BaseReg))
    1091             :       return false;
    1092             : 
    1093             :     // If we encounter a store aliased with the load, return early.
    1094        2870 :     if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
    1095             :       return false;
    1096        2730 :   } while (MBBI != B && Count < Limit);
    1097             :   return false;
    1098             : }
    1099             : 
    1100             : // Returns true if FirstMI and MI are candidates for merging or pairing.
    1101             : // Otherwise, returns false.
    1102       17458 : static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
    1103             :                                        LdStPairFlags &Flags,
    1104             :                                        const AArch64InstrInfo *TII) {
    1105             :   // If this is volatile or if pairing is suppressed, not a candidate.
    1106       17458 :   if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    1107             :     return false;
    1108             : 
    1109             :   // We should have already checked FirstMI for pair suppression and volatility.
    1110             :   assert(!FirstMI.hasOrderedMemoryRef() &&
    1111             :          !TII->isLdStPairSuppressed(FirstMI) &&
    1112             :          "FirstMI shouldn't get here if either of these checks are true.");
    1113             : 
    1114       13316 :   unsigned OpcA = FirstMI.getOpcode();
    1115       13316 :   unsigned OpcB = MI.getOpcode();
    1116             : 
    1117             :   // Opcodes match: nothing more to check.
    1118       13316 :   if (OpcA == OpcB)
    1119             :     return true;
    1120             : 
    1121             :   // Try to match a sign-extended load/store with a zero-extended load/store.
    1122             :   bool IsValidLdStrOpc, PairIsValidLdStrOpc;
    1123       10562 :   unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
    1124             :   assert(IsValidLdStrOpc &&
    1125             :          "Given Opc should be a Load or Store with an immediate");
    1126             :   // OpcA will be the first instruction in the pair.
    1127       10562 :   if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
    1128           6 :     Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
    1129             :     return true;
    1130             :   }
    1131             : 
    1132             :   // If the second instruction isn't even a mergable/pairable load/store, bail
    1133             :   // out.
    1134       10556 :   if (!PairIsValidLdStrOpc)
    1135             :     return false;
    1136             : 
    1137             :   // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
    1138             :   // offsets.
    1139             :   if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
    1140             :     return false;
    1141             : 
    1142             :   // Try to match an unscaled load/store with a scaled load/store.
    1143        2266 :   return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
    1144         141 :          getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
    1145             : 
    1146             :   // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
    1147             : }
    1148             : 
    1149             : /// Scan the instructions looking for a load/store that can be combined with the
    1150             : /// current instruction into a wider equivalent or a load/store pair.
    1151             : MachineBasicBlock::iterator
    1152        5861 : AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
    1153             :                                       LdStPairFlags &Flags, unsigned Limit,
    1154             :                                       bool FindNarrowMerge) {
    1155        5861 :   MachineBasicBlock::iterator E = I->getParent()->end();
    1156        5861 :   MachineBasicBlock::iterator MBBI = I;
    1157             :   MachineInstr &FirstMI = *I;
    1158             :   ++MBBI;
    1159             : 
    1160        5861 :   bool MayLoad = FirstMI.mayLoad();
    1161             :   bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
    1162        5861 :   unsigned Reg = getLdStRegOp(FirstMI).getReg();
    1163        5861 :   unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
    1164        5861 :   int Offset = getLdStOffsetOp(FirstMI).getImm();
    1165        5861 :   int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
    1166             :   bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
    1167             : 
    1168             :   // Track which register units have been modified and used between the first
    1169             :   // insn (inclusive) and the second insn.
    1170             :   ModifiedRegUnits.clear();
    1171             :   UsedRegUnits.clear();
    1172             : 
    1173             :   // Remember any instructions that read/write memory between FirstMI and MI.
    1174             :   SmallVector<MachineInstr *, 4> MemInsns;
    1175             : 
    1176       20669 :   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    1177             :     MachineInstr &MI = *MBBI;
    1178             : 
    1179             :     // Don't count transient instructions towards the search limit since there
    1180             :     // may be different numbers of them if e.g. debug information is present.
    1181             :     if (!MI.isTransient())
    1182       16667 :       ++Count;
    1183             : 
    1184             :     Flags.setSExtIdx(-1);
    1185       20271 :     if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
    1186             :         getLdStOffsetOp(MI).isImm()) {
    1187             :       assert(MI.mayLoadOrStore() && "Expected memory operation.");
    1188             :       // If we've found another instruction with the same opcode, check to see
    1189             :       // if the base and offset are compatible with our starting instruction.
    1190             :       // These instructions all have scaled immediate operands, so we just
    1191             :       // check for +1/-1. Make sure to check the new instruction offset is
    1192             :       // actually an immediate and not a symbolic reference destined for
    1193             :       // a relocation.
    1194        2805 :       unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
    1195        2805 :       int MIOffset = getLdStOffsetOp(MI).getImm();
    1196             :       bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
    1197        2805 :       if (IsUnscaled != MIIsUnscaled) {
    1198             :         // We're trying to pair instructions that differ in how they are scaled.
    1199             :         // If FirstMI is scaled then scale the offset of MI accordingly.
    1200             :         // Otherwise, do the opposite (i.e., make MI's offset unscaled).
    1201          53 :         int MemSize = getMemScale(MI);
    1202          53 :         if (MIIsUnscaled) {
    1203             :           // If the unscaled offset isn't a multiple of the MemSize, we can't
    1204             :           // pair the operations together: bail and keep looking.
    1205          15 :           if (MIOffset % MemSize) {
    1206           3 :             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
    1207             :                                               UsedRegUnits, TRI);
    1208           3 :             MemInsns.push_back(&MI);
    1209           3 :             continue;
    1210             :           }
    1211           9 :           MIOffset /= MemSize;
    1212             :         } else {
    1213          41 :           MIOffset *= MemSize;
    1214             :         }
    1215             :       }
    1216             : 
    1217        4272 :       if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
    1218        1470 :                                    (Offset + OffsetStride == MIOffset))) {
    1219        1124 :         int MinOffset = Offset < MIOffset ? Offset : MIOffset;
    1220        1124 :         if (FindNarrowMerge) {
    1221             :           // If the alignment requirements of the scaled wide load/store
    1222             :           // instruction can't express the offset of the scaled narrow input,
    1223             :           // bail and keep looking. For promotable zero stores, allow only when
    1224             :           // the stored value is the same (i.e., WZR).
    1225          49 :           if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
    1226          21 :               (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
    1227           7 :             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
    1228             :                                               UsedRegUnits, TRI);
    1229           7 :             MemInsns.push_back(&MI);
    1230           7 :             continue;
    1231             :           }
    1232             :         } else {
    1233             :           // Pairwise instructions have a 7-bit signed offset field. Single
    1234             :           // insns have a 12-bit unsigned offset field.  If the resultant
    1235             :           // immediate offset of merging these instructions is out of range for
    1236             :           // a pairwise instruction, bail and keep looking.
    1237        1102 :           if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
    1238           0 :             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
    1239             :                                               UsedRegUnits, TRI);
    1240           0 :             MemInsns.push_back(&MI);
    1241           0 :             continue;
    1242             :           }
    1243             :           // If the alignment requirements of the paired (scaled) instruction
    1244             :           // can't express the offset of the unscaled input, bail and keep
    1245             :           // looking.
    1246        1163 :           if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
    1247           0 :             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
    1248             :                                               UsedRegUnits, TRI);
    1249           0 :             MemInsns.push_back(&MI);
    1250           0 :             continue;
    1251             :           }
    1252             :         }
    1253             :         // If the destination register of the loads is the same register, bail
    1254             :         // and keep looking. A load-pair instruction with both destination
    1255             :         // registers the same is UNPREDICTABLE and will result in an exception.
    1256        1684 :         if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
    1257          32 :           LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
    1258             :                                             TRI);
    1259          32 :           MemInsns.push_back(&MI);
    1260          32 :           continue;
    1261             :         }
    1262             : 
    1263             :         // If the Rt of the second instruction was not modified or used between
    1264             :         // the two instructions and none of the instructions between the second
    1265             :         // and first alias with the second, we can combine the second into the
    1266             :         // first.
    1267        3075 :         if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
    1268        1386 :             !(MI.mayLoad() &&
    1269        2920 :               !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
    1270         873 :             !mayAlias(MI, MemInsns, AA)) {
    1271             :           Flags.setMergeForward(false);
    1272         858 :           return MBBI;
    1273             :         }
    1274             : 
    1275             :         // Likewise, if the Rt of the first instruction is not modified or used
    1276             :         // between the two instructions and none of the instructions between the
    1277             :         // first and the second alias with the first, we can combine the first
    1278             :         // into the second.
    1279         302 :         if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg()) &&
    1280          36 :             !(MayLoad &&
    1281         342 :               !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
    1282          43 :             !mayAlias(FirstMI, MemInsns, AA)) {
    1283             :           Flags.setMergeForward(true);
    1284          31 :           return MBBI;
    1285             :         }
    1286             :         // Unable to combine these instructions due to interference in between.
    1287             :         // Keep looking.
    1288             :       }
    1289             :     }
    1290             : 
    1291             :     // If the instruction wasn't a matching load or store.  Stop searching if we
    1292             :     // encounter a call instruction that might modify memory.
    1293       16527 :     if (MI.isCall())
    1294         864 :       return E;
    1295             : 
    1296             :     // Update modified / uses register units.
    1297       15663 :     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
    1298             : 
    1299             :     // Otherwise, if the base register is modified, we have no match, so
    1300             :     // return early.
    1301       15663 :     if (!ModifiedRegUnits.available(BaseReg))
    1302         897 :       return E;
    1303             : 
    1304             :     // Update list of instructions that read/write memory.
    1305       14766 :     if (MI.mayLoadOrStore())
    1306        4706 :       MemInsns.push_back(&MI);
    1307             :   }
    1308        3211 :   return E;
    1309             : }
    1310             : 
    1311             : MachineBasicBlock::iterator
    1312         197 : AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
    1313             :                                      MachineBasicBlock::iterator Update,
    1314             :                                      bool IsPreIdx) {
    1315             :   assert((Update->getOpcode() == AArch64::ADDXri ||
    1316             :           Update->getOpcode() == AArch64::SUBXri) &&
    1317             :          "Unexpected base register update instruction to merge!");
    1318         197 :   MachineBasicBlock::iterator NextI = I;
    1319             :   // Return the instruction following the merged instruction, which is
    1320             :   // the instruction following our unmerged load. Unless that's the add/sub
    1321             :   // instruction we're merging, in which case it's the one after that.
    1322         197 :   if (++NextI == Update)
    1323             :     ++NextI;
    1324             : 
    1325         197 :   int Value = Update->getOperand(2).getImm();
    1326             :   assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
    1327             :          "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
    1328         394 :   if (Update->getOpcode() == AArch64::SUBXri)
    1329          50 :     Value = -Value;
    1330             : 
    1331         394 :   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
    1332         108 :                              : getPostIndexedOpcode(I->getOpcode());
    1333             :   MachineInstrBuilder MIB;
    1334         197 :   if (!isPairedLdSt(*I)) {
    1335             :     // Non-paired instruction.
    1336         324 :     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
    1337             :               .add(getLdStRegOp(*Update))
    1338             :               .add(getLdStRegOp(*I))
    1339             :               .add(getLdStBaseOp(*I))
    1340         162 :               .addImm(Value)
    1341         162 :               .setMemRefs(I->memoperands_begin(), I->memoperands_end())
    1342         162 :               .setMIFlags(I->mergeFlagsWith(*Update));
    1343             :   } else {
    1344             :     // Paired instruction.
    1345          35 :     int Scale = getMemScale(*I);
    1346          70 :     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
    1347             :               .add(getLdStRegOp(*Update))
    1348             :               .add(getLdStRegOp(*I, 0))
    1349             :               .add(getLdStRegOp(*I, 1))
    1350             :               .add(getLdStBaseOp(*I))
    1351          35 :               .addImm(Value / Scale)
    1352          35 :               .setMemRefs(I->memoperands_begin(), I->memoperands_end())
    1353          35 :               .setMIFlags(I->mergeFlagsWith(*Update));
    1354             :   }
    1355             :   (void)MIB;
    1356             : 
    1357             :   if (IsPreIdx) {
    1358             :     ++NumPreFolded;
    1359             :     LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
    1360             :   } else {
    1361             :     ++NumPostFolded;
    1362             :     LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
    1363             :   }
    1364             :   LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
    1365             :   LLVM_DEBUG(I->print(dbgs()));
    1366             :   LLVM_DEBUG(dbgs() << "    ");
    1367             :   LLVM_DEBUG(Update->print(dbgs()));
    1368             :   LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
    1369             :   LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    1370             :   LLVM_DEBUG(dbgs() << "\n");
    1371             : 
    1372             :   // Erase the old instructions for the block.
    1373         197 :   I->eraseFromParent();
    1374         197 :   Update->eraseFromParent();
    1375             : 
    1376         197 :   return NextI;
    1377             : }
    1378             : 
    1379       33193 : bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
    1380             :                                                MachineInstr &MI,
    1381             :                                                unsigned BaseReg, int Offset) {
    1382       66386 :   switch (MI.getOpcode()) {
    1383             :   default:
    1384             :     break;
    1385        1558 :   case AArch64::SUBXri:
    1386             :   case AArch64::ADDXri:
    1387             :     // Make sure it's a vanilla immediate operand, not a relocation or
    1388             :     // anything else we can't handle.
    1389        3116 :     if (!MI.getOperand(2).isImm())
    1390             :       break;
    1391             :     // Watch out for 1 << 12 shifted value.
    1392        2814 :     if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
    1393             :       break;
    1394             : 
    1395             :     // The update instruction source and destination register must be the
    1396             :     // same as the load/store base register.
    1397        2017 :     if (MI.getOperand(0).getReg() != BaseReg ||
    1398         626 :         MI.getOperand(1).getReg() != BaseReg)
    1399             :       break;
    1400             : 
    1401         549 :     bool IsPairedInsn = isPairedLdSt(MemMI);
    1402         549 :     int UpdateOffset = MI.getOperand(2).getImm();
    1403         549 :     if (MI.getOpcode() == AArch64::SUBXri)
    1404          51 :       UpdateOffset = -UpdateOffset;
    1405             : 
    1406             :     // For non-paired load/store instructions, the immediate must fit in a
    1407             :     // signed 9-bit integer.
    1408         549 :     if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
    1409             :       break;
    1410             : 
    1411             :     // For paired load/store instructions, the immediate must be a multiple of
    1412             :     // the scaling factor.  The scaled offset must also fit into a signed 7-bit
    1413             :     // integer.
    1414         541 :     if (IsPairedInsn) {
    1415         179 :       int Scale = getMemScale(MemMI);
    1416         179 :       if (UpdateOffset % Scale != 0)
    1417             :         break;
    1418             : 
    1419         179 :       int ScaledOffset = UpdateOffset / Scale;
    1420         179 :       if (ScaledOffset > 63 || ScaledOffset < -64)
    1421             :         break;
    1422             :     }
    1423             : 
    1424             :     // If we have a non-zero Offset, we check that it matches the amount
    1425             :     // we're adding to the register.
    1426         533 :     if (!Offset || Offset == UpdateOffset)
    1427             :       return true;
    1428             :     break;
    1429             :   }
    1430             :   return false;
    1431             : }
    1432             : 
    1433       16949 : MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    1434             :     MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
    1435       16949 :   MachineBasicBlock::iterator E = I->getParent()->end();
    1436             :   MachineInstr &MemMI = *I;
    1437       16949 :   MachineBasicBlock::iterator MBBI = I;
    1438             : 
    1439       16949 :   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
    1440       16949 :   int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
    1441             : 
    1442             :   // Scan forward looking for post-index opportunities.  Updating instructions
    1443             :   // can't be formed if the memory instruction doesn't have the offset we're
    1444             :   // looking for.
    1445       16949 :   if (MIUnscaledOffset != UnscaledOffset)
    1446        4508 :     return E;
    1447             : 
    1448             :   // If the base register overlaps a destination register, we can't
    1449             :   // merge the update.
    1450             :   bool IsPairedInsn = isPairedLdSt(MemMI);
    1451       42233 :   for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    1452       15156 :     unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    1453       30182 :     if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
    1454         260 :       return E;
    1455             :   }
    1456             : 
    1457             :   // Track which register units have been modified and used between the first
    1458             :   // insn (inclusive) and the second insn.
    1459             :   ModifiedRegUnits.clear();
    1460             :   UsedRegUnits.clear();
    1461             :   ++MBBI;
    1462       33592 :   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    1463             :     MachineInstr &MI = *MBBI;
    1464             : 
    1465             :     // Don't count transient instructions towards the search limit since there
    1466             :     // may be different numbers of them if e.g. debug information is present.
    1467             :     if (!MI.isTransient())
    1468       26030 :       ++Count;
    1469             : 
    1470             :     // If we found a match, return it.
    1471       27367 :     if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
    1472         145 :       return MBBI;
    1473             : 
    1474             :     // Update the status of what the instruction clobbered and used.
    1475       27222 :     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
    1476             : 
    1477             :     // Otherwise, if the base register is used or modified, we have no match, so
    1478             :     // return early.
    1479       52625 :     if (!ModifiedRegUnits.available(BaseReg) ||
    1480       25403 :         !UsedRegUnits.available(BaseReg))
    1481        5811 :       return E;
    1482             :   }
    1483        6225 :   return E;
    1484             : }
    1485             : 
    1486        8319 : MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    1487             :     MachineBasicBlock::iterator I, unsigned Limit) {
    1488        8319 :   MachineBasicBlock::iterator B = I->getParent()->begin();
    1489             :   MachineBasicBlock::iterator E = I->getParent()->end();
    1490             :   MachineInstr &MemMI = *I;
    1491        8319 :   MachineBasicBlock::iterator MBBI = I;
    1492             : 
    1493        8319 :   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
    1494        8319 :   int Offset = getLdStOffsetOp(MemMI).getImm();
    1495             : 
    1496             :   // If the load/store is the first instruction in the block, there's obviously
    1497             :   // not any matching update. Ditto if the memory offset isn't zero.
    1498        8319 :   if (MBBI == B || Offset != 0)
    1499        5640 :     return E;
    1500             :   // If the base register overlaps a destination register, we can't
    1501             :   // merge the update.
    1502             :   bool IsPairedInsn = isPairedLdSt(MemMI);
    1503        8387 :   for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    1504        2931 :     unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    1505        5816 :     if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
    1506          77 :       return E;
    1507             :   }
    1508             : 
    1509             :   // Track which register units have been modified and used between the first
    1510             :   // insn (inclusive) and the second insn.
    1511             :   ModifiedRegUnits.clear();
    1512             :   UsedRegUnits.clear();
    1513             :   unsigned Count = 0;
    1514             :   do {
    1515             :     --MBBI;
    1516             :     MachineInstr &MI = *MBBI;
    1517             : 
    1518             :     // Don't count transient instructions towards the search limit since there
    1519             :     // may be different numbers of them if e.g. debug information is present.
    1520             :     if (!MI.isTransient())
    1521        5103 :       ++Count;
    1522             : 
    1523             :     // If we found a match, return it.
    1524        5826 :     if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
    1525          52 :       return MBBI;
    1526             : 
    1527             :     // Update the status of what the instruction clobbered and used.
    1528        5774 :     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
    1529             : 
    1530             :     // Otherwise, if the base register is used or modified, we have no match, so
    1531             :     // return early.
    1532       11054 :     if (!ModifiedRegUnits.available(BaseReg) ||
    1533        5280 :         !UsedRegUnits.available(BaseReg))
    1534        1074 :       return E;
    1535        7924 :   } while (MBBI != B && Count < Limit);
    1536        1476 :   return E;
    1537             : }
    1538             : 
    1539        2396 : bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    1540             :     MachineBasicBlock::iterator &MBBI) {
    1541             :   MachineInstr &MI = *MBBI;
    1542             :   // If this is a volatile load, don't mess with it.
    1543        2396 :   if (MI.hasOrderedMemoryRef())
    1544             :     return false;
    1545             : 
    1546             :   // Make sure this is a reg+imm.
    1547             :   // FIXME: It is possible to extend it to handle reg+reg cases.
    1548        1613 :   if (!getLdStOffsetOp(MI).isImm())
    1549             :     return false;
    1550             : 
    1551             :   // Look backward up to LdStLimit instructions.
    1552             :   MachineBasicBlock::iterator StoreI;
    1553        1331 :   if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
    1554             :     ++NumLoadsFromStoresPromoted;
    1555             :     // Promote the load. Keeping the iterator straight is a
    1556             :     // pain, so we let the merge routine tell us what the next instruction
    1557             :     // is after it's done mucking about.
    1558          65 :     MBBI = promoteLoadFromStore(MBBI, StoreI);
    1559          65 :     return true;
    1560             :   }
    1561             :   return false;
    1562             : }
    1563             : 
    1564             : // Merge adjacent zero stores into a wider store.
    1565          96 : bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
    1566             :     MachineBasicBlock::iterator &MBBI) {
    1567             :   assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
    1568             :   MachineInstr &MI = *MBBI;
    1569          96 :   MachineBasicBlock::iterator E = MI.getParent()->end();
    1570             : 
    1571          96 :   if (!TII->isCandidateToMergeOrPair(MI))
    1572             :     return false;
    1573             : 
    1574             :   // Look ahead up to LdStLimit instructions for a mergable instruction.
    1575          87 :   LdStPairFlags Flags;
    1576             :   MachineBasicBlock::iterator MergeMI =
    1577          87 :       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
    1578          87 :   if (MergeMI != E) {
    1579             :     ++NumZeroStoresPromoted;
    1580             : 
    1581             :     // Keeping the iterator straight is a pain, so we let the merge routine tell
    1582             :     // us what the next instruction is after it's done mucking about.
    1583          15 :     MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
    1584          15 :     return true;
    1585             :   }
    1586             :   return false;
    1587             : }
    1588             : 
    1589             : // Find loads and stores that can be merged into a single load or store pair
    1590             : // instruction.
    1591        8431 : bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
    1592             :   MachineInstr &MI = *MBBI;
    1593        8431 :   MachineBasicBlock::iterator E = MI.getParent()->end();
    1594             : 
    1595        8431 :   if (!TII->isCandidateToMergeOrPair(MI))
    1596             :     return false;
    1597             : 
    1598             :   // Early exit if the offset is not possible to match. (6 bits of positive
    1599             :   // range, plus allow an extra one in case we find a later insn that matches
    1600             :   // with Offset-1)
    1601             :   bool IsUnscaled = TII->isUnscaledLdSt(MI);
    1602        5887 :   int Offset = getLdStOffsetOp(MI).getImm();
    1603        5887 :   int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
    1604             :   // Allow one more for offset.
    1605        5887 :   if (Offset > 0)
    1606        2125 :     Offset -= OffsetStride;
    1607        5805 :   if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    1608             :     return false;
    1609             : 
    1610             :   // Look ahead up to LdStLimit instructions for a pairable instruction.
    1611        5774 :   LdStPairFlags Flags;
    1612             :   MachineBasicBlock::iterator Paired =
    1613        5774 :       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
    1614        5774 :   if (Paired != E) {
    1615             :     ++NumPairCreated;
    1616             :     if (TII->isUnscaledLdSt(MI))
    1617             :       ++NumUnscaledPairCreated;
    1618             :     // Keeping the iterator straight is a pain, so we let the merge routine tell
    1619             :     // us what the next instruction is after it's done mucking about.
    1620         874 :     MBBI = mergePairedInsns(MBBI, Paired, Flags);
    1621         874 :     return true;
    1622             :   }
    1623             :   return false;
    1624             : }
    1625             : 
    1626        8682 : bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
    1627             :     (MachineBasicBlock::iterator &MBBI) {
    1628             :   MachineInstr &MI = *MBBI;
    1629        8682 :   MachineBasicBlock::iterator E = MI.getParent()->end();
    1630             :   MachineBasicBlock::iterator Update;
    1631             : 
    1632             :   // Look forward to try to form a post-index instruction. For example,
    1633             :   // ldr x0, [x20]
    1634             :   // add x20, x20, #32
    1635             :   //   merged into:
    1636             :   // ldr x0, [x20], #32
    1637        8682 :   Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
    1638        8682 :   if (Update != E) {
    1639             :     // Merge the update into the ld/st.
    1640         108 :     MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
    1641         108 :     return true;
    1642             :   }
    1643             : 
    1644             :   // Don't know how to handle unscaled pre/post-index versions below, so bail.
    1645       17148 :   if (TII->isUnscaledLdSt(MI.getOpcode()))
    1646             :     return false;
    1647             : 
    1648             :   // Look back to try to find a pre-index instruction. For example,
    1649             :   // add x0, x0, #8
    1650             :   // ldr x1, [x0]
    1651             :   //   merged into:
    1652             :   // ldr x1, [x0, #8]!
    1653        8319 :   Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
    1654        8319 :   if (Update != E) {
    1655             :     // Merge the update into the ld/st.
    1656          52 :     MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    1657          52 :     return true;
    1658             :   }
    1659             : 
    1660             :   // The immediate in the load/store is scaled by the size of the memory
    1661             :   // operation. The immediate in the add we're looking for,
    1662             :   // however, is not, so adjust here.
    1663        8267 :   int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
    1664             : 
    1665             :   // Look forward to try to find a post-index instruction. For example,
    1666             :   // ldr x1, [x0, #64]
    1667             :   // add x0, x0, #64
    1668             :   //   merged into:
    1669             :   // ldr x1, [x0, #64]!
    1670        8267 :   Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
    1671        8267 :   if (Update != E) {
    1672             :     // Merge the update into the ld/st.
    1673          37 :     MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    1674          37 :     return true;
    1675             :   }
    1676             : 
    1677             :   return false;
    1678             : }
    1679             : 
    1680       15005 : bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
    1681             :                                         bool EnableNarrowZeroStOpt) {
    1682             :   bool Modified = false;
    1683             :   // Four tranformations to do here:
    1684             :   // 1) Find loads that directly read from stores and promote them by
    1685             :   //    replacing with mov instructions. If the store is wider than the load,
    1686             :   //    the load will be replaced with a bitfield extract.
    1687             :   //      e.g.,
    1688             :   //        str w1, [x0, #4]
    1689             :   //        ldrh w2, [x0, #6]
    1690             :   //        ; becomes
    1691             :   //        str w1, [x0, #4]
    1692             :   //        lsr w2, w1, #16
    1693       15005 :   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
    1694       83457 :        MBBI != E;) {
    1695       68452 :     if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
    1696             :       Modified = true;
    1697             :     else
    1698             :       ++MBBI;
    1699             :   }
    1700             :   // 2) Merge adjacent zero stores into a wider store.
    1701             :   //      e.g.,
    1702             :   //        strh wzr, [x0]
    1703             :   //        strh wzr, [x0, #2]
    1704             :   //        ; becomes
    1705             :   //        str wzr, [x0]
    1706             :   //      e.g.,
    1707             :   //        str wzr, [x0]
    1708             :   //        str wzr, [x0, #4]
    1709             :   //        ; becomes
    1710             :   //        str xzr, [x0]
    1711       15005 :   if (EnableNarrowZeroStOpt)
    1712       14820 :     for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
    1713       82518 :          MBBI != E;) {
    1714          96 :       if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
    1715             :         Modified = true;
    1716             :       else
    1717             :         ++MBBI;
    1718             :     }
    1719             :   // 3) Find loads and stores that can be merged into a single load or store
    1720             :   //    pair instruction.
    1721             :   //      e.g.,
    1722             :   //        ldr x0, [x2]
    1723             :   //        ldr x1, [x2, #8]
    1724             :   //        ; becomes
    1725             :   //        ldp x0, x1, [x2]
    1726       15005 :   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
    1727       82597 :        MBBI != E;) {
    1728       67592 :     if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
    1729             :       Modified = true;
    1730             :     else
    1731             :       ++MBBI;
    1732             :   }
    1733             :   // 4) Find base register updates that can be merged into the load or store
    1734             :   //    as a base-reg writeback.
    1735             :   //      e.g.,
    1736             :   //        ldr x0, [x2]
    1737             :   //        add x2, x2, #4
    1738             :   //        ; becomes
    1739             :   //        ldr x0, [x2], #4
    1740       15005 :   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
    1741       82433 :        MBBI != E;) {
    1742       67428 :     if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
    1743             :       Modified = true;
    1744             :     else
    1745             :       ++MBBI;
    1746             :   }
    1747             : 
    1748       15005 :   return Modified;
    1749             : }
    1750             : 
    1751       12948 : bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
    1752       12948 :   if (skipFunction(Fn.getFunction()))
    1753             :     return false;
    1754             : 
    1755       12943 :   Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
    1756       12943 :   TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
    1757       12943 :   TRI = Subtarget->getRegisterInfo();
    1758       25886 :   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    1759             : 
    1760             :   // Resize the modified and used register unit trackers.  We do this once
    1761             :   // per function and then clear the register units each time we optimize a load
    1762             :   // or store.
    1763       12943 :   ModifiedRegUnits.init(*TRI);
    1764       12943 :   UsedRegUnits.init(*TRI);
    1765             : 
    1766             :   bool Modified = false;
    1767       12943 :   bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
    1768       27948 :   for (auto &MBB : Fn)
    1769       15005 :     Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt);
    1770             : 
    1771             :   return Modified;
    1772             : }
    1773             : 
    1774             : // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
    1775             : // stores near one another?  Note: The pre-RA instruction scheduler already has
    1776             : // hooks to try and schedule pairable loads/stores together to improve pairing
    1777             : // opportunities.  Thus, pre-RA pairing pass may not be worth the effort.
    1778             : 
    1779             : // FIXME: When pairing store instructions it's very possible for this pass to
    1780             : // hoist a store with a KILL marker above another use (without a KILL marker).
    1781             : // The resulting IR is invalid, but nothing uses the KILL markers after this
    1782             : // pass, so it's never caused a problem in practice.
    1783             : 
    1784             : /// createAArch64LoadStoreOptimizationPass - returns an instance of the
    1785             : /// load / store optimization pass.
    1786        1023 : FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
    1787        1023 :   return new AArch64LoadStoreOpt();
    1788      303507 : }

Generated by: LCOV version 1.13