LCOV - code coverage report
Current view: top level - lib/Target/AArch64 - AArch64LoadStoreOptimizer.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 652 699 93.3 %
Date: 2017-09-14 15:23:50 Functions: 36 36 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // This file contains a pass that performs load / store related peephole
      11             : // optimizations. This pass should be run after register allocation.
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #include "AArch64InstrInfo.h"
      16             : #include "AArch64Subtarget.h"
      17             : #include "MCTargetDesc/AArch64AddressingModes.h"
      18             : #include "llvm/ADT/BitVector.h"
      19             : #include "llvm/ADT/SmallVector.h"
      20             : #include "llvm/ADT/Statistic.h"
      21             : #include "llvm/ADT/StringRef.h"
      22             : #include "llvm/ADT/iterator_range.h"
      23             : #include "llvm/Analysis/AliasAnalysis.h"
      24             : #include "llvm/CodeGen/MachineBasicBlock.h"
      25             : #include "llvm/CodeGen/MachineFunction.h"
      26             : #include "llvm/CodeGen/MachineFunctionPass.h"
      27             : #include "llvm/CodeGen/MachineInstr.h"
      28             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      29             : #include "llvm/CodeGen/MachineOperand.h"
      30             : #include "llvm/IR/DebugLoc.h"
      31             : #include "llvm/MC/MCRegisterInfo.h"
      32             : #include "llvm/Pass.h"
      33             : #include "llvm/Support/CommandLine.h"
      34             : #include "llvm/Support/Debug.h"
      35             : #include "llvm/Support/ErrorHandling.h"
      36             : #include "llvm/Support/raw_ostream.h"
      37             : #include "llvm/Target/TargetRegisterInfo.h"
      38             : #include <cassert>
      39             : #include <cstdint>
      40             : #include <iterator>
      41             : #include <limits>
      42             : 
      43             : using namespace llvm;
      44             : 
      45             : #define DEBUG_TYPE "aarch64-ldst-opt"
      46             : 
      47             : STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
      48             : STATISTIC(NumPostFolded, "Number of post-index updates folded");
      49             : STATISTIC(NumPreFolded, "Number of pre-index updates folded");
      50             : STATISTIC(NumUnscaledPairCreated,
      51             :           "Number of load/store from unscaled generated");
      52             : STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
      53             : STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
      54             : 
      55             : // The LdStLimit limits how far we search for load/store pairs.
      56       72306 : static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
      57      144612 :                                    cl::init(20), cl::Hidden);
      58             : 
      59             : // The UpdateLimit limits how far we search for update instructions when we form
      60             : // pre-/post-index instructions.
      61      289224 : static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
      62      216918 :                                      cl::Hidden);
      63             : 
      64             : #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
      65             : 
      66             : namespace {
      67             : 
      68             : using LdStPairFlags = struct LdStPairFlags {
      69             :   // If a matching instruction is found, MergeForward is set to true if the
      70             :   // merge is to remove the first instruction and replace the second with
      71             :   // a pair-wise insn, and false if the reverse is true.
      72             :   bool MergeForward = false;
      73             : 
      74             :   // SExtIdx gives the index of the result of the load pair that must be
      75             :   // extended. The value of SExtIdx assumes that the paired load produces the
      76             :   // value in this order: (I, returned iterator), i.e., -1 means no value has
      77             :   // to be extended, 0 means I, and 1 means the returned iterator.
      78             :   int SExtIdx = -1;
      79             : 
      80             :   LdStPairFlags() = default;
      81             : 
      82         858 :   void setMergeForward(bool V = true) { MergeForward = V; }
      83             :   bool getMergeForward() const { return MergeForward; }
      84             : 
      85       16136 :   void setSExtIdx(int V) { SExtIdx = V; }
      86             :   int getSExtIdx() const { return SExtIdx; }
      87             : };
      88             : 
      89        3616 : struct AArch64LoadStoreOpt : public MachineFunctionPass {
      90             :   static char ID;
      91             : 
      92        2733 :   AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
      93         911 :     initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
      94         911 :   }
      95             : 
      96             :   AliasAnalysis *AA;
      97             :   const AArch64InstrInfo *TII;
      98             :   const TargetRegisterInfo *TRI;
      99             :   const AArch64Subtarget *Subtarget;
     100             : 
     101             :   // Track which registers have been modified and used.
     102             :   BitVector ModifiedRegs, UsedRegs;
     103             : 
     104         901 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     105         901 :     AU.addRequired<AAResultsWrapperPass>();
     106         901 :     MachineFunctionPass::getAnalysisUsage(AU);
     107         901 :   }
     108             : 
     109             :   // Scan the instructions looking for a load/store that can be combined
     110             :   // with the current instruction into a load/store pair.
     111             :   // Return the matching instruction if one is found, else MBB->end().
     112             :   MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
     113             :                                                LdStPairFlags &Flags,
     114             :                                                unsigned Limit,
     115             :                                                bool FindNarrowMerge);
     116             : 
     117             :   // Scan the instructions looking for a store that writes to the address from
     118             :   // which the current load instruction reads. Return true if one is found.
     119             :   bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
     120             :                          MachineBasicBlock::iterator &StoreI);
     121             : 
     122             :   // Merge the two instructions indicated into a wider narrow store instruction.
     123             :   MachineBasicBlock::iterator
     124             :   mergeNarrowZeroStores(MachineBasicBlock::iterator I,
     125             :                         MachineBasicBlock::iterator MergeMI,
     126             :                         const LdStPairFlags &Flags);
     127             : 
     128             :   // Merge the two instructions indicated into a single pair-wise instruction.
     129             :   MachineBasicBlock::iterator
     130             :   mergePairedInsns(MachineBasicBlock::iterator I,
     131             :                    MachineBasicBlock::iterator Paired,
     132             :                    const LdStPairFlags &Flags);
     133             : 
     134             :   // Promote the load that reads directly from the address stored to.
     135             :   MachineBasicBlock::iterator
     136             :   promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
     137             :                        MachineBasicBlock::iterator StoreI);
     138             : 
     139             :   // Scan the instruction list to find a base register update that can
     140             :   // be combined with the current instruction (a load or store) using
     141             :   // pre or post indexed addressing with writeback. Scan forwards.
     142             :   MachineBasicBlock::iterator
     143             :   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
     144             :                                 int UnscaledOffset, unsigned Limit);
     145             : 
     146             :   // Scan the instruction list to find a base register update that can
     147             :   // be combined with the current instruction (a load or store) using
     148             :   // pre or post indexed addressing with writeback. Scan backwards.
     149             :   MachineBasicBlock::iterator
     150             :   findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
     151             : 
     152             :   // Find an instruction that updates the base register of the ld/st
     153             :   // instruction.
     154             :   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
     155             :                             unsigned BaseReg, int Offset);
     156             : 
     157             :   // Merge a pre- or post-index base register update into a ld/st instruction.
     158             :   MachineBasicBlock::iterator
     159             :   mergeUpdateInsn(MachineBasicBlock::iterator I,
     160             :                   MachineBasicBlock::iterator Update, bool IsPreIdx);
     161             : 
     162             :   // Find and merge zero store instructions.
     163             :   bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
     164             : 
     165             :   // Find and pair ldr/str instructions.
     166             :   bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
     167             : 
     168             :   // Find and promote load instructions which read directly from store.
     169             :   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
     170             : 
     171             :   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
     172             : 
     173             :   bool runOnMachineFunction(MachineFunction &Fn) override;
     174             : 
     175         901 :   MachineFunctionProperties getRequiredProperties() const override {
     176        2703 :     return MachineFunctionProperties().set(
     177        2703 :         MachineFunctionProperties::Property::NoVRegs);
     178             :   }
     179             : 
     180         907 :   StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
     181             : };
     182             : 
     183             : char AArch64LoadStoreOpt::ID = 0;
     184             : 
     185             : } // end anonymous namespace
     186             : 
     187      315286 : INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
     188             :                 AARCH64_LOAD_STORE_OPT_NAME, false, false)
     189             : 
     190             : static bool isNarrowStore(unsigned Opc) {
     191             :   switch (Opc) {
     192             :   default:
     193             :     return false;
     194             :   case AArch64::STRBBui:
     195             :   case AArch64::STURBBi:
     196             :   case AArch64::STRHHui:
     197             :   case AArch64::STURHHi:
     198             :     return true;
     199             :   }
     200             : }
     201             : 
     202             : // Scaling factor for unscaled load or store.
     203       24144 : static int getMemScale(MachineInstr &MI) {
     204       48288 :   switch (MI.getOpcode()) {
     205           0 :   default:
     206           0 :     llvm_unreachable("Opcode has unknown scale!");
     207             :   case AArch64::LDRBBui:
     208             :   case AArch64::LDURBBi:
     209             :   case AArch64::LDRSBWui:
     210             :   case AArch64::LDURSBWi:
     211             :   case AArch64::STRBBui:
     212             :   case AArch64::STURBBi:
     213             :     return 1;
     214         319 :   case AArch64::LDRHHui:
     215             :   case AArch64::LDURHHi:
     216             :   case AArch64::LDRSHWui:
     217             :   case AArch64::LDURSHWi:
     218             :   case AArch64::STRHHui:
     219             :   case AArch64::STURHHi:
     220         319 :     return 2;
     221        3193 :   case AArch64::LDRSui:
     222             :   case AArch64::LDURSi:
     223             :   case AArch64::LDRSWui:
     224             :   case AArch64::LDURSWi:
     225             :   case AArch64::LDRWui:
     226             :   case AArch64::LDURWi:
     227             :   case AArch64::STRSui:
     228             :   case AArch64::STURSi:
     229             :   case AArch64::STRWui:
     230             :   case AArch64::STURWi:
     231             :   case AArch64::LDPSi:
     232             :   case AArch64::LDPSWi:
     233             :   case AArch64::LDPWi:
     234             :   case AArch64::STPSi:
     235             :   case AArch64::STPWi:
     236        3193 :     return 4;
     237       14745 :   case AArch64::LDRDui:
     238             :   case AArch64::LDURDi:
     239             :   case AArch64::LDRXui:
     240             :   case AArch64::LDURXi:
     241             :   case AArch64::STRDui:
     242             :   case AArch64::STURDi:
     243             :   case AArch64::STRXui:
     244             :   case AArch64::STURXi:
     245             :   case AArch64::LDPDi:
     246             :   case AArch64::LDPXi:
     247             :   case AArch64::STPDi:
     248             :   case AArch64::STPXi:
     249       14745 :     return 8;
     250        5351 :   case AArch64::LDRQui:
     251             :   case AArch64::LDURQi:
     252             :   case AArch64::STRQui:
     253             :   case AArch64::STURQi:
     254             :   case AArch64::LDPQi:
     255             :   case AArch64::STPQi:
     256        5351 :     return 16;
     257             :   }
     258             : }
     259             : 
     260       18436 : static unsigned getMatchingNonSExtOpcode(unsigned Opc,
     261             :                                          bool *IsValidLdStrOpc = nullptr) {
     262       18436 :   if (IsValidLdStrOpc)
     263       18430 :     *IsValidLdStrOpc = true;
     264       18436 :   switch (Opc) {
     265        7076 :   default:
     266        7076 :     if (IsValidLdStrOpc)
     267        7076 :       *IsValidLdStrOpc = false;
     268             :     return std::numeric_limits<unsigned>::max();
     269             :   case AArch64::STRDui:
     270             :   case AArch64::STURDi:
     271             :   case AArch64::STRQui:
     272             :   case AArch64::STURQi:
     273             :   case AArch64::STRBBui:
     274             :   case AArch64::STURBBi:
     275             :   case AArch64::STRHHui:
     276             :   case AArch64::STURHHi:
     277             :   case AArch64::STRWui:
     278             :   case AArch64::STURWi:
     279             :   case AArch64::STRXui:
     280             :   case AArch64::STURXi:
     281             :   case AArch64::LDRDui:
     282             :   case AArch64::LDURDi:
     283             :   case AArch64::LDRQui:
     284             :   case AArch64::LDURQi:
     285             :   case AArch64::LDRWui:
     286             :   case AArch64::LDURWi:
     287             :   case AArch64::LDRXui:
     288             :   case AArch64::LDURXi:
     289             :   case AArch64::STRSui:
     290             :   case AArch64::STURSi:
     291             :   case AArch64::LDRSui:
     292             :   case AArch64::LDURSi:
     293             :     return Opc;
     294          13 :   case AArch64::LDRSWui:
     295          13 :     return AArch64::LDRWui;
     296           9 :   case AArch64::LDURSWi:
     297           9 :     return AArch64::LDURWi;
     298             :   }
     299             : }
     300             : 
     301             : static unsigned getMatchingWideOpcode(unsigned Opc) {
     302          21 :   switch (Opc) {
     303           0 :   default:
     304           0 :     llvm_unreachable("Opcode has no wide equivalent!");
     305             :   case AArch64::STRBBui:
     306             :     return AArch64::STRHHui;
     307           1 :   case AArch64::STRHHui:
     308             :     return AArch64::STRWui;
     309           0 :   case AArch64::STURBBi:
     310             :     return AArch64::STURHHi;
     311           3 :   case AArch64::STURHHi:
     312             :     return AArch64::STURWi;
     313           3 :   case AArch64::STURWi:
     314             :     return AArch64::STURXi;
     315          14 :   case AArch64::STRWui:
     316             :     return AArch64::STRXui;
     317             :   }
     318             : }
     319             : 
     320        1085 : static unsigned getMatchingPairOpcode(unsigned Opc) {
     321        1085 :   switch (Opc) {
     322           0 :   default:
     323           0 :     llvm_unreachable("Opcode has no pairwise equivalent!");
     324             :   case AArch64::STRSui:
     325             :   case AArch64::STURSi:
     326             :     return AArch64::STPSi;
     327          65 :   case AArch64::STRDui:
     328             :   case AArch64::STURDi:
     329          65 :     return AArch64::STPDi;
     330         155 :   case AArch64::STRQui:
     331             :   case AArch64::STURQi:
     332         155 :     return AArch64::STPQi;
     333         109 :   case AArch64::STRWui:
     334             :   case AArch64::STURWi:
     335         109 :     return AArch64::STPWi;
     336         211 :   case AArch64::STRXui:
     337             :   case AArch64::STURXi:
     338         211 :     return AArch64::STPXi;
     339          57 :   case AArch64::LDRSui:
     340             :   case AArch64::LDURSi:
     341          57 :     return AArch64::LDPSi;
     342         153 :   case AArch64::LDRDui:
     343             :   case AArch64::LDURDi:
     344         153 :     return AArch64::LDPDi;
     345         119 :   case AArch64::LDRQui:
     346             :   case AArch64::LDURQi:
     347         119 :     return AArch64::LDPQi;
     348          44 :   case AArch64::LDRWui:
     349             :   case AArch64::LDURWi:
     350          44 :     return AArch64::LDPWi;
     351         144 :   case AArch64::LDRXui:
     352             :   case AArch64::LDURXi:
     353         144 :     return AArch64::LDPXi;
     354          10 :   case AArch64::LDRSWui:
     355             :   case AArch64::LDURSWi:
     356          10 :     return AArch64::LDPSWi;
     357             :   }
     358             : }
     359             : 
     360         394 : static unsigned isMatchingStore(MachineInstr &LoadInst,
     361             :                                 MachineInstr &StoreInst) {
     362         788 :   unsigned LdOpc = LoadInst.getOpcode();
     363         788 :   unsigned StOpc = StoreInst.getOpcode();
     364         394 :   switch (LdOpc) {
     365           0 :   default:
     366           0 :     llvm_unreachable("Unsupported load instruction!");
     367          18 :   case AArch64::LDRBBui:
     368          18 :     return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
     369          18 :            StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
     370          15 :   case AArch64::LDURBBi:
     371          15 :     return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
     372          15 :            StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
     373          15 :   case AArch64::LDRHHui:
     374          15 :     return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
     375          15 :            StOpc == AArch64::STRXui;
     376           7 :   case AArch64::LDURHHi:
     377           7 :     return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
     378           7 :            StOpc == AArch64::STURXi;
     379          65 :   case AArch64::LDRWui:
     380          65 :     return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
     381           4 :   case AArch64::LDURWi:
     382           4 :     return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
     383         267 :   case AArch64::LDRXui:
     384         267 :     return StOpc == AArch64::STRXui;
     385           3 :   case AArch64::LDURXi:
     386           3 :     return StOpc == AArch64::STURXi;
     387             :   }
     388             : }
     389             : 
     390          82 : static unsigned getPreIndexedOpcode(unsigned Opc) {
     391             :   // FIXME: We don't currently support creating pre-indexed loads/stores when
     392             :   // the load or store is the unscaled version.  If we decide to perform such an
     393             :   // optimization in the future the cases for the unscaled loads/stores will
     394             :   // need to be added here.
     395          82 :   switch (Opc) {
     396           0 :   default:
     397           0 :     llvm_unreachable("Opcode has no pre-indexed equivalent!");
     398             :   case AArch64::STRSui:
     399             :     return AArch64::STRSpre;
     400           6 :   case AArch64::STRDui:
     401           6 :     return AArch64::STRDpre;
     402           8 :   case AArch64::STRQui:
     403           8 :     return AArch64::STRQpre;
     404           2 :   case AArch64::STRBBui:
     405           2 :     return AArch64::STRBBpre;
     406           2 :   case AArch64::STRHHui:
     407           2 :     return AArch64::STRHHpre;
     408           8 :   case AArch64::STRWui:
     409           8 :     return AArch64::STRWpre;
     410           9 :   case AArch64::STRXui:
     411           9 :     return AArch64::STRXpre;
     412           6 :   case AArch64::LDRSui:
     413           6 :     return AArch64::LDRSpre;
     414           6 :   case AArch64::LDRDui:
     415           6 :     return AArch64::LDRDpre;
     416           6 :   case AArch64::LDRQui:
     417           6 :     return AArch64::LDRQpre;
     418           2 :   case AArch64::LDRBBui:
     419           2 :     return AArch64::LDRBBpre;
     420           2 :   case AArch64::LDRHHui:
     421           2 :     return AArch64::LDRHHpre;
     422           6 :   case AArch64::LDRWui:
     423           6 :     return AArch64::LDRWpre;
     424           6 :   case AArch64::LDRXui:
     425           6 :     return AArch64::LDRXpre;
     426           0 :   case AArch64::LDRSWui:
     427           0 :     return AArch64::LDRSWpre;
     428           0 :   case AArch64::LDPSi:
     429           0 :     return AArch64::LDPSpre;
     430           0 :   case AArch64::LDPSWi:
     431           0 :     return AArch64::LDPSWpre;
     432           0 :   case AArch64::LDPDi:
     433           0 :     return AArch64::LDPDpre;
     434           0 :   case AArch64::LDPQi:
     435           0 :     return AArch64::LDPQpre;
     436           2 :   case AArch64::LDPWi:
     437           2 :     return AArch64::LDPWpre;
     438           0 :   case AArch64::LDPXi:
     439           0 :     return AArch64::LDPXpre;
     440           0 :   case AArch64::STPSi:
     441           0 :     return AArch64::STPSpre;
     442           0 :   case AArch64::STPDi:
     443           0 :     return AArch64::STPDpre;
     444           1 :   case AArch64::STPQi:
     445           1 :     return AArch64::STPQpre;
     446           2 :   case AArch64::STPWi:
     447           2 :     return AArch64::STPWpre;
     448           2 :   case AArch64::STPXi:
     449           2 :     return AArch64::STPXpre;
     450             :   }
     451             : }
     452             : 
     453          99 : static unsigned getPostIndexedOpcode(unsigned Opc) {
     454          99 :   switch (Opc) {
     455           0 :   default:
     456           0 :     llvm_unreachable("Opcode has no post-indexed wise equivalent!");
     457             :   case AArch64::STRSui:
     458             :   case AArch64::STURSi:
     459             :     return AArch64::STRSpost;
     460           5 :   case AArch64::STRDui:
     461             :   case AArch64::STURDi:
     462           5 :     return AArch64::STRDpost;
     463          17 :   case AArch64::STRQui:
     464             :   case AArch64::STURQi:
     465          17 :     return AArch64::STRQpost;
     466           2 :   case AArch64::STRBBui:
     467           2 :     return AArch64::STRBBpost;
     468           2 :   case AArch64::STRHHui:
     469           2 :     return AArch64::STRHHpost;
     470           7 :   case AArch64::STRWui:
     471             :   case AArch64::STURWi:
     472           7 :     return AArch64::STRWpost;
     473           7 :   case AArch64::STRXui:
     474             :   case AArch64::STURXi:
     475           7 :     return AArch64::STRXpost;
     476           5 :   case AArch64::LDRSui:
     477             :   case AArch64::LDURSi:
     478           5 :     return AArch64::LDRSpost;
     479           6 :   case AArch64::LDRDui:
     480             :   case AArch64::LDURDi:
     481           6 :     return AArch64::LDRDpost;
     482           9 :   case AArch64::LDRQui:
     483             :   case AArch64::LDURQi:
     484           9 :     return AArch64::LDRQpost;
     485           2 :   case AArch64::LDRBBui:
     486           2 :     return AArch64::LDRBBpost;
     487           2 :   case AArch64::LDRHHui:
     488           2 :     return AArch64::LDRHHpost;
     489           5 :   case AArch64::LDRWui:
     490             :   case AArch64::LDURWi:
     491           5 :     return AArch64::LDRWpost;
     492           7 :   case AArch64::LDRXui:
     493             :   case AArch64::LDURXi:
     494           7 :     return AArch64::LDRXpost;
     495           0 :   case AArch64::LDRSWui:
     496           0 :     return AArch64::LDRSWpost;
     497           0 :   case AArch64::LDPSi:
     498           0 :     return AArch64::LDPSpost;
     499           1 :   case AArch64::LDPSWi:
     500           1 :     return AArch64::LDPSWpost;
     501           0 :   case AArch64::LDPDi:
     502           0 :     return AArch64::LDPDpost;
     503           0 :   case AArch64::LDPQi:
     504           0 :     return AArch64::LDPQpost;
     505           0 :   case AArch64::LDPWi:
     506           0 :     return AArch64::LDPWpost;
     507           2 :   case AArch64::LDPXi:
     508           2 :     return AArch64::LDPXpost;
     509           2 :   case AArch64::STPSi:
     510           2 :     return AArch64::STPSpost;
     511           4 :   case AArch64::STPDi:
     512           4 :     return AArch64::STPDpost;
     513           2 :   case AArch64::STPQi:
     514           2 :     return AArch64::STPQpost;
     515           2 :   case AArch64::STPWi:
     516           2 :     return AArch64::STPWpost;
     517           5 :   case AArch64::STPXi:
     518           5 :     return AArch64::STPXpost;
     519             :   }
     520             : }
     521             : 
     522       78223 : static bool isPairedLdSt(const MachineInstr &MI) {
     523      156446 :   switch (MI.getOpcode()) {
     524             :   default:
     525             :     return false;
     526       16478 :   case AArch64::LDPSi:
     527             :   case AArch64::LDPSWi:
     528             :   case AArch64::LDPDi:
     529             :   case AArch64::LDPQi:
     530             :   case AArch64::LDPWi:
     531             :   case AArch64::LDPXi:
     532             :   case AArch64::STPSi:
     533             :   case AArch64::STPDi:
     534             :   case AArch64::STPQi:
     535             :   case AArch64::STPWi:
     536             :   case AArch64::STPXi:
     537       16478 :     return true;
     538             :   }
     539             : }
     540             : 
     541             : static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
     542             :                                           unsigned PairedRegOp = 0) {
     543             :   assert(PairedRegOp < 2 && "Unexpected register operand idx.");
     544       28325 :   unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
     545       45167 :   return MI.getOperand(Idx);
     546             : }
     547             : 
     548             : static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
     549       33813 :   unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
     550       67626 :   return MI.getOperand(Idx);
     551             : }
     552             : 
     553             : static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
     554       60696 :   unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
     555      118837 :   return MI.getOperand(Idx);
     556             : }
     557             : 
     558          94 : static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
     559             :                                   MachineInstr &StoreInst,
     560             :                                   const AArch64InstrInfo *TII) {
     561             :   assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
     562          94 :   int LoadSize = getMemScale(LoadInst);
     563          94 :   int StoreSize = getMemScale(StoreInst);
     564          94 :   int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
     565          25 :                              ? getLdStOffsetOp(StoreInst).getImm()
     566         188 :                              : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
     567          94 :   int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
     568          25 :                              ? getLdStOffsetOp(LoadInst).getImm()
     569         188 :                              : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
     570         184 :   return (UnscaledStOffset <= UnscaledLdOffset) &&
     571         184 :          (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
     572             : }
     573             : 
     574             : static bool isPromotableZeroStoreInst(MachineInstr &MI) {
     575       67694 :   unsigned Opc = MI.getOpcode();
     576       62188 :   return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
     577       63657 :           isNarrowStore(Opc)) &&
     578        1469 :          getLdStRegOp(MI).getReg() == AArch64::WZR;
     579             : }
     580             : 
     581             : MachineBasicBlock::iterator
     582          21 : AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
     583             :                                            MachineBasicBlock::iterator MergeMI,
     584             :                                            const LdStPairFlags &Flags) {
     585             :   assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
     586             :          "Expected promotable zero stores.");
     587             : 
     588          21 :   MachineBasicBlock::iterator NextI = I;
     589          21 :   ++NextI;
     590             :   // If NextI is the second of the two instructions to be merged, we need
     591             :   // to skip one further. Either way we merge will invalidate the iterator,
     592             :   // and we don't need to scan the new instruction, as it's a pairwise
     593             :   // instruction, which we're not considering for further action anyway.
     594          21 :   if (NextI == MergeMI)
     595             :     ++NextI;
     596             : 
     597          42 :   unsigned Opc = I->getOpcode();
     598          21 :   bool IsScaled = !TII->isUnscaledLdSt(Opc);
     599          27 :   int OffsetStride = IsScaled ? 1 : getMemScale(*I);
     600             : 
     601          21 :   bool MergeForward = Flags.getMergeForward();
     602             :   // Insert our new paired instruction after whichever of the paired
     603             :   // instructions MergeForward indicates.
     604          21 :   MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
     605             :   // Also based on MergeForward is from where we copy the base register operand
     606             :   // so we get the flags compatible with the input code.
     607             :   const MachineOperand &BaseRegOp =
     608          63 :       MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
     609             : 
     610             :   // Which register is Rt and which is Rt2 depends on the offset order.
     611             :   MachineInstr *RtMI;
     612          63 :   if (getLdStOffsetOp(*I).getImm() ==
     613          42 :       getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
     614             :     RtMI = &*MergeMI;
     615             :   else
     616          18 :     RtMI = &*I;
     617             : 
     618          21 :   int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
     619             :   // Change the scaled offset from small to large type.
     620          21 :   if (IsScaled) {
     621             :     assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
     622          15 :     OffsetImm /= 2;
     623             :   }
     624             : 
     625             :   // Construct the new instruction.
     626          63 :   DebugLoc DL = I->getDebugLoc();
     627          21 :   MachineBasicBlock *MBB = I->getParent();
     628          21 :   MachineInstrBuilder MIB;
     629          84 :   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
     630           4 :             .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
     631          21 :             .add(BaseRegOp)
     632          42 :             .addImm(OffsetImm)
     633          63 :             .setMemRefs(I->mergeMemRefsWith(*MergeMI));
     634             :   (void)MIB;
     635             : 
     636             :   DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
     637             :   DEBUG(I->print(dbgs()));
     638             :   DEBUG(dbgs() << "    ");
     639             :   DEBUG(MergeMI->print(dbgs()));
     640             :   DEBUG(dbgs() << "  with instruction:\n    ");
     641             :   DEBUG(((MachineInstr *)MIB)->print(dbgs()));
     642             :   DEBUG(dbgs() << "\n");
     643             : 
     644             :   // Erase the old instructions.
     645          21 :   I->eraseFromParent();
     646          21 :   MergeMI->eraseFromParent();
     647          42 :   return NextI;
     648             : }
     649             : 
     650             : MachineBasicBlock::iterator
     651         837 : AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
     652             :                                       MachineBasicBlock::iterator Paired,
     653             :                                       const LdStPairFlags &Flags) {
     654         837 :   MachineBasicBlock::iterator NextI = I;
     655         837 :   ++NextI;
     656             :   // If NextI is the second of the two instructions to be merged, we need
     657             :   // to skip one further. Either way we merge will invalidate the iterator,
     658             :   // and we don't need to scan the new instruction, as it's a pairwise
     659             :   // instruction, which we're not considering for further action anyway.
     660         837 :   if (NextI == Paired)
     661             :     ++NextI;
     662             : 
     663         837 :   int SExtIdx = Flags.getSExtIdx();
     664             :   unsigned Opc =
     665        1680 :       SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
     666         837 :   bool IsUnscaled = TII->isUnscaledLdSt(Opc);
     667         875 :   int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;
     668             : 
     669         837 :   bool MergeForward = Flags.getMergeForward();
     670             :   // Insert our new paired instruction after whichever of the paired
     671             :   // instructions MergeForward indicates.
     672         837 :   MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
     673             :   // Also based on MergeForward is from where we copy the base register operand
     674             :   // so we get the flags compatible with the input code.
     675             :   const MachineOperand &BaseRegOp =
     676        2511 :       MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
     677             : 
     678        1674 :   int Offset = getLdStOffsetOp(*I).getImm();
     679        1674 :   int PairedOffset = getLdStOffsetOp(*Paired).getImm();
     680        2511 :   bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
     681         837 :   if (IsUnscaled != PairedIsUnscaled) {
     682             :     // We're trying to pair instructions that differ in how they are scaled.  If
     683             :     // I is scaled then scale the offset of Paired accordingly.  Otherwise, do
     684             :     // the opposite (i.e., make Paired's offset unscaled).
     685           8 :     int MemSize = getMemScale(*Paired);
     686           8 :     if (PairedIsUnscaled) {
     687             :       // If the unscaled offset isn't a multiple of the MemSize, we can't
     688             :       // pair the operations together.
     689             :       assert(!(PairedOffset % getMemScale(*Paired)) &&
     690             :              "Offset should be a multiple of the stride!");
     691           4 :       PairedOffset /= MemSize;
     692             :     } else {
     693           4 :       PairedOffset *= MemSize;
     694             :     }
     695             :   }
     696             : 
     697             :   // Which register is Rt and which is Rt2 depends on the offset order.
     698             :   MachineInstr *RtMI, *Rt2MI;
     699         837 :   if (Offset == PairedOffset + OffsetStride) {
     700         241 :     RtMI = &*Paired;
     701         241 :     Rt2MI = &*I;
     702             :     // Here we swapped the assumption made for SExtIdx.
     703             :     // I.e., we turn ldp I, Paired into ldp Paired, I.
     704             :     // Update the index accordingly.
     705         241 :     if (SExtIdx != -1)
     706           0 :       SExtIdx = (SExtIdx + 1) % 2;
     707             :   } else {
     708             :     RtMI = &*I;
     709             :     Rt2MI = &*Paired;
     710             :   }
     711         837 :   int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
     712             :   // Scale the immediate offset, if necessary.
     713        1674 :   if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
     714             :     assert(!(OffsetImm % getMemScale(*RtMI)) &&
     715             :            "Unscaled offset cannot be scaled.");
     716          42 :     OffsetImm /= getMemScale(*RtMI);
     717             :   }
     718             : 
     719             :   // Construct the new instruction.
     720         837 :   MachineInstrBuilder MIB;
     721        2511 :   DebugLoc DL = I->getDebugLoc();
     722         837 :   MachineBasicBlock *MBB = I->getParent();
     723         837 :   MachineOperand RegOp0 = getLdStRegOp(*RtMI);
     724         837 :   MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
     725             :   // Kill flags may become invalid when moving stores for pairing.
     726         837 :   if (RegOp0.isUse()) {
     727         412 :     if (!MergeForward) {
     728             :       // Clear kill flags on store if moving upwards. Example:
     729             :       //   STRWui %w0, ...
     730             :       //   USE %w1
     731             :       //   STRWui kill %w1  ; need to clear kill flag when moving STRWui upwards
     732         384 :       RegOp0.setIsKill(false);
     733             :       RegOp1.setIsKill(false);
     734             :     } else {
     735             :       // Clear kill flags of the first stores register. Example:
     736             :       //   STRWui %w1, ...
     737             :       //   USE kill %w1   ; need to clear kill flag when moving STRWui downwards
     738             :       //   STRW %w0
     739          56 :       unsigned Reg = getLdStRegOp(*I).getReg();
     740         222 :       for (MachineInstr &MI : make_range(std::next(I), Paired))
     741          69 :         MI.clearRegisterKills(Reg, TRI);
     742             :     }
     743             :   }
     744        2511 :   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
     745         837 :             .add(RegOp0)
     746         837 :             .add(RegOp1)
     747         837 :             .add(BaseRegOp)
     748        1674 :             .addImm(OffsetImm)
     749        2511 :             .setMemRefs(I->mergeMemRefsWith(*Paired));
     750             : 
     751             :   (void)MIB;
     752             : 
     753             :   DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
     754             :   DEBUG(I->print(dbgs()));
     755             :   DEBUG(dbgs() << "    ");
     756             :   DEBUG(Paired->print(dbgs()));
     757             :   DEBUG(dbgs() << "  with instruction:\n    ");
     758         837 :   if (SExtIdx != -1) {
     759             :     // Generate the sign extension for the proper result of the ldp.
     760             :     // I.e., with X1, that would be:
     761             :     // %W1<def> = KILL %W1, %X1<imp-def>
     762             :     // %X1<def> = SBFMXri %X1<kill>, 0, 31
     763          12 :     MachineOperand &DstMO = MIB->getOperand(SExtIdx);
     764             :     // Right now, DstMO has the extended register, since it comes from an
     765             :     // extended opcode.
     766           6 :     unsigned DstRegX = DstMO.getReg();
     767             :     // Get the W variant of that register.
     768           6 :     unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
     769             :     // Update the result of LDP to use the W instead of the X variant.
     770           6 :     DstMO.setReg(DstRegW);
     771             :     DEBUG(((MachineInstr *)MIB)->print(dbgs()));
     772             :     DEBUG(dbgs() << "\n");
     773             :     // Make the machine verifier happy by providing a definition for
     774             :     // the X register.
     775             :     // Insert this definition right after the generated LDP, i.e., before
     776             :     // InsertionPoint.
     777             :     MachineInstrBuilder MIBKill =
     778          18 :         BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
     779           6 :             .addReg(DstRegW)
     780           6 :             .addReg(DstRegX, RegState::Define);
     781          12 :     MIBKill->getOperand(2).setImplicit();
     782             :     // Create the sign extension.
     783             :     MachineInstrBuilder MIBSXTW =
     784          18 :         BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
     785           6 :             .addReg(DstRegX)
     786           6 :             .addImm(0)
     787           6 :             .addImm(31);
     788             :     (void)MIBSXTW;
     789             :     DEBUG(dbgs() << "  Extend operand:\n    ");
     790             :     DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
     791             :   } else {
     792             :     DEBUG(((MachineInstr *)MIB)->print(dbgs()));
     793             :   }
     794             :   DEBUG(dbgs() << "\n");
     795             : 
     796             :   // Erase the old instructions.
     797         837 :   I->eraseFromParent();
     798         837 :   Paired->eraseFromParent();
     799             : 
     800        1674 :   return NextI;
     801             : }
     802             : 
     803             : MachineBasicBlock::iterator
     804          61 : AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
     805             :                                           MachineBasicBlock::iterator StoreI) {
     806          61 :   MachineBasicBlock::iterator NextI = LoadI;
     807          61 :   ++NextI;
     808             : 
     809          61 :   int LoadSize = getMemScale(*LoadI);
     810          61 :   int StoreSize = getMemScale(*StoreI);
     811         122 :   unsigned LdRt = getLdStRegOp(*LoadI).getReg();
     812         122 :   const MachineOperand &StMO = getLdStRegOp(*StoreI);
     813         122 :   unsigned StRt = getLdStRegOp(*StoreI).getReg();
     814         183 :   bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
     815             : 
     816             :   assert((IsStoreXReg ||
     817             :           TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
     818             :          "Unexpected RegClass");
     819             : 
     820             :   MachineInstr *BitExtMI;
     821          61 :   if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
     822             :     // Remove the load, if the destination register of the loads is the same
     823             :     // register for stored value.
     824          11 :     if (StRt == LdRt && LoadSize == 8) {
     825           1 :       for (MachineInstr &MI : make_range(StoreI->getIterator(),
     826           7 :                                          LoadI->getIterator())) {
     827           4 :         if (MI.killsRegister(StRt, TRI)) {
     828           1 :           MI.clearRegisterKills(StRt, TRI);
     829           1 :           break;
     830             :         }
     831             :       }
     832             :       DEBUG(dbgs() << "Remove load instruction:\n    ");
     833             :       DEBUG(LoadI->print(dbgs()));
     834             :       DEBUG(dbgs() << "\n");
     835           1 :       LoadI->eraseFromParent();
     836           1 :       return NextI;
     837             :     }
     838             :     // Replace the load with a mov if the load and store are in the same size.
     839          10 :     BitExtMI =
     840          20 :         BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
     841          40 :                 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
     842          10 :             .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
     843          10 :             .add(StMO)
     844          30 :             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
     845             :   } else {
     846             :     // FIXME: Currently we disable this transformation in big-endian targets as
     847             :     // performance and correctness are verified only in little-endian.
     848          50 :     if (!Subtarget->isLittleEndian())
     849           0 :       return NextI;
     850         100 :     bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
     851             :     assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
     852             :            "Unsupported ld/st match");
     853             :     assert(LoadSize <= StoreSize && "Invalid load size");
     854             :     int UnscaledLdOffset = IsUnscaled
     855          48 :                                ? getLdStOffsetOp(*LoadI).getImm()
     856         126 :                                : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
     857             :     int UnscaledStOffset = IsUnscaled
     858          48 :                                ? getLdStOffsetOp(*StoreI).getImm()
     859         126 :                                : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
     860          50 :     int Width = LoadSize * 8;
     861          50 :     int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
     862          50 :     int Imms = Immr + Width - 1;
     863             :     unsigned DestReg = IsStoreXReg
     864          78 :                            ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
     865             :                                                       &AArch64::GPR64RegClass)
     866          50 :                            : LdRt;
     867             : 
     868             :     assert((UnscaledLdOffset >= UnscaledStOffset &&
     869             :             (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
     870             :            "Invalid offset");
     871             : 
     872          50 :     Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
     873          50 :     Imms = Immr + Width - 1;
     874          50 :     if (UnscaledLdOffset == UnscaledStOffset) {
     875          32 :       uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
     876          16 :                                 | ((Immr) << 6)               // immr
     877          16 :                                 | ((Imms) << 0)               // imms
     878             :           ;
     879             : 
     880          16 :       BitExtMI =
     881          16 :           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
     882          16 :                   TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
     883          64 :                   DestReg)
     884          16 :               .add(StMO)
     885          32 :               .addImm(AndMaskEncoded);
     886             :     } else {
     887          34 :       BitExtMI =
     888          34 :           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
     889          34 :                   TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
     890         136 :                   DestReg)
     891          34 :               .add(StMO)
     892          68 :               .addImm(Immr)
     893          68 :               .addImm(Imms);
     894             :     }
     895             :   }
     896             : 
     897             :   // Clear kill flags between store and load.
     898          60 :   for (MachineInstr &MI : make_range(StoreI->getIterator(),
     899         362 :                                      BitExtMI->getIterator()))
     900         124 :     if (MI.killsRegister(StRt, TRI)) {
     901          58 :       MI.clearRegisterKills(StRt, TRI);
     902          58 :       break;
     903             :     }
     904             : 
     905             :   DEBUG(dbgs() << "Promoting load by replacing :\n    ");
     906             :   DEBUG(StoreI->print(dbgs()));
     907             :   DEBUG(dbgs() << "    ");
     908             :   DEBUG(LoadI->print(dbgs()));
     909             :   DEBUG(dbgs() << "  with instructions:\n    ");
     910             :   DEBUG(StoreI->print(dbgs()));
     911             :   DEBUG(dbgs() << "    ");
     912             :   DEBUG((BitExtMI)->print(dbgs()));
     913             :   DEBUG(dbgs() << "\n");
     914             : 
     915             :   // Erase the old instructions.
     916          60 :   LoadI->eraseFromParent();
     917          60 :   return NextI;
     918             : }
     919             : 
     920             : /// trackRegDefsUses - Remember what registers the specified instruction uses
     921             : /// and modifies.
     922       48037 : static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs,
     923             :                              BitVector &UsedRegs,
     924             :                              const TargetRegisterInfo *TRI) {
     925      208728 :   for (const MachineOperand &MO : MI.operands()) {
     926      160691 :     if (MO.isRegMask())
     927         919 :       ModifiedRegs.setBitsNotInMask(MO.getRegMask());
     928             : 
     929      160691 :     if (!MO.isReg())
     930       41192 :       continue;
     931      119499 :     unsigned Reg = MO.getReg();
     932      119499 :     if (!Reg)
     933           2 :       continue;
     934      119497 :     if (MO.isDef()) {
     935             :       // WZR/XZR are not modified even when used as a destination register.
     936       43491 :       if (Reg != AArch64::WZR && Reg != AArch64::XZR)
     937     1409338 :         for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
     938     1322834 :           ModifiedRegs.set(*AI);
     939             :     } else {
     940             :       assert(MO.isUse() && "Reg operand not a def and not a use?!?");
     941     2190450 :       for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
     942     2038438 :         UsedRegs.set(*AI);
     943             :     }
     944             :   }
     945       48037 : }
     946             : 
     947             : static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
     948             :   // Convert the byte-offset used by unscaled into an "element" offset used
     949             :   // by the scaled pair load/store instructions.
     950        6518 :   if (IsUnscaled) {
     951             :     // If the byte-offset isn't a multiple of the stride, there's no point
     952             :     // trying to match it.
     953         269 :     if (Offset % OffsetStride)
     954             :       return false;
     955         194 :     Offset /= OffsetStride;
     956             :   }
     957        6443 :   return Offset <= 63 && Offset >= -64;
     958             : }
     959             : 
     960             : // Do alignment, specialized to power of 2 and for signed ints,
     961             : // avoiding having to do a C-style cast from uint_64t to int when
     962             : // using alignTo from include/llvm/Support/MathExtras.h.
     963             : // FIXME: Move this function to include/MathExtras.h?
     964             : static int alignTo(int Num, int PowOf2) {
     965          84 :   return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
     966             : }
     967             : 
     968         465 : static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
     969             :                      AliasAnalysis *AA) {
     970             :   // One of the instructions must modify memory.
     971         465 :   if (!MIa.mayStore() && !MIb.mayStore())
     972             :     return false;
     973             : 
     974             :   // Both instructions must be memory operations.
     975         438 :   if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
     976             :     return false;
     977             : 
     978         438 :   return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
     979             : }
     980             : 
     981             : static bool mayAlias(MachineInstr &MIa,
     982             :                      SmallVectorImpl<MachineInstr *> &MemInsns,
     983             :                      AliasAnalysis *AA) {
     984        2788 :   for (MachineInstr *MIb : MemInsns)
     985         152 :     if (mayAlias(MIa, *MIb, AA))
     986             :       return true;
     987             : 
     988             :   return false;
     989             : }
     990             : 
     991        1232 : bool AArch64LoadStoreOpt::findMatchingStore(
     992             :     MachineBasicBlock::iterator I, unsigned Limit,
     993             :     MachineBasicBlock::iterator &StoreI) {
     994        2464 :   MachineBasicBlock::iterator B = I->getParent()->begin();
     995        1232 :   MachineBasicBlock::iterator MBBI = I;
     996        1232 :   MachineInstr &LoadMI = *I;
     997        1232 :   unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();
     998             : 
     999             :   // If the load is the first instruction in the block, there's obviously
    1000             :   // not any matching store.
    1001        1232 :   if (MBBI == B)
    1002             :     return false;
    1003             : 
    1004             :   // Track which registers have been modified and used between the first insn
    1005             :   // and the second insn.
    1006        1802 :   ModifiedRegs.reset();
    1007         901 :   UsedRegs.reset();
    1008             : 
    1009             :   unsigned Count = 0;
    1010             :   do {
    1011        3215 :     --MBBI;
    1012        3215 :     MachineInstr &MI = *MBBI;
    1013             : 
    1014             :     // Don't count transient instructions towards the search limit since there
    1015             :     // may be different numbers of them if e.g. debug information is present.
    1016             :     if (!MI.isTransient())
    1017        2942 :       ++Count;
    1018             : 
    1019             :     // If the load instruction reads directly from the address to which the
    1020             :     // store instruction writes and the stored value is not modified, we can
    1021             :     // promote the load. Since we do not handle stores with pre-/post-index,
    1022             :     // it's unnecessary to check if BaseReg is modified by the store itself.
    1023        3897 :     if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
    1024         382 :         BaseReg == getLdStBaseOp(MI).getReg() &&
    1025        3381 :         isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
    1026         144 :         !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
    1027          61 :       StoreI = MBBI;
    1028          61 :       return true;
    1029             :     }
    1030             : 
    1031        3154 :     if (MI.isCall())
    1032             :       return false;
    1033             : 
    1034             :     // Update modified / uses register lists.
    1035        2937 :     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1036             : 
    1037             :     // Otherwise, if the base register is modified, we have no match, so
    1038             :     // return early.
    1039        8811 :     if (ModifiedRegs[BaseReg])
    1040             :       return false;
    1041             : 
    1042             :     // If we encounter a store aliased with the load, return early.
    1043        2606 :     if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
    1044             :       return false;
    1045        2456 :   } while (MBBI != B && Count < Limit);
    1046             :   return false;
    1047             : }
    1048             : 
    1049             : // Returns true if FirstMI and MI are candidates for merging or pairing.
    1050             : // Otherwise, returns false.
    1051       16130 : static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
    1052             :                                        LdStPairFlags &Flags,
    1053             :                                        const AArch64InstrInfo *TII) {
    1054             :   // If this is volatile or if pairing is suppressed, not a candidate.
    1055       16130 :   if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    1056             :     return false;
    1057             : 
    1058             :   // We should have already checked FirstMI for pair suppression and volatility.
    1059             :   assert(!FirstMI.hasOrderedMemoryRef() &&
    1060             :          !TII->isLdStPairSuppressed(FirstMI) &&
    1061             :          "FirstMI shouldn't get here if either of these checks are true.");
    1062             : 
    1063       23450 :   unsigned OpcA = FirstMI.getOpcode();
    1064       23450 :   unsigned OpcB = MI.getOpcode();
    1065             : 
    1066             :   // Opcodes match: nothing more to check.
    1067       11725 :   if (OpcA == OpcB)
    1068             :     return true;
    1069             : 
    1070             :   // Try to match a sign-extended load/store with a zero-extended load/store.
    1071             :   bool IsValidLdStrOpc, PairIsValidLdStrOpc;
    1072        9215 :   unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
    1073             :   assert(IsValidLdStrOpc &&
    1074             :          "Given Opc should be a Load or Store with an immediate");
    1075             :   // OpcA will be the first instruction in the pair.
    1076        9215 :   if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
    1077          12 :     Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
    1078           6 :     return true;
    1079             :   }
    1080             : 
    1081             :   // If the second instruction isn't even a mergable/pairable load/store, bail
    1082             :   // out.
    1083        9209 :   if (!PairIsValidLdStrOpc)
    1084             :     return false;
    1085             : 
    1086             :   // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
    1087             :   // offsets.
    1088             :   if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
    1089             :     return false;
    1090             : 
    1091             :   // Try to match an unscaled load/store with a scaled load/store.
    1092        2191 :   return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
    1093         124 :          getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
    1094             : 
    1095             :   // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
    1096             : }
    1097             : 
    1098             : /// Scan the instructions looking for a load/store that can be combined with the
    1099             : /// current instruction into a wider equivalent or a load/store pair.
    1100             : MachineBasicBlock::iterator
    1101        5506 : AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
    1102             :                                       LdStPairFlags &Flags, unsigned Limit,
    1103             :                                       bool FindNarrowMerge) {
    1104       11012 :   MachineBasicBlock::iterator E = I->getParent()->end();
    1105        5506 :   MachineBasicBlock::iterator MBBI = I;
    1106        5506 :   MachineInstr &FirstMI = *I;
    1107        5506 :   ++MBBI;
    1108             : 
    1109        5506 :   bool MayLoad = FirstMI.mayLoad();
    1110        5506 :   bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
    1111        5506 :   unsigned Reg = getLdStRegOp(FirstMI).getReg();
    1112        5506 :   unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
    1113        5506 :   int Offset = getLdStOffsetOp(FirstMI).getImm();
    1114        5506 :   int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
    1115        5506 :   bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
    1116             : 
    1117             :   // Track which registers have been modified and used between the first insn
    1118             :   // (inclusive) and the second insn.
    1119       11012 :   ModifiedRegs.reset();
    1120       11012 :   UsedRegs.reset();
    1121             : 
    1122             :   // Remember any instructions that read/write memory between FirstMI and MI.
    1123       11012 :   SmallVector<MachineInstr *, 4> MemInsns;
    1124             : 
    1125       24718 :   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    1126       16130 :     MachineInstr &MI = *MBBI;
    1127             : 
    1128             :     // Don't count transient instructions towards the search limit since there
    1129             :     // may be different numbers of them if e.g. debug information is present.
    1130             :     if (!MI.isTransient())
    1131       15441 :       ++Count;
    1132             : 
    1133       16130 :     Flags.setSExtIdx(-1);
    1134       18693 :     if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
    1135        5126 :         getLdStOffsetOp(MI).isImm()) {
    1136             :       assert(MI.mayLoadOrStore() && "Expected memory operation.");
    1137             :       // If we've found another instruction with the same opcode, check to see
    1138             :       // if the base and offset are compatible with our starting instruction.
    1139             :       // These instructions all have scaled immediate operands, so we just
    1140             :       // check for +1/-1. Make sure to check the new instruction offset is
    1141             :       // actually an immediate and not a symbolic reference destined for
    1142             :       // a relocation.
    1143        2555 :       unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
    1144        2555 :       int MIOffset = getLdStOffsetOp(MI).getImm();
    1145        2555 :       bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
    1146        2555 :       if (IsUnscaled != MIIsUnscaled) {
    1147             :         // We're trying to pair instructions that differ in how they are scaled.
    1148             :         // If FirstMI is scaled then scale the offset of MI accordingly.
    1149             :         // Otherwise, do the opposite (i.e., make MI's offset unscaled).
    1150          47 :         int MemSize = getMemScale(MI);
    1151          47 :         if (MIIsUnscaled) {
    1152             :           // If the unscaled offset isn't a multiple of the MemSize, we can't
    1153             :           // pair the operations together: bail and keep looking.
    1154          10 :           if (MIOffset % MemSize) {
    1155           2 :             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1156           2 :             MemInsns.push_back(&MI);
    1157           2 :             continue;
    1158             :           }
    1159           6 :           MIOffset /= MemSize;
    1160             :         } else {
    1161          39 :           MIOffset *= MemSize;
    1162             :         }
    1163             :       }
    1164             : 
    1165        3859 :       if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
    1166        1306 :                                    (Offset + OffsetStride == MIOffset))) {
    1167        1031 :         int MinOffset = Offset < MIOffset ? Offset : MIOffset;
    1168        1031 :         if (FindNarrowMerge) {
    1169             :           // If the alignment requirements of the scaled wide load/store
    1170             :           // instruction can't express the offset of the scaled narrow input,
    1171             :           // bail and keep looking. For promotable zero stores, allow only when
    1172             :           // the stored value is the same (i.e., WZR).
    1173          66 :           if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
    1174          30 :               (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
    1175          10 :             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1176          10 :             MemInsns.push_back(&MI);
    1177          10 :             continue;
    1178             :           }
    1179             :         } else {
    1180             :           // Pairwise instructions have a 7-bit signed offset field. Single
    1181             :           // insns have a 12-bit unsigned offset field.  If the resultant
    1182             :           // immediate offset of merging these instructions is out of range for
    1183             :           // a pairwise instruction, bail and keep looking.
    1184        1000 :           if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
    1185           0 :             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1186           0 :             MemInsns.push_back(&MI);
    1187           0 :             continue;
    1188             :           }
    1189             :           // If the alignment requirements of the paired (scaled) instruction
    1190             :           // can't express the offset of the unscaled input, bail and keep
    1191             :           // looking.
    1192        1059 :           if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
    1193           0 :             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1194           0 :             MemInsns.push_back(&MI);
    1195           0 :             continue;
    1196             :           }
    1197             :         }
    1198             :         // If the destination register of the loads is the same register, bail
    1199             :         // and keep looking. A load-pair instruction with both destination
    1200             :         // registers the same is UNPREDICTABLE and will result in an exception.
    1201        1573 :         if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
    1202          36 :           trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1203          36 :           MemInsns.push_back(&MI);
    1204          36 :           continue;
    1205             :         }
    1206             : 
    1207             :         // If the Rt of the second instruction was not modified or used between
    1208             :         // the two instructions and none of the instructions between the second
    1209             :         // and first alias with the second, we can combine the second into the
    1210             :         // first.
    1211        2845 :         if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
    1212        4075 :             !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
    1213        1682 :             !mayAlias(MI, MemInsns, AA)) {
    1214         824 :           Flags.setMergeForward(false);
    1215         824 :           return MBBI;
    1216             :         }
    1217             : 
    1218             :         // Likewise, if the Rt of the first instruction is not modified or used
    1219             :         // between the two instructions and none of the instructions between the
    1220             :         // first and the second alias with the first, we can combine the first
    1221             :         // into the second.
    1222         403 :         if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
    1223         326 :             !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
    1224          96 :             !mayAlias(FirstMI, MemInsns, AA)) {
    1225          34 :           Flags.setMergeForward(true);
    1226          34 :           return MBBI;
    1227             :         }
    1228             :         // Unable to combine these instructions due to interference in between.
    1229             :         // Keep looking.
    1230             :       }
    1231             :     }
    1232             : 
    1233             :     // If the instruction wasn't a matching load or store.  Stop searching if we
    1234             :     // encounter a call instruction that might modify memory.
    1235       15224 :     if (MI.isCall())
    1236         822 :       return E;
    1237             : 
    1238             :     // Update modified / uses register lists.
    1239       14402 :     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1240             : 
    1241             :     // Otherwise, if the base register is modified, we have no match, so
    1242             :     // return early.
    1243       43206 :     if (ModifiedRegs[BaseReg])
    1244         744 :       return E;
    1245             : 
    1246             :     // Update list of instructions that read/write memory.
    1247       13658 :     if (MI.mayLoadOrStore())
    1248        4328 :       MemInsns.push_back(&MI);
    1249             :   }
    1250        3082 :   return E;
    1251             : }
    1252             : 
    1253             : MachineBasicBlock::iterator
    1254         181 : AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
    1255             :                                      MachineBasicBlock::iterator Update,
    1256             :                                      bool IsPreIdx) {
    1257             :   assert((Update->getOpcode() == AArch64::ADDXri ||
    1258             :           Update->getOpcode() == AArch64::SUBXri) &&
    1259             :          "Unexpected base register update instruction to merge!");
    1260         181 :   MachineBasicBlock::iterator NextI = I;
    1261             :   // Return the instruction following the merged instruction, which is
    1262             :   // the instruction following our unmerged load. Unless that's the add/sub
    1263             :   // instruction we're merging, in which case it's the one after that.
    1264         362 :   if (++NextI == Update)
    1265             :     ++NextI;
    1266             : 
    1267         181 :   int Value = Update->getOperand(2).getImm();
    1268             :   assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
    1269             :          "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
    1270         362 :   if (Update->getOpcode() == AArch64::SUBXri)
    1271          43 :     Value = -Value;
    1272             : 
    1273         444 :   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
    1274         280 :                              : getPostIndexedOpcode(I->getOpcode());
    1275         181 :   MachineInstrBuilder MIB;
    1276         181 :   if (!isPairedLdSt(*I)) {
    1277             :     // Non-paired instruction.
    1278         936 :     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
    1279         468 :               .add(getLdStRegOp(*Update))
    1280         468 :               .add(getLdStRegOp(*I))
    1281         468 :               .add(getLdStBaseOp(*I))
    1282         312 :               .addImm(Value)
    1283         624 :               .setMemRefs(I->memoperands_begin(), I->memoperands_end());
    1284             :   } else {
    1285             :     // Paired instruction.
    1286          25 :     int Scale = getMemScale(*I);
    1287         150 :     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
    1288          75 :               .add(getLdStRegOp(*Update))
    1289          75 :               .add(getLdStRegOp(*I, 0))
    1290          75 :               .add(getLdStRegOp(*I, 1))
    1291          75 :               .add(getLdStBaseOp(*I))
    1292          50 :               .addImm(Value / Scale)
    1293         100 :               .setMemRefs(I->memoperands_begin(), I->memoperands_end());
    1294             :   }
    1295             :   (void)MIB;
    1296             : 
    1297             :   if (IsPreIdx)
    1298             :     DEBUG(dbgs() << "Creating pre-indexed load/store.");
    1299             :   else
    1300             :     DEBUG(dbgs() << "Creating post-indexed load/store.");
    1301             :   DEBUG(dbgs() << "    Replacing instructions:\n    ");
    1302             :   DEBUG(I->print(dbgs()));
    1303             :   DEBUG(dbgs() << "    ");
    1304             :   DEBUG(Update->print(dbgs()));
    1305             :   DEBUG(dbgs() << "  with instruction:\n    ");
    1306             :   DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    1307             :   DEBUG(dbgs() << "\n");
    1308             : 
    1309             :   // Erase the old instructions for the block.
    1310         181 :   I->eraseFromParent();
    1311         181 :   Update->eraseFromParent();
    1312             : 
    1313         181 :   return NextI;
    1314             : }
    1315             : 
    1316       30831 : bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
    1317             :                                                MachineInstr &MI,
    1318             :                                                unsigned BaseReg, int Offset) {
    1319       61662 :   switch (MI.getOpcode()) {
    1320             :   default:
    1321             :     break;
    1322        1430 :   case AArch64::SUBXri:
    1323             :   case AArch64::ADDXri:
    1324             :     // Make sure it's a vanilla immediate operand, not a relocation or
    1325             :     // anything else we can't handle.
    1326        2860 :     if (!MI.getOperand(2).isImm())
    1327             :       break;
    1328             :     // Watch out for 1 << 12 shifted value.
    1329        2624 :     if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
    1330             :       break;
    1331             : 
    1332             :     // The update instruction source and destination register must be the
    1333             :     // same as the load/store base register.
    1334        1869 :     if (MI.getOperand(0).getReg() != BaseReg ||
    1335         573 :         MI.getOperand(1).getReg() != BaseReg)
    1336             :       break;
    1337             : 
    1338         500 :     bool IsPairedInsn = isPairedLdSt(MemMI);
    1339         500 :     int UpdateOffset = MI.getOperand(2).getImm();
    1340         500 :     if (MI.getOpcode() == AArch64::SUBXri)
    1341          45 :       UpdateOffset = -UpdateOffset;
    1342             : 
    1343             :     // For non-paired load/store instructions, the immediate must fit in a
    1344             :     // signed 9-bit integer.
    1345         500 :     if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
    1346             :       break;
    1347             : 
    1348             :     // For paired load/store instructions, the immediate must be a multiple of
    1349             :     // the scaling factor.  The scaled offset must also fit into a signed 7-bit
    1350             :     // integer.
    1351         491 :     if (IsPairedInsn) {
    1352         163 :       int Scale = getMemScale(MemMI);
    1353         163 :       if (UpdateOffset % Scale != 0)
    1354             :         break;
    1355             : 
    1356         163 :       int ScaledOffset = UpdateOffset / Scale;
    1357         163 :       if (ScaledOffset > 63 || ScaledOffset < -64)
    1358             :         break;
    1359             :     }
    1360             : 
    1361             :     // If we have a non-zero Offset, we check that it matches the amount
    1362             :     // we're adding to the register.
    1363         483 :     if (!Offset || Offset == UpdateOffset)
    1364             :       return true;
    1365             :     break;
    1366             :   }
    1367             :   return false;
    1368             : }
    1369             : 
    1370       15556 : MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    1371             :     MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
    1372       31112 :   MachineBasicBlock::iterator E = I->getParent()->end();
    1373       15556 :   MachineInstr &MemMI = *I;
    1374       15556 :   MachineBasicBlock::iterator MBBI = I;
    1375             : 
    1376       15556 :   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
    1377       15556 :   int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
    1378             : 
    1379             :   // Scan forward looking for post-index opportunities.  Updating instructions
    1380             :   // can't be formed if the memory instruction doesn't have the offset we're
    1381             :   // looking for.
    1382       15556 :   if (MIUnscaledOffset != UnscaledOffset)
    1383        3986 :     return E;
    1384             : 
    1385             :   // If the base register overlaps a destination register, we can't
    1386             :   // merge the update.
    1387             :   bool IsPairedInsn = isPairedLdSt(MemMI);
    1388       39176 :   for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    1389       14062 :     unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    1390       27994 :     if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
    1391         259 :       return E;
    1392             :   }
    1393             : 
    1394             :   // Track which registers have been modified and used between the first insn
    1395             :   // (inclusive) and the second insn.
    1396       22622 :   ModifiedRegs.reset();
    1397       22622 :   UsedRegs.reset();
    1398       11311 :   ++MBBI;
    1399       42697 :   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    1400       25420 :     MachineInstr &MI = *MBBI;
    1401             : 
    1402             :     // Don't count transient instructions towards the search limit since there
    1403             :     // may be different numbers of them if e.g. debug information is present.
    1404             :     if (!MI.isTransient())
    1405       24204 :       ++Count;
    1406             : 
    1407             :     // If we found a match, return it.
    1408       25420 :     if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
    1409         136 :       return MBBI;
    1410             : 
    1411             :     // Update the status of what the instruction clobbered and used.
    1412       25284 :     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1413             : 
    1414             :     // Otherwise, if the base register is used or modified, we have no match, so
    1415             :     // return early.
    1416      123132 :     if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
    1417        5209 :       return E;
    1418             :   }
    1419        5966 :   return E;
    1420             : }
    1421             : 
    1422        7637 : MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    1423             :     MachineBasicBlock::iterator I, unsigned Limit) {
    1424       15274 :   MachineBasicBlock::iterator B = I->getParent()->begin();
    1425       15274 :   MachineBasicBlock::iterator E = I->getParent()->end();
    1426        7637 :   MachineInstr &MemMI = *I;
    1427        7637 :   MachineBasicBlock::iterator MBBI = I;
    1428             : 
    1429        7637 :   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
    1430        7637 :   int Offset = getLdStOffsetOp(MemMI).getImm();
    1431             : 
    1432             :   // If the load/store is the first instruction in the block, there's obviously
    1433             :   // not any matching update. Ditto if the memory offset isn't zero.
    1434        7637 :   if (MBBI == B || Offset != 0)
    1435        5098 :     return E;
    1436             :   // If the base register overlaps a destination register, we can't
    1437             :   // merge the update.
    1438             :   bool IsPairedInsn = isPairedLdSt(MemMI);
    1439        7897 :   for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    1440        2755 :     unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    1441        5464 :     if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
    1442          76 :       return E;
    1443             :   }
    1444             : 
    1445             :   // Track which registers have been modified and used between the first insn
    1446             :   // (inclusive) and the second insn.
    1447        4926 :   ModifiedRegs.reset();
    1448        2463 :   UsedRegs.reset();
    1449             :   unsigned Count = 0;
    1450             :   do {
    1451        5411 :     --MBBI;
    1452        5411 :     MachineInstr &MI = *MBBI;
    1453             : 
    1454             :     // Don't count transient instructions towards the search limit since there
    1455             :     // may be different numbers of them if e.g. debug information is present.
    1456             :     if (!MI.isTransient())
    1457        4739 :       ++Count;
    1458             : 
    1459             :     // If we found a match, return it.
    1460        5411 :     if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
    1461          45 :       return MBBI;
    1462             : 
    1463             :     // Update the status of what the instruction clobbered and used.
    1464        5366 :     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
    1465             : 
    1466             :     // Otherwise, if the base register is used or modified, we have no match, so
    1467             :     // return early.
    1468       25932 :     if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
    1469         996 :       return E;
    1470        7318 :   } while (MBBI != B && Count < Limit);
    1471        1422 :   return E;
    1472             : }
    1473             : 
    1474        2154 : bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    1475             :     MachineBasicBlock::iterator &MBBI) {
    1476        2154 :   MachineInstr &MI = *MBBI;
    1477             :   // If this is a volatile load, don't mess with it.
    1478        2154 :   if (MI.hasOrderedMemoryRef())
    1479             :     return false;
    1480             : 
    1481             :   // Make sure this is a reg+imm.
    1482             :   // FIXME: It is possible to extend it to handle reg+reg cases.
    1483        2996 :   if (!getLdStOffsetOp(MI).isImm())
    1484             :     return false;
    1485             : 
    1486             :   // Look backward up to LdStLimit instructions.
    1487        1232 :   MachineBasicBlock::iterator StoreI;
    1488        1232 :   if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
    1489          61 :     ++NumLoadsFromStoresPromoted;
    1490             :     // Promote the load. Keeping the iterator straight is a
    1491             :     // pain, so we let the merge routine tell us what the next instruction
    1492             :     // is after it's done mucking about.
    1493          61 :     MBBI = promoteLoadFromStore(MBBI, StoreI);
    1494          61 :     return true;
    1495             :   }
    1496             :   return false;
    1497             : }
    1498             : 
    1499             : // Merge adjacent zero stores into a wider store.
    1500         105 : bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
    1501             :     MachineBasicBlock::iterator &MBBI) {
    1502             :   assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
    1503         105 :   MachineInstr &MI = *MBBI;
    1504         210 :   MachineBasicBlock::iterator E = MI.getParent()->end();
    1505             : 
    1506         105 :   if (!TII->isCandidateToMergeOrPair(MI))
    1507             :     return false;
    1508             : 
    1509             :   // Look ahead up to LdStLimit instructions for a mergable instruction.
    1510          96 :   LdStPairFlags Flags;
    1511             :   MachineBasicBlock::iterator MergeMI =
    1512          96 :       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
    1513          96 :   if (MergeMI != E) {
    1514          21 :     ++NumZeroStoresPromoted;
    1515             : 
    1516             :     // Keeping the iterator straight is a pain, so we let the merge routine tell
    1517             :     // us what the next instruction is after it's done mucking about.
    1518          21 :     MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
    1519          21 :     return true;
    1520             :   }
    1521             :   return false;
    1522             : }
    1523             : 
    1524             : // Find loads and stores that can be merged into a single load or store pair
    1525             : // instruction.
    1526        7738 : bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
    1527        7738 :   MachineInstr &MI = *MBBI;
    1528       15476 :   MachineBasicBlock::iterator E = MI.getParent()->end();
    1529             : 
    1530        7738 :   if (!TII->isCandidateToMergeOrPair(MI))
    1531             :     return false;
    1532             : 
    1533             :   // Early exit if the offset is not possible to match. (6 bits of positive
    1534             :   // range, plus allow an extra one in case we find a later insn that matches
    1535             :   // with Offset-1)
    1536        5518 :   bool IsUnscaled = TII->isUnscaledLdSt(MI);
    1537        5518 :   int Offset = getLdStOffsetOp(MI).getImm();
    1538        5518 :   int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
    1539             :   // Allow one more for offset.
    1540        5518 :   if (Offset > 0)
    1541        1936 :     Offset -= OffsetStride;
    1542        5443 :   if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    1543             :     return false;
    1544             : 
    1545             :   // Look ahead up to LdStLimit instructions for a pairable instruction.
    1546        5410 :   LdStPairFlags Flags;
    1547             :   MachineBasicBlock::iterator Paired =
    1548        5410 :       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
    1549        5410 :   if (Paired != E) {
    1550         837 :     ++NumPairCreated;
    1551         837 :     if (TII->isUnscaledLdSt(MI))
    1552             :       ++NumUnscaledPairCreated;
    1553             :     // Keeping the iterator straight is a pain, so we let the merge routine tell
    1554             :     // us what the next instruction is after it's done mucking about.
    1555         837 :     MBBI = mergePairedInsns(MBBI, Paired, Flags);
    1556         837 :     return true;
    1557             :   }
    1558             :   return false;
    1559             : }
    1560             : 
    1561       12628 : bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
    1562             :                                         bool EnableNarrowZeroStOpt) {
    1563       12628 :   bool Modified = false;
    1564             :   // Four tranformations to do here:
    1565             :   // 1) Find loads that directly read from stores and promote them by
    1566             :   //    replacing with mov instructions. If the store is wider than the load,
    1567             :   //    the load will be replaced with a bitfield extract.
    1568             :   //      e.g.,
    1569             :   //        str w1, [x0, #4]
    1570             :   //        ldrh w2, [x0, #6]
    1571             :   //        ; becomes
    1572             :   //        str w1, [x0, #4]
    1573             :   //        lsr w2, w1, #16
    1574       25256 :   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
    1575       70049 :        MBBI != E;) {
    1576       57421 :     MachineInstr &MI = *MBBI;
    1577       57421 :     switch (MI.getOpcode()) {
    1578       55267 :     default:
    1579             :       // Just move on to the next instruction.
    1580             :       ++MBBI;
    1581             :       break;
    1582             :     // Scaled instructions.
    1583        2154 :     case AArch64::LDRBBui:
    1584             :     case AArch64::LDRHHui:
    1585             :     case AArch64::LDRWui:
    1586             :     case AArch64::LDRXui:
    1587             :     // Unscaled instructions.
    1588             :     case AArch64::LDURBBi:
    1589             :     case AArch64::LDURHHi:
    1590             :     case AArch64::LDURWi:
    1591             :     case AArch64::LDURXi:
    1592        2154 :       if (tryToPromoteLoadFromStore(MBBI)) {
    1593             :         Modified = true;
    1594             :         break;
    1595             :       }
    1596             :       ++MBBI;
    1597             :       break;
    1598             :     }
    1599             :   }
    1600             :   // 2) Merge adjacent zero stores into a wider store.
    1601             :   //      e.g.,
    1602             :   //        strh wzr, [x0]
    1603             :   //        strh wzr, [x0, #2]
    1604             :   //        ; becomes
    1605             :   //        str wzr, [x0]
    1606             :   //      e.g.,
    1607             :   //        str wzr, [x0]
    1608             :   //        str wzr, [x0, #4]
    1609             :   //        ; becomes
    1610             :   //        str xzr, [x0]
    1611       25256 :   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
    1612      138438 :        EnableNarrowZeroStOpt && MBBI != E;) {
    1613       56787 :     if (isPromotableZeroStoreInst(*MBBI)) {
    1614         105 :       if (tryToMergeZeroStInst(MBBI)) {
    1615             :         Modified = true;
    1616             :       } else
    1617             :         ++MBBI;
    1618             :     } else
    1619             :       ++MBBI;
    1620             :   }
    1621             : 
    1622             :   // 3) Find loads and stores that can be merged into a single load or store
    1623             :   //    pair instruction.
    1624             :   //      e.g.,
    1625             :   //        ldr x0, [x2]
    1626             :   //        ldr x1, [x2, #8]
    1627             :   //        ; becomes
    1628             :   //        ldp x0, x1, [x2]
    1629       25256 :   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
    1630       69224 :        MBBI != E;) {
    1631       56596 :     if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
    1632             :       Modified = true;
    1633             :     else
    1634             :       ++MBBI;
    1635             :   }
    1636             :   // 4) Find base register updates that can be merged into the load or store
    1637             :   //    as a base-reg writeback.
    1638             :   //      e.g.,
    1639             :   //        ldr x0, [x2]
    1640             :   //        add x2, x2, #4
    1641             :   //        ; becomes
    1642             :   //        ldr x0, [x2], #4
    1643       25256 :   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
    1644       69066 :        MBBI != E;) {
    1645       56438 :     MachineInstr &MI = *MBBI;
    1646             :     // Do update merging. It's simpler to keep this separate from the above
    1647             :     // switchs, though not strictly necessary.
    1648       56438 :     unsigned Opc = MI.getOpcode();
    1649       56438 :     switch (Opc) {
    1650       47029 :     default:
    1651             :       // Just move on to the next instruction.
    1652             :       ++MBBI;
    1653             :       break;
    1654             :     // Scaled instructions.
    1655        9409 :     case AArch64::STRSui:
    1656             :     case AArch64::STRDui:
    1657             :     case AArch64::STRQui:
    1658             :     case AArch64::STRXui:
    1659             :     case AArch64::STRWui:
    1660             :     case AArch64::STRHHui:
    1661             :     case AArch64::STRBBui:
    1662             :     case AArch64::LDRSui:
    1663             :     case AArch64::LDRDui:
    1664             :     case AArch64::LDRQui:
    1665             :     case AArch64::LDRXui:
    1666             :     case AArch64::LDRWui:
    1667             :     case AArch64::LDRHHui:
    1668             :     case AArch64::LDRBBui:
    1669             :     // Unscaled instructions.
    1670             :     case AArch64::STURSi:
    1671             :     case AArch64::STURDi:
    1672             :     case AArch64::STURQi:
    1673             :     case AArch64::STURWi:
    1674             :     case AArch64::STURXi:
    1675             :     case AArch64::LDURSi:
    1676             :     case AArch64::LDURDi:
    1677             :     case AArch64::LDURQi:
    1678             :     case AArch64::LDURWi:
    1679             :     case AArch64::LDURXi:
    1680             :     // Paired instructions.
    1681             :     case AArch64::LDPSi:
    1682             :     case AArch64::LDPSWi:
    1683             :     case AArch64::LDPDi:
    1684             :     case AArch64::LDPQi:
    1685             :     case AArch64::LDPWi:
    1686             :     case AArch64::LDPXi:
    1687             :     case AArch64::STPSi:
    1688             :     case AArch64::STPDi:
    1689             :     case AArch64::STPQi:
    1690             :     case AArch64::STPWi:
    1691             :     case AArch64::STPXi: {
    1692             :       // Make sure this is a reg+imm (as opposed to an address reloc).
    1693       18818 :       if (!getLdStOffsetOp(MI).isImm()) {
    1694             :         ++MBBI;
    1695             :         break;
    1696             :       }
    1697             :       // Look forward to try to form a post-index instruction. For example,
    1698             :       // ldr x0, [x20]
    1699             :       // add x20, x20, #32
    1700             :       //   merged into:
    1701             :       // ldr x0, [x20], #32
    1702             :       MachineBasicBlock::iterator Update =
    1703        7964 :           findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
    1704        7964 :       if (Update != E) {
    1705             :         // Merge the update into the ld/st.
    1706          99 :         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
    1707          99 :         Modified = true;
    1708             :         ++NumPostFolded;
    1709             :         break;
    1710             :       }
    1711             : 
    1712             :       // Don't know how to handle unscaled pre/post-index versions below, so
    1713             :       // move to the next instruction.
    1714        7865 :       if (TII->isUnscaledLdSt(Opc)) {
    1715             :         ++MBBI;
    1716             :         break;
    1717             :       }
    1718             : 
    1719             :       // Look back to try to find a pre-index instruction. For example,
    1720             :       // add x0, x0, #8
    1721             :       // ldr x1, [x0]
    1722             :       //   merged into:
    1723             :       // ldr x1, [x0, #8]!
    1724        7637 :       Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
    1725        7637 :       if (Update != E) {
    1726             :         // Merge the update into the ld/st.
    1727          45 :         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    1728          45 :         Modified = true;
    1729             :         ++NumPreFolded;
    1730             :         break;
    1731             :       }
    1732             :       // The immediate in the load/store is scaled by the size of the memory
    1733             :       // operation. The immediate in the add we're looking for,
    1734             :       // however, is not, so adjust here.
    1735        7592 :       int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
    1736             : 
    1737             :       // Look forward to try to find a post-index instruction. For example,
    1738             :       // ldr x1, [x0, #64]
    1739             :       // add x0, x0, #64
    1740             :       //   merged into:
    1741             :       // ldr x1, [x0, #64]!
    1742        7592 :       Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
    1743        7592 :       if (Update != E) {
    1744             :         // Merge the update into the ld/st.
    1745          37 :         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    1746          37 :         Modified = true;
    1747             :         ++NumPreFolded;
    1748             :         break;
    1749             :       }
    1750             : 
    1751             :       // Nothing found. Just move to the next instruction.
    1752             :       ++MBBI;
    1753             :       break;
    1754             :     }
    1755             :     }
    1756             :   }
    1757             : 
    1758       12628 :   return Modified;
    1759             : }
    1760             : 
    1761       10663 : bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
    1762       10663 :   if (skipFunction(*Fn.getFunction()))
    1763             :     return false;
    1764             : 
    1765       10662 :   Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
    1766       21324 :   TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
    1767       21324 :   TRI = Subtarget->getRegisterInfo();
    1768       21324 :   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    1769             : 
    1770             :   // Resize the modified and used register bitfield trackers.  We do this once
    1771             :   // per function and then clear the bitfield each time we optimize a load or
    1772             :   // store.
    1773       10662 :   ModifiedRegs.resize(TRI->getNumRegs());
    1774       10662 :   UsedRegs.resize(TRI->getNumRegs());
    1775             : 
    1776       10662 :   bool Modified = false;
    1777       10662 :   bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
    1778       44614 :   for (auto &MBB : Fn)
    1779       12628 :     Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt);
    1780             : 
    1781             :   return Modified;
    1782             : }
    1783             : 
    1784             : // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
    1785             : // stores near one another?  Note: The pre-RA instruction scheduler already has
    1786             : // hooks to try and schedule pairable loads/stores together to improve pairing
    1787             : // opportunities.  Thus, pre-RA pairing pass may not be worth the effort.
    1788             : 
    1789             : // FIXME: When pairing store instructions it's very possible for this pass to
    1790             : // hoist a store with a KILL marker above another use (without a KILL marker).
    1791             : // The resulting IR is invalid, but nothing uses the KILL markers after this
    1792             : // pass, so it's never caused a problem in practice.
    1793             : 
    1794             : /// createAArch64LoadStoreOptimizationPass - returns an instance of the
    1795             : /// load / store optimization pass.
    1796         907 : FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
    1797         907 :   return new AArch64LoadStoreOpt();
    1798      216918 : }

Generated by: LCOV version 1.13