LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SILoadStoreOptimizer.cpp (source / functions)
Test:         llvm-toolchain.info
Date:         2017-09-14 15:23:50

                   Hit    Total    Coverage
Lines:             219      221      99.1 %
Functions:          17       19      89.5 %

          Line data    Source code
       1             : //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // This pass tries to fuse DS instructions with nearby immediate offsets.
      11             : // This will fuse operations such as
      12             : //  ds_read_b32 v0, v2 offset:16
      13             : //  ds_read_b32 v1, v2 offset:32
      14             : // ==>
      15             : //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
      16             : //
      17             : //
      18             : // Future improvements:
      19             : //
      20             : // - This currently relies on the scheduler to place loads and stores next to
      21             : //   each other, and then only merges adjacent pairs of instructions. It would
      22             : //   be good to be more flexible with interleaved instructions, and possibly run
      23             : //   before scheduling. It currently misses stores of constants because the
      24             : //   load of the constant into the data register is placed between the
      25             : //   stores, although this is arguably a scheduling problem.
      26             : //
      27             : // - Live interval recomputing seems inefficient. This currently only matches
      28             : //   one pair, and recomputes live intervals and moves on to the next pair. It
      29             : //   would be better to compute a list of all merges that need to occur.
      30             : //
      31             : // - With a list of instructions to process, we can also merge more. If a
      32             : //   cluster of loads has offsets that are too large to fit in the 8-bit
      33             : //   offset fields, but close enough together that the differences do fit, we
      34             : //   can add a common base to the base pointer and use the reduced offsets.
      35             : //
      36             : //===----------------------------------------------------------------------===//
      37             : 
      38             : #include "AMDGPU.h"
      39             : #include "AMDGPUSubtarget.h"
      40             : #include "SIInstrInfo.h"
      41             : #include "SIRegisterInfo.h"
      42             : #include "Utils/AMDGPUBaseInfo.h"
      43             : #include "llvm/ADT/ArrayRef.h"
      44             : #include "llvm/ADT/SmallVector.h"
      45             : #include "llvm/ADT/StringRef.h"
      46             : #include "llvm/Analysis/AliasAnalysis.h"
      47             : #include "llvm/CodeGen/MachineBasicBlock.h"
      48             : #include "llvm/CodeGen/MachineFunction.h"
      49             : #include "llvm/CodeGen/MachineFunctionPass.h"
      50             : #include "llvm/CodeGen/MachineInstr.h"
      51             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      52             : #include "llvm/CodeGen/MachineOperand.h"
      53             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      54             : #include "llvm/IR/DebugLoc.h"
      55             : #include "llvm/Pass.h"
      56             : #include "llvm/Support/Debug.h"
      57             : #include "llvm/Support/MathExtras.h"
      58             : #include "llvm/Support/raw_ostream.h"
      59             : #include <algorithm>
      60             : #include <cassert>
      61             : #include <cstdlib>
      62             : #include <iterator>
      63             : #include <utility>
      64             : 
      65             : using namespace llvm;
      66             : 
      67             : #define DEBUG_TYPE "si-load-store-opt"
      68             : 
      69             : namespace {
      70             : 
      71        1416 : class SILoadStoreOptimizer : public MachineFunctionPass {
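                     :   // Bookkeeping for one candidate merge. I and Paired are the two DS
                     :   // instructions to combine; EltSize is the access size in bytes (4 or 8).
                     :   // Offset0/Offset1 initially hold the raw 16-bit offsets and are rewritten
                     :   // by offsetsCanBeCombined() into the encoded offset0/offset1 fields of the
                     :   // merged instruction. BaseOff is a byte offset folded into the base address
                     :   // when the encoded range would otherwise be exceeded, UseST64 selects the
                     :   // stride-64 instruction variants, and InstsToMove collects instructions
                     :   // that must be sunk below the merged instruction.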
      72     1990200 :   using CombineInfo = struct {
      73             :     MachineBasicBlock::iterator I;
      74             :     MachineBasicBlock::iterator Paired;
      75             :     unsigned EltSize;
      76             :     unsigned Offset0;
      77             :     unsigned Offset1;
      78             :     unsigned BaseOff;
      79             :     bool UseST64;
      80             :     SmallVector<MachineInstr*, 8> InstsToMove;
      81             :    };
      82             : 
      83             : private:
      84             :   const SIInstrInfo *TII = nullptr;
      85             :   const SIRegisterInfo *TRI = nullptr;
      86             :   MachineRegisterInfo *MRI = nullptr;
      87             :   AliasAnalysis *AA = nullptr;
      88             : 
      89             :   static bool offsetsCanBeCombined(CombineInfo &CI);
      90             : 
      91             :   bool findMatchingDSInst(CombineInfo &CI);
      92             : 
      93             :   MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
      94             : 
      95             :   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
      96             : 
      97             : public:
      98             :   static char ID;
      99             : 
     100        1424 :   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
     101        1424 :     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
     102        1424 :   }
     103             : 
     104             :   bool optimizeBlock(MachineBasicBlock &MBB);
     105             : 
     106             :   bool runOnMachineFunction(MachineFunction &MF) override;
     107             : 
     108        1416 :   StringRef getPassName() const override { return "SI Load / Store Optimizer"; }
     109             : 
     110        1416 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     111        1416 :     AU.setPreservesCFG();
     112        1416 :     AU.addRequired<AAResultsWrapperPass>();
     113             : 
     114        1416 :     MachineFunctionPass::getAnalysisUsage(AU);
     115        1416 :   }
     116             : };
     117             : 
     118             : } // end anonymous namespace.
     119             : 
     120       53042 : INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
     121             :                       "SI Load / Store Optimizer", false, false)
     122       53042 : INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
     123      316810 : INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
     124             :                     "SI Load / Store Optimizer", false, false)
     125             : 
     126             : char SILoadStoreOptimizer::ID = 0;
     127             : 
     128             : char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
     129             : 
     130           0 : FunctionPass *llvm::createSILoadStoreOptimizerPass() {
     131           0 :   return new SILoadStoreOptimizer();
     132             : }
     133             : 
     134        1940 : static void moveInstsAfter(MachineBasicBlock::iterator I,
     135             :                            ArrayRef<MachineInstr*> InstsToMove) {
     136        1940 :   MachineBasicBlock *MBB = I->getParent();
     137        1940 :   ++I;
     138        2627 :   for (MachineInstr *MI : InstsToMove) {
     139         687 :     MI->removeFromParent();
     140         687 :     MBB->insert(I, MI);
     141             :   }
     142        1940 : }
     143             : 
     144        2998 : static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
     145             :   // XXX: Should this be looking for implicit defs?
     146        4450 :   for (const MachineOperand &Def : MI.defs())
     147        2904 :     Defs.insert(Def.getReg());
     148        2998 : }
     149             : 
     150         332 : static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
     151             :                                       MachineBasicBlock::iterator B,
     152             :                                       const SIInstrInfo *TII,
     153             :                                       AliasAnalysis * AA) {
     154             :   // RAW or WAR - cannot reorder
     155             :   // WAW - cannot reorder
     156             :   // RAR - safe to reorder
     157         793 :   return !(A->mayStore() || B->mayStore()) ||
     158         742 :     TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
     159             : }
     160             : 
     161             : // Add MI and its defs to the lists if MI reads one of the defs that are
     162             : // already in the list. Returns true in that case.
     163             : static bool
     164        4873 : addToListsIfDependent(MachineInstr &MI,
     165             :                       DenseSet<unsigned> &Defs,
     166             :                       SmallVectorImpl<MachineInstr*> &Insts) {
     167       27645 :   for (MachineOperand &Use : MI.operands()) {
     168             :     // If one of the defs is read, then there is a use of Def between I and the
     169             :     // instruction that I will potentially be merged with. We will need to move
     170             :     // this instruction after the merged instructions.
     171             : 
     172       70580 :     if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
     173         808 :       Insts.push_back(&MI);
     174         808 :       addDefsToList(MI, Defs);
     175         808 :       return true;
     176             :     }
     177             :   }
     178             : 
     179             :   return false;
     180             : }
     181             : 
     182             : static bool
     183        1977 : canMoveInstsAcrossMemOp(MachineInstr &MemOp,
     184             :                         ArrayRef<MachineInstr*> InstsToMove,
     185             :                         const SIInstrInfo *TII,
     186             :                         AliasAnalysis *AA) {
     187             :   assert(MemOp.mayLoadOrStore());
     188             : 
     189        4650 :   for (MachineInstr *InstToMove : InstsToMove) {
     190         710 :     if (!InstToMove->mayLoadOrStore())
     191         689 :       continue;
     192          42 :     if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
     193             :         return false;
     194             :   }
     195             :   return true;
     196             : }
     197             : 
     198        1960 : bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
     199             :   // XXX - Would the same offset be OK? Is there any reason this would happen or
     200             :   // be useful?
     201        1960 :   if (CI.Offset0 == CI.Offset1)
     202             :     return false;
     203             : 
     204             :   // This won't be valid if the offset isn't aligned.
     205        1951 :   if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
     206             :     return false;
     207             : 
     208        1951 :   unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
     209        1951 :   unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
     210        1951 :   CI.UseST64 = false;
     211        1951 :   CI.BaseOff = 0;
     212             : 
     213             :   // If the offset in elements doesn't fit in 8 bits, we might be able to use
     214             :   // the stride 64 versions.
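                     :   // For example, with 4-byte elements, byte offsets 1024 and 2048 give
                     :   // element offsets 256 and 512; neither fits in 8 bits, but both are
                     :   // multiples of 64, so the ST64 forms can encode them as 4 and 8.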
     215        2144 :   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
     216        2011 :       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
     217          30 :     CI.Offset0 = EltOffset0 / 64;
     218          30 :     CI.Offset1 = EltOffset1 / 64;
     219          30 :     CI.UseST64 = true;
     220          30 :     return true;
     221             :   }
     222             : 
     223             :   // Check if the new offsets fit in the reduced 8-bit range.
     224        1921 :   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
     225        1864 :     CI.Offset0 = EltOffset0;
     226        1864 :     CI.Offset1 = EltOffset1;
     227        1864 :     return true;
     228             :   }
     229             : 
     230             :   // Try to shift base address to decrease offsets.
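                     :   // For example, element offsets 1024 and 1025 are both too large to encode
                     :   // directly, but their difference is 1, so the smaller byte offset is added
                     :   // to the base register and the pair is encoded as offsets 0 and 1.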
     231          57 :   unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
     232         114 :   CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
     233             : 
     234          57 :   if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
     235          24 :     CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
     236          24 :     CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
     237          24 :     CI.UseST64 = true;
     238          24 :     return true;
     239             :   }
     240             : 
     241          33 :   if (isUInt<8>(OffsetDiff)) {
     242          28 :     CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
     243          28 :     CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
     244          28 :     return true;
     245             :   }
     246             : 
     247             :   return false;
     248             : }
     249             : 
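                     : // Scan forward from CI.I for another DS instruction with the same opcode and
                     : // base address whose offset can be combined with CI.I's. Intervening
                     : // instructions that must be sunk below the merged instruction are collected
                     : // in CI.InstsToMove. Returns true if a suitable pair was found.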
     250        2901 : bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
     251        5802 :   MachineBasicBlock *MBB = CI.I->getParent();
     252        2901 :   MachineBasicBlock::iterator E = MBB->end();
     253        2901 :   MachineBasicBlock::iterator MBBI = CI.I;
     254             : 
     255        8703 :   int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
     256        2901 :                                            AMDGPU::OpName::addr);
     257        8703 :   const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx);
     258             : 
     259             :   // We only ever merge operations with the same base address register, so don't
     260             :   // bother scanning forward if there are no other uses.
     261        8703 :   if (TargetRegisterInfo::isPhysicalRegister(AddrReg0.getReg()) ||
     262        2901 :       MRI->hasOneNonDBGUse(AddrReg0.getReg()))
     263             :     return false;
     264             : 
     265        2113 :   ++MBBI;
     266             : 
     267        2113 :   DenseSet<unsigned> DefsToMove;
     268        4226 :   addDefsToList(*CI.I, DefsToMove);
     269             : 
     270        5103 :   for ( ; MBBI != E; ++MBBI) {
     271       22740 :     if (MBBI->getOpcode() != CI.I->getOpcode()) {
     272             :       // This is not a matching DS instruction, but we can keep looking as
     273             :       // long as one of these conditions is met:
     274             :       // 1. It is safe to move I down past MBBI.
     275             :       // 2. It is safe to move MBBI down past the instruction that I will
     276             :       //    be merged into.
     277             : 
     278        2976 :       if (MBBI->hasUnmodeledSideEffects()) {
     279             :         // We can't re-order this instruction with respect to other memory
     280             :         // operations, so we fail both conditions mentioned above.
     281             :         return false;
     282             :       }
     283             : 
     284        3310 :       if (MBBI->mayLoadOrStore() &&
     285        4305 :         !memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA)) {
     286             :         // We fail condition #1, but we may still be able to satisfy condition
     287             :         // #2.  Add this instruction to the move list and then we will check
     288             :         // if condition #2 holds once we have selected the matching instruction.
     289         154 :         CI.InstsToMove.push_back(&*MBBI);
     290          77 :         addDefsToList(*MBBI, DefsToMove);
     291          77 :         continue;
     292             :       }
     293             : 
     294             :       // When we match I with another DS instruction, we will be moving I down
     295             :       // to the location of the matched instruction, so any uses of I will need
     296             :       // to be moved down as well.
     297        5776 :       addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
     298        2888 :       continue;
     299             :     }
     300             : 
     301             :     // Don't merge volatiles.
     302        1987 :     if (MBBI->hasOrderedMemoryRef())
     303             :       return false;
     304             : 
     305             :     // Handle a case like
     306             :     //   DS_WRITE_B32 addr, v, idx0
     307             :     //   w = DS_READ_B32 addr, idx0
     308             :     //   DS_WRITE_B32 addr, f(w), idx1
     309             :     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
     310             :     // merging of the two writes.
     311        3970 :     if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
     312           2 :       continue;
     313             : 
     314        3966 :     const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
     315             : 
     316             :     // Check same base pointer. Be careful of subregisters, which can occur with
     317             :     // vectors of pointers.
     318        3943 :     if (AddrReg0.getReg() == AddrReg1.getReg() &&
     319        3920 :         AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
     320        5880 :       int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
     321        1960 :                                                  AMDGPU::OpName::offset);
     322        5880 :       CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm() & 0xffff;
     323        3920 :       CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
     324        1960 :       CI.Paired = MBBI;
     325             : 
     326             :       // Check both offsets fit in the reduced range.
     327             :       // We also need to go through the list of instructions that we plan to
     328             :       // move and make sure they are all safe to move down past the merged
     329             :       // instruction.
     330        1960 :       if (offsetsCanBeCombined(CI))
     331        5838 :         if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
     332             :           return true;
     333             :     }
     334             : 
     335             :     // We've found a load/store that we couldn't merge for some reason.
     336             :     // We could potentially keep looking, but we'd need to make sure that
     337             :     // it was safe to move I and also all the instructions in InstsToMove
     338             :     // down past this instruction.
     339             :     // Check if we can move I across MBBI and if we can move all I's users.
     340         289 :     if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
     341         136 :       !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
     342             :       break;
     343             :   }
     344             :   return false;
     345             : }
     346             : 
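                     : // Rewrite the pair CI.I / CI.Paired as a single DS_READ2* whose result is
                     : // copied back into the original destination registers; e.g. for 4-byte
                     : // elements, roughly:
                     : //   %dst = DS_READ2_B32 %addr, offset0, offset1, 0 (gds)
                     : //   %d0  = COPY %dst.sub0
                     : //   %d1  = COPY %dst.sub1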
     347         594 : MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
     348             :   CombineInfo &CI) {
     349        1188 :   MachineBasicBlock *MBB = CI.I->getParent();
     350             : 
     351             :   // Be careful, since the addresses could be subregisters themselves in weird
     352             :   // cases, like vectors of pointers.
     353        1188 :   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
     354             : 
     355        1188 :   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
     356        1188 :   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
     357             : 
     358         594 :   unsigned NewOffset0 = CI.Offset0;
     359         594 :   unsigned NewOffset1 = CI.Offset1;
     360         594 :   unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32
     361             :                                    : AMDGPU::DS_READ2_B64;
     362             : 
     363         594 :   if (CI.UseST64)
     364          30 :     Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32
     365             :                             : AMDGPU::DS_READ2ST64_B64;
     366             : 
     367         594 :   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
     368         594 :   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
     369             : 
     370         594 :   if (NewOffset0 > NewOffset1) {
     371             :     // Canonicalize the merged instruction so the smaller offset comes first.
     372         253 :     std::swap(NewOffset0, NewOffset1);
     373             :     std::swap(SubRegIdx0, SubRegIdx1);
     374             :   }
     375             : 
     376             :   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
     377             :          (NewOffset0 != NewOffset1) &&
     378             :          "Computed offset doesn't fit");
     379             : 
     380        1188 :   const MCInstrDesc &Read2Desc = TII->get(Opc);
     381             : 
     382         594 :   const TargetRegisterClass *SuperRC
     383         594 :     = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
     384         594 :   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
     385             : 
     386        2376 :   DebugLoc DL = CI.I->getDebugLoc();
     387             : 
     388         594 :   unsigned BaseReg = AddrReg->getReg();
     389         594 :   unsigned BaseRegFlags = 0;
     390         594 :   if (CI.BaseOff) {
     391          26 :     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     392          26 :     BaseRegFlags = RegState::Kill;
     393          78 :     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
     394          52 :            .addImm(CI.BaseOff)
     395          26 :            .addReg(AddrReg->getReg());
     396             :   }
     397             : 
     398             :   MachineInstrBuilder Read2 =
     399        1188 :     BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
     400         594 :       .addReg(BaseReg, BaseRegFlags) // addr
     401        1188 :       .addImm(NewOffset0)            // offset0
     402        1188 :       .addImm(NewOffset1)            // offset1
     403         594 :       .addImm(0)                     // gds
     404        2376 :       .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
     405             : 
     406             :   (void)Read2;
     407             : 
     408        1188 :   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
     409             : 
     410             :   // Copy to the old destination registers.
     411        1188 :   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     412         594 :       .add(*Dest0) // Copy to same destination including flags and sub reg.
     413         594 :       .addReg(DestReg, 0, SubRegIdx0);
     414        1188 :   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     415         594 :                             .add(*Dest1)
     416         594 :                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
     417             : 
     418        1782 :   moveInstsAfter(Copy1, CI.InstsToMove);
     419             : 
     420         594 :   MachineBasicBlock::iterator Next = std::next(CI.I);
     421        1188 :   CI.I->eraseFromParent();
     422        1188 :   CI.Paired->eraseFromParent();
     423             : 
     424             :   DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
     425        1188 :   return Next;
     426             : }
     427             : 
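                     : // Rewrite the pair CI.I / CI.Paired as a single DS_WRITE2* taking both data
                     : // operands (swapped if needed so the smaller offset comes first), again
                     : // adding CI.BaseOff to the base address first when it is nonzero.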
     428        1346 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
     429             :   CombineInfo &CI) {
     430        2692 :   MachineBasicBlock *MBB = CI.I->getParent();
     431             : 
     432             :   // Be sure to use .add(), and not .addReg(), with these. We want to be
     433             :   // sure we preserve the subregister index and any register flags set on them.
     434        2692 :   const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
     435        2692 :   const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
     436             :   const MachineOperand *Data1
     437        2692 :     = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
     438             : 
     439        1346 :   unsigned NewOffset0 = CI.Offset0;
     440        1346 :   unsigned NewOffset1 = CI.Offset1;
     441        1346 :   unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32
     442             :                                    : AMDGPU::DS_WRITE2_B64;
     443             : 
     444        1346 :   if (CI.UseST64)
     445          24 :     Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
     446             :                             : AMDGPU::DS_WRITE2ST64_B64;
     447             : 
     448        1346 :   if (NewOffset0 > NewOffset1) {
     449             :     // Canonicalize the merged instruction so the smaller offset comes first.
     450        1211 :     std::swap(NewOffset0, NewOffset1);
     451             :     std::swap(Data0, Data1);
     452             :   }
     453             : 
     454             :   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
     455             :          (NewOffset0 != NewOffset1) &&
     456             :          "Computed offset doesn't fit");
     457             : 
     458        2692 :   const MCInstrDesc &Write2Desc = TII->get(Opc);
     459        5384 :   DebugLoc DL = CI.I->getDebugLoc();
     460             : 
     461        1346 :   unsigned BaseReg = Addr->getReg();
     462        1346 :   unsigned BaseRegFlags = 0;
     463        1346 :   if (CI.BaseOff) {
     464          26 :     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     465          26 :     BaseRegFlags = RegState::Kill;
     466          78 :     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
     467          52 :            .addImm(CI.BaseOff)
     468          26 :            .addReg(Addr->getReg());
     469             :   }
     470             : 
     471             :   MachineInstrBuilder Write2 =
     472        2692 :     BuildMI(*MBB, CI.Paired, DL, Write2Desc)
     473        1346 :       .addReg(BaseReg, BaseRegFlags) // addr
     474        2692 :       .add(*Data0)                   // data0
     475        2692 :       .add(*Data1)                   // data1
     476        2692 :       .addImm(NewOffset0)            // offset0
     477        2692 :       .addImm(NewOffset1)            // offset1
     478        1346 :       .addImm(0)                     // gds
     479        5384 :       .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
     480             : 
     481        4038 :   moveInstsAfter(Write2, CI.InstsToMove);
     482             : 
     483        1346 :   MachineBasicBlock::iterator Next = std::next(CI.I);
     484        2692 :   CI.I->eraseFromParent();
     485        2692 :   CI.Paired->eraseFromParent();
     486             : 
     487             :   DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
     488        2692 :   return Next;
     489             : }
     490             : 
     491             : // Scan through looking for adjacent LDS operations with constant offsets from
     492             : // the same base register. We rely on the scheduler to do the hard work of
     493             : // clustering nearby loads, and assume these are all adjacent.
     494       16540 : bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
     495       16540 :   bool Modified = false;
     496             : 
     497      397148 :   for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
     498      347528 :     MachineInstr &MI = *I;
     499             : 
     500             :     // Don't combine if volatile.
     501      363356 :     if (MI.hasOrderedMemoryRef()) {
     502       15828 :       ++I;
     503       34557 :       continue;
     504             :     }
     505             : 
     506      660499 :     CombineInfo CI;
     507      331700 :     CI.I = I;
     508      331700 :     unsigned Opc = MI.getOpcode();
     509      332927 :     if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
     510        1227 :       CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
     511        1227 :       if (findMatchingDSInst(CI)) {
     512         594 :         Modified = true;
     513         594 :         I = mergeRead2Pair(CI);
     514             :       } else {
     515             :         ++I;
     516             :       }
     517             : 
     518        1227 :       continue;
     519      332147 :     } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
     520        1674 :       CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
     521        1674 :       if (findMatchingDSInst(CI)) {
     522        1346 :         Modified = true;
     523        1346 :         I = mergeWrite2Pair(CI);
     524             :       } else {
     525             :         ++I;
     526             :       }
     527             : 
     528        1674 :       continue;
     529             :     }
     530             : 
     531      328799 :     ++I;
     532             :   }
     533             : 
     534       16540 :   return Modified;
     535             : }
     536             : 
     537       14649 : bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
     538       14649 :   if (skipFunction(*MF.getFunction()))
     539             :     return false;
     540             : 
     541       14648 :   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
     542       14648 :   if (!STM.loadStoreOptEnabled())
     543             :     return false;
     544             : 
     545       14647 :   TII = STM.getInstrInfo();
     546       29294 :   TRI = &TII->getRegisterInfo();
     547             : 
     548       14647 :   MRI = &MF.getRegInfo();
     549       29294 :   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
     550             : 
     551             :   assert(MRI->isSSA() && "Must be run on SSA");
     552             : 
     553             :   DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
     554             : 
     555       14647 :   bool Modified = false;
     556             : 
     557       60481 :   for (MachineBasicBlock &MBB : MF)
     558       16540 :     Modified |= optimizeBlock(MBB);
     559             : 
     560             :   return Modified;
     561             : }

Generated by: LCOV version 1.13