LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SILoadStoreOptimizer.cpp (source / functions)
Test:         llvm-toolchain.info
Date:         2018-09-23 13:06:45
Coverage:     Lines:     289 of 408 hit (70.8 %)
              Functions:  16 of  25 hit (64.0 %)

          Line data    Source code
       1             : //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
       10             : // This pass tries to fuse DS instructions with nearby immediate offsets.
      11             : // This will fuse operations such as
      12             : //  ds_read_b32 v0, v2 offset:16
      13             : //  ds_read_b32 v1, v2 offset:32
      14             : // ==>
      15             : //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
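                      : // (The read2 offsets are encoded in units of the element size, 4 bytes for
                      : // b32, so byte offsets 16 and 32 become offset0:4 and offset1:8.)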
      16             : //
      17             : // The same is done for certain SMEM and VMEM opcodes, e.g.:
      18             : //  s_buffer_load_dword s4, s[0:3], 4
      19             : //  s_buffer_load_dword s5, s[0:3], 8
      20             : // ==>
      21             : //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
      22             : //
      23             : //
      24             : // Future improvements:
      25             : //
      26             : // - This currently relies on the scheduler to place loads and stores next to
      27             : //   each other, and then only merges adjacent pairs of instructions. It would
      28             : //   be good to be more flexible with interleaved instructions, and possibly run
       29             : //   before scheduling. It currently misses stores of constants because loading
      30             : //   the constant into the data register is placed between the stores, although
      31             : //   this is arguably a scheduling problem.
      32             : //
      33             : // - Live interval recomputing seems inefficient. This currently only matches
      34             : //   one pair, and recomputes live intervals and moves on to the next pair. It
      35             : //   would be better to compute a list of all merges that need to occur.
      36             : //
       37             : // - With a list of instructions to process, we can also merge more. If a
       38             : //   cluster of loads has offsets that are too large to fit in the 8-bit
       39             : //   offset fields, but are close enough together that the differences do fit,
       40             : //   we can add to the base pointer and use the new, reduced offsets.
      41             : //
      42             : //===----------------------------------------------------------------------===//
      43             : 
      44             : #include "AMDGPU.h"
      45             : #include "AMDGPUSubtarget.h"
      46             : #include "SIInstrInfo.h"
      47             : #include "SIRegisterInfo.h"
      48             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      49             : #include "Utils/AMDGPUBaseInfo.h"
      50             : #include "llvm/ADT/ArrayRef.h"
      51             : #include "llvm/ADT/SmallVector.h"
      52             : #include "llvm/ADT/StringRef.h"
      53             : #include "llvm/Analysis/AliasAnalysis.h"
      54             : #include "llvm/CodeGen/MachineBasicBlock.h"
      55             : #include "llvm/CodeGen/MachineFunction.h"
      56             : #include "llvm/CodeGen/MachineFunctionPass.h"
      57             : #include "llvm/CodeGen/MachineInstr.h"
      58             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      59             : #include "llvm/CodeGen/MachineOperand.h"
      60             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      61             : #include "llvm/IR/DebugLoc.h"
      62             : #include "llvm/Pass.h"
      63             : #include "llvm/Support/Debug.h"
      64             : #include "llvm/Support/MathExtras.h"
      65             : #include "llvm/Support/raw_ostream.h"
      66             : #include <algorithm>
      67             : #include <cassert>
      68             : #include <cstdlib>
      69             : #include <iterator>
      70             : #include <utility>
      71             : 
      72             : using namespace llvm;
      73             : 
      74             : #define DEBUG_TYPE "si-load-store-opt"
      75             : 
      76             : namespace {
      77             : 
      78             : class SILoadStoreOptimizer : public MachineFunctionPass {
      79             :   enum InstClassEnum {
      80             :     DS_READ_WRITE,
      81             :     S_BUFFER_LOAD_IMM,
      82             :     BUFFER_LOAD_OFFEN,
      83             :     BUFFER_LOAD_OFFSET,
      84             :     BUFFER_STORE_OFFEN,
      85             :     BUFFER_STORE_OFFSET,
      86             :   };
      87             : 
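                      :   // Bookkeeping for one candidate merge: I and Paired are the two memory
                      :   // instructions to combine, Offset0/Offset1 are their immediate offsets
                      :   // (possibly rewritten relative to BaseOff), and InstsToMove holds the
                      :   // instructions that must be moved below the merged instruction so their
                      :   // dependencies remain satisfied.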
      88             :   struct CombineInfo {
      89             :     MachineBasicBlock::iterator I;
      90             :     MachineBasicBlock::iterator Paired;
      91             :     unsigned EltSize;
      92             :     unsigned Offset0;
      93             :     unsigned Offset1;
      94             :     unsigned BaseOff;
      95             :     InstClassEnum InstClass;
      96             :     bool GLC0;
      97             :     bool GLC1;
      98             :     bool SLC0;
      99             :     bool SLC1;
     100             :     bool UseST64;
     101             :     bool IsX2;
     102             :     SmallVector<MachineInstr*, 8> InstsToMove;
      103             :   };
     104             : 
     105             : private:
     106             :   const GCNSubtarget *STM = nullptr;
     107             :   const SIInstrInfo *TII = nullptr;
     108             :   const SIRegisterInfo *TRI = nullptr;
     109             :   MachineRegisterInfo *MRI = nullptr;
     110             :   AliasAnalysis *AA = nullptr;
     111             :   unsigned CreatedX2;
     112             : 
     113             :   static bool offsetsCanBeCombined(CombineInfo &CI);
     114             : 
     115             :   bool findMatchingInst(CombineInfo &CI);
     116             : 
     117             :   unsigned read2Opcode(unsigned EltSize) const;
     118             :   unsigned read2ST64Opcode(unsigned EltSize) const;
     119             :   MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
     120             : 
     121             :   unsigned write2Opcode(unsigned EltSize) const;
     122             :   unsigned write2ST64Opcode(unsigned EltSize) const;
     123             :   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
     124             :   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
     125             :   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
     126             :   unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
     127             :                                     bool &IsOffen) const;
     128             :   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
     129             : 
     130             : public:
     131             :   static char ID;
     132             : 
     133        1865 :   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
     134        1865 :     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
     135        1865 :   }
     136             : 
     137             :   bool optimizeBlock(MachineBasicBlock &MBB);
     138             : 
     139             :   bool runOnMachineFunction(MachineFunction &MF) override;
     140             : 
     141        1851 :   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
     142             : 
     143        1851 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     144        1851 :     AU.setPreservesCFG();
     145             :     AU.addRequired<AAResultsWrapperPass>();
     146             : 
     147        1851 :     MachineFunctionPass::getAnalysisUsage(AU);
     148        1851 :   }
     149             : };
     150             : 
     151             : } // end anonymous namespace.
     152             : 
     153       92970 : INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
     154             :                       "SI Load Store Optimizer", false, false)
     155       92970 : INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
     156      219113 : INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
     157             :                     "SI Load Store Optimizer", false, false)
     158             : 
     159             : char SILoadStoreOptimizer::ID = 0;
     160             : 
     161             : char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
     162             : 
     163           0 : FunctionPass *llvm::createSILoadStoreOptimizerPass() {
     164           0 :   return new SILoadStoreOptimizer();
     165             : }
     166             : 
     167        3375 : static void moveInstsAfter(MachineBasicBlock::iterator I,
     168             :                            ArrayRef<MachineInstr*> InstsToMove) {
     169        3375 :   MachineBasicBlock *MBB = I->getParent();
     170             :   ++I;
     171        4434 :   for (MachineInstr *MI : InstsToMove) {
     172        1059 :     MI->removeFromParent();
     173             :     MBB->insert(I, MI);
     174             :   }
     175        3375 : }
     176             : 
     177        8219 : static void addDefsUsesToList(const MachineInstr &MI,
     178             :                               DenseSet<unsigned> &RegDefs,
     179             :                               DenseSet<unsigned> &PhysRegUses) {
     180       57029 :   for (const MachineOperand &Op : MI.operands()) {
     181       48810 :     if (Op.isReg()) {
     182       28192 :       if (Op.isDef())
     183        4593 :         RegDefs.insert(Op.getReg());
     184       23596 :       else if (Op.readsReg() &&
     185       23596 :                TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
     186        9521 :         PhysRegUses.insert(Op.getReg());
     187             :     }
     188             :   }
     189        8219 : }
     190             : 
     191       17029 : static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
     192             :                                       MachineBasicBlock::iterator B,
     193             :                                       const SIInstrInfo *TII,
     194             :                                       AliasAnalysis * AA) {
     195             :   // RAW or WAR - cannot reorder
     196             :   // WAW - cannot reorder
     197             :   // RAR - safe to reorder
     198       19608 :   return !(A->mayStore() || B->mayStore()) ||
     199        2579 :     TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
     200             : }
     201             : 
       202             : // Add MI and its defs to the lists if MI reads a listed def, or defines a
       203             : // physical register that a listed instruction uses. Returns true in that case.
     204             : static bool
     205       61073 : addToListsIfDependent(MachineInstr &MI,
     206             :                       DenseSet<unsigned> &RegDefs,
     207             :                       DenseSet<unsigned> &PhysRegUses,
     208             :                       SmallVectorImpl<MachineInstr*> &Insts) {
     209      368655 :   for (MachineOperand &Use : MI.operands()) {
     210             :     // If one of the defs is read, then there is a use of Def between I and the
     211             :     // instruction that I will potentially be merged with. We will need to move
     212             :     // this instruction after the merged instructions.
     213             :     //
     214             :     // Similarly, if there is a def which is read by an instruction that is to
     215             :     // be moved for merging, then we need to move the def-instruction as well.
     216             :     // This can only happen for physical registers such as M0; virtual
     217             :     // registers are in SSA form.
     218      310335 :     if (Use.isReg() &&
     219      317197 :         ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
     220      113406 :          (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
     221      308384 :           PhysRegUses.count(Use.getReg())))) {
     222        2753 :       Insts.push_back(&MI);
     223        2753 :       addDefsUsesToList(MI, RegDefs, PhysRegUses);
     224        2753 :       return true;
     225             :     }
     226             :   }
     227             : 
     228             :   return false;
     229             : }
     230             : 
     231             : static bool
     232       19499 : canMoveInstsAcrossMemOp(MachineInstr &MemOp,
     233             :                         ArrayRef<MachineInstr*> InstsToMove,
     234             :                         const SIInstrInfo *TII,
     235             :                         AliasAnalysis *AA) {
     236             :   assert(MemOp.mayLoadOrStore());
     237             : 
     238       43213 :   for (MachineInstr *InstToMove : InstsToMove) {
     239       23761 :     if (!InstToMove->mayLoadOrStore())
     240             :       continue;
     241          64 :     if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
     242             :         return false;
     243             :   }
     244             :   return true;
     245             : }
     246             : 
     247        7109 : bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
     248             :   // XXX - Would the same offset be OK? Is there any reason this would happen or
     249             :   // be useful?
     250        7109 :   if (CI.Offset0 == CI.Offset1)
     251             :     return false;
     252             : 
     253             :   // This won't be valid if the offset isn't aligned.
     254        7085 :   if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
     255             :     return false;
     256             : 
     257        7085 :   unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
     258        7085 :   unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
     259        7085 :   CI.UseST64 = false;
     260        7085 :   CI.BaseOff = 0;
     261             : 
     262             :   // Handle SMEM and VMEM instructions.
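                      :   // A dword pair merges into an x2 access and an x2 pair into an x4 access,
                      :   // so the two offsets must be exactly one element apart (two for an x2
                      :   // pair), and the cache policy bits must match (slc is skipped for
                      :   // S_BUFFER_LOAD_IMM, which has no slc operand).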
     263        7085 :   if (CI.InstClass != DS_READ_WRITE) {
     264        4040 :     unsigned Diff = CI.IsX2 ? 2 : 1;
     265        7727 :     return (EltOffset0 + Diff == EltOffset1 ||
     266        3687 :             EltOffset1 + Diff == EltOffset0) &&
     267        4040 :            CI.GLC0 == CI.GLC1 &&
     268        4040 :            (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
     269             :   }
     270             : 
       271             :   // If the offset in elements doesn't fit in 8 bits, we might be able to use
       272             :   // the stride-64 versions.
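                      :   // For example, b32 accesses at byte offsets 4096 and 8192 have element
                      :   // offsets 1024 and 2048; both are multiples of 64, so they encode as a
                      :   // *2ST64 instruction with offset0 = 16 and offset1 = 32.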
     273         269 :   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
     274        3087 :       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
     275          42 :     CI.Offset0 = EltOffset0 / 64;
     276          42 :     CI.Offset1 = EltOffset1 / 64;
     277          42 :     CI.UseST64 = true;
     278          42 :     return true;
     279             :   }
     280             : 
     281             :   // Check if the new offsets fit in the reduced 8-bit range.
     282        3003 :   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
     283        2938 :     CI.Offset0 = EltOffset0;
     284        2938 :     CI.Offset1 = EltOffset1;
     285        2938 :     return true;
     286             :   }
     287             : 
     288             :   // Try to shift base address to decrease offsets.
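                      :   // The smaller byte offset becomes BaseOff (added back to the address
                      :   // register when the pair is merged), and both offsets are rewritten
                      :   // relative to it. E.g. byte offsets 1024 and 1032 with a 4-byte element
                      :   // size become BaseOff = 1024 with element offsets 0 and 2.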
     289          65 :   unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
     290          65 :   CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
     291             : 
     292          65 :   if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
     293          24 :     CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
     294          24 :     CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
     295          24 :     CI.UseST64 = true;
     296          24 :     return true;
     297             :   }
     298             : 
     299          41 :   if (isUInt<8>(OffsetDiff)) {
     300          32 :     CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
     301          32 :     CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
     302          32 :     return true;
     303             :   }
     304             : 
     305             :   return false;
     306             : }
     307             : 
     308       17246 : bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     309       17246 :   MachineBasicBlock *MBB = CI.I->getParent();
     310             :   MachineBasicBlock::iterator E = MBB->end();
     311             :   MachineBasicBlock::iterator MBBI = CI.I;
     312             : 
     313       17246 :   unsigned AddrOpName[3] = {0};
     314             :   int AddrIdx[3];
     315             :   const MachineOperand *AddrReg[3];
     316             :   unsigned NumAddresses = 0;
     317             : 
     318       17246 :   switch (CI.InstClass) {
     319        4433 :   case DS_READ_WRITE:
     320        4433 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
     321        4433 :     break;
     322         567 :   case S_BUFFER_LOAD_IMM:
     323         567 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
     324         567 :     break;
     325        5526 :   case BUFFER_LOAD_OFFEN:
     326             :   case BUFFER_STORE_OFFEN:
     327        5526 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
     328        5526 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
     329        5526 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
     330        5526 :     break;
     331        6720 :   case BUFFER_LOAD_OFFSET:
     332             :   case BUFFER_STORE_OFFSET:
     333        6720 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
     334        6720 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
     335        6720 :     break;
     336             :   }
     337             : 
     338       22816 :   for (unsigned i = 0; i < NumAddresses; i++) {
     339       36408 :     AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
     340       36408 :     AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
     341             : 
     342             :     // We only ever merge operations with the same base address register, so don't
     343             :     // bother scanning forward if there are no other uses.
     344       18204 :     if (AddrReg[i]->isReg() &&
     345       29461 :         (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
     346       12019 :          MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
     347       12634 :       return false;
     348             :   }
     349             : 
     350             :   ++MBBI;
     351             : 
     352             :   DenseSet<unsigned> RegDefsToMove;
     353             :   DenseSet<unsigned> PhysRegUsesToMove;
     354        4612 :   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
     355             : 
     356       63130 :   for ( ; MBBI != E; ++MBBI) {
     357      185937 :     if (MBBI->getOpcode() != CI.I->getOpcode()) {
     358             :       // This is not a matching DS instruction, but we can keep looking as
       359             :       // long as one of these conditions is met:
     360             :       // 1. It is safe to move I down past MBBI.
     361             :       // 2. It is safe to move MBBI down past the instruction that I will
     362             :       //    be merged into.
     363             : 
     364       54800 :       if (MBBI->hasUnmodeledSideEffects()) {
     365             :         // We can't re-order this instruction with respect to other memory
     366             :         // operations, so we fail both conditions mentioned above.
     367             :         return false;
     368             :       }
     369             : 
     370       67953 :       if (MBBI->mayLoadOrStore() &&
     371       38770 :         (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
     372       24744 :          !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
     373             :         // We fail condition #1, but we may still be able to satisfy condition
     374             :         // #2.  Add this instruction to the move list and then we will check
     375             :         // if condition #2 holds once we have selected the matching instruction.
     376         854 :         CI.InstsToMove.push_back(&*MBBI);
     377         854 :         addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
     378         854 :         continue;
     379             :       }
     380             : 
       381             :       // When we match I with another DS instruction, we will be moving I down
       382             :       // to the location of the matched instruction, so any uses of I will need
       383             :       // to be moved down as well.
     384       53900 :       addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
     385             :                             CI.InstsToMove);
     386       53900 :       continue;
     387             :     }
     388             : 
     389             :     // Don't merge volatiles.
     390        7179 :     if (MBBI->hasOrderedMemoryRef())
     391             :       return false;
     392             : 
     393             :     // Handle a case like
     394             :     //   DS_WRITE_B32 addr, v, idx0
     395             :     //   w = DS_READ_B32 addr, idx0
     396             :     //   DS_WRITE_B32 addr, f(w), idx1
     397             :     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
     398             :     // merging of the two writes.
     399        7173 :     if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
     400             :                               CI.InstsToMove))
     401             :       continue;
     402             : 
     403             :     bool Match = true;
     404       15428 :     for (unsigned i = 0; i < NumAddresses; i++) {
     405        8319 :       const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
     406             : 
     407       16638 :       if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
     408        1066 :         if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
     409        1066 :             AddrReg[i]->getImm() != AddrRegNext.getImm()) {
     410             :           Match = false;
     411             :           break;
     412             :         }
     413             :         continue;
     414             :       }
     415             : 
     416             :       // Check same base pointer. Be careful of subregisters, which can occur with
     417             :       // vectors of pointers.
     418        7253 :       if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
     419             :           AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
     420             :         Match = false;
     421             :         break;
     422             :       }
     423             :     }
     424             : 
     425        7141 :     if (Match) {
     426        7109 :       int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
     427             :                                                  AMDGPU::OpName::offset);
     428        7109 :       CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
     429        7109 :       CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
     430        7109 :       CI.Paired = MBBI;
     431             : 
     432        7109 :       if (CI.InstClass == DS_READ_WRITE) {
     433        3061 :         CI.Offset0 &= 0xffff;
     434        3061 :         CI.Offset1 &= 0xffff;
     435             :       } else {
     436        4048 :         CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
     437        4048 :         CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
     438        4048 :         if (CI.InstClass != S_BUFFER_LOAD_IMM) {
     439        1066 :           CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
     440        1066 :           CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
     441             :         }
     442             :       }
     443             : 
      444             :       // Check that both offsets fit in the reduced range.
     445             :       // We also need to go through the list of instructions that we plan to
     446             :       // move and make sure they are all safe to move down past the merged
     447             :       // instruction.
     448        7109 :       if (offsetsCanBeCombined(CI))
     449        6762 :         if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
     450             :           return true;
     451             :     }
     452             : 
     453             :     // We've found a load/store that we couldn't merge for some reason.
     454             :     // We could potentially keep looking, but we'd need to make sure that
      455             :     // it was safe to move I and also all the instructions in InstsToMove
      456             :     // down past this instruction.
      457             :     // Check if we can move I across MBBI and if we can move all of I's users.
     458       11278 :     if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
     459       11258 :         !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
     460             :       break;
     461             :   }
     462             :   return false;
     463             : }
     464             : 
     465           0 : unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
     466         928 :   if (STM->ldsRequiresM0Init())
     467         737 :     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
     468         191 :   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
     469             : }
     470             : 
     471           0 : unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
     472          38 :   if (STM->ldsRequiresM0Init())
     473          22 :     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
     474             : 
     475          16 :   return (EltSize == 4) ?
     476             :     AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
     477             : }
     478             : 
     479         966 : MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
     480             :   CombineInfo &CI) {
     481         966 :   MachineBasicBlock *MBB = CI.I->getParent();
     482             : 
     483             :   // Be careful, since the addresses could be subregisters themselves in weird
     484             :   // cases, like vectors of pointers.
     485         966 :   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
     486             : 
     487         966 :   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
     488         966 :   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
     489             : 
     490         966 :   unsigned NewOffset0 = CI.Offset0;
     491         966 :   unsigned NewOffset1 = CI.Offset1;
     492         966 :   unsigned Opc = CI.UseST64 ?
     493         966 :     read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
     494             : 
     495         966 :   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
     496         966 :   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
     497             : 
     498         966 :   if (NewOffset0 > NewOffset1) {
     499             :     // Canonicalize the merged instruction so the smaller offset comes first.
     500             :     std::swap(NewOffset0, NewOffset1);
     501             :     std::swap(SubRegIdx0, SubRegIdx1);
     502             :   }
     503             : 
     504             :   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
     505             :          (NewOffset0 != NewOffset1) &&
     506             :          "Computed offset doesn't fit");
     507             : 
     508         966 :   const MCInstrDesc &Read2Desc = TII->get(Opc);
     509             : 
     510             :   const TargetRegisterClass *SuperRC
     511         966 :     = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
     512        1932 :   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
     513             : 
     514             :   DebugLoc DL = CI.I->getDebugLoc();
     515             : 
     516         966 :   unsigned BaseReg = AddrReg->getReg();
     517             :   unsigned BaseRegFlags = 0;
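                      :   // If offsetsCanBeCombined rebased the offsets, materialize BaseOff in an
                      :   // SGPR and add it to the original address to form the new base register.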
     518         966 :   if (CI.BaseOff) {
     519          56 :     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     520          56 :     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
     521          28 :       .addImm(CI.BaseOff);
     522             : 
     523          56 :     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     524             :     BaseRegFlags = RegState::Kill;
     525             : 
     526          56 :     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
     527          28 :       .addReg(ImmReg)
     528          28 :       .addReg(AddrReg->getReg());
     529             :   }
     530             : 
     531         966 :   MachineInstrBuilder Read2 = BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
     532         966 :                                   .addReg(BaseReg, BaseRegFlags) // addr
     533         966 :                                   .addImm(NewOffset0)            // offset0
     534         966 :                                   .addImm(NewOffset1)            // offset1
     535             :                                   .addImm(0)                     // gds
     536        1932 :                                   .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
     537             : 
     538             :   (void)Read2;
     539             : 
     540         966 :   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
     541             : 
     542             :   // Copy to the old destination registers.
     543         966 :   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     544             :       .add(*Dest0) // Copy to same destination including flags and sub reg.
     545         966 :       .addReg(DestReg, 0, SubRegIdx0);
     546         966 :   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     547             :                             .add(*Dest1)
     548         966 :                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
     549             : 
     550         966 :   moveInstsAfter(Copy1, CI.InstsToMove);
     551             : 
     552         966 :   MachineBasicBlock::iterator Next = std::next(CI.I);
     553         966 :   CI.I->eraseFromParent();
     554         966 :   CI.Paired->eraseFromParent();
     555             : 
     556             :   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
     557         966 :   return Next;
     558             : }
     559             : 
     560           0 : unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
     561        2036 :   if (STM->ldsRequiresM0Init())
     562        1429 :     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
     563         607 :   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
     564             : }
     565             : 
     566           0 : unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
     567          28 :   if (STM->ldsRequiresM0Init())
     568          14 :     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
     569             : 
     570          14 :   return (EltSize == 4) ?
     571             :     AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
     572             : }
     573             : 
     574        2064 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
     575             :   CombineInfo &CI) {
     576        2064 :   MachineBasicBlock *MBB = CI.I->getParent();
     577             : 
      578             :   // Be sure to use .add(), and not .addReg(), with these. We want to be
     579             :   // sure we preserve the subregister index and any register flags set on them.
     580        2064 :   const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
     581        2064 :   const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
     582             :   const MachineOperand *Data1
     583        2064 :     = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
     584             : 
     585        2064 :   unsigned NewOffset0 = CI.Offset0;
     586        2064 :   unsigned NewOffset1 = CI.Offset1;
     587        2064 :   unsigned Opc = CI.UseST64 ?
     588        2064 :     write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
     589             : 
     590        2064 :   if (NewOffset0 > NewOffset1) {
     591             :     // Canonicalize the merged instruction so the smaller offset comes first.
     592             :     std::swap(NewOffset0, NewOffset1);
     593             :     std::swap(Data0, Data1);
     594             :   }
     595             : 
     596             :   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
     597             :          (NewOffset0 != NewOffset1) &&
     598             :          "Computed offset doesn't fit");
     599             : 
     600        2064 :   const MCInstrDesc &Write2Desc = TII->get(Opc);
     601             :   DebugLoc DL = CI.I->getDebugLoc();
     602             : 
     603        2064 :   unsigned BaseReg = AddrReg->getReg();
     604             :   unsigned BaseRegFlags = 0;
     605        2064 :   if (CI.BaseOff) {
     606          56 :     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     607          56 :     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
     608          28 :       .addImm(CI.BaseOff);
     609             : 
     610          56 :     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     611             :     BaseRegFlags = RegState::Kill;
     612             : 
     613          56 :     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
     614          28 :       .addReg(ImmReg)
     615          28 :       .addReg(AddrReg->getReg());
     616             :   }
     617             : 
     618        2064 :   MachineInstrBuilder Write2 = BuildMI(*MBB, CI.Paired, DL, Write2Desc)
     619        2064 :                                    .addReg(BaseReg, BaseRegFlags) // addr
     620             :                                    .add(*Data0)                   // data0
     621             :                                    .add(*Data1)                   // data1
     622        2064 :                                    .addImm(NewOffset0)            // offset0
     623        2064 :                                    .addImm(NewOffset1)            // offset1
     624             :                                    .addImm(0)                     // gds
     625        4128 :                                    .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
     626             : 
     627        2064 :   moveInstsAfter(Write2, CI.InstsToMove);
     628             : 
     629        2064 :   MachineBasicBlock::iterator Next = std::next(CI.I);
     630        2064 :   CI.I->eraseFromParent();
     631        2064 :   CI.Paired->eraseFromParent();
     632             : 
     633             :   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
     634        2064 :   return Next;
     635             : }
     636             : 
     637           0 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
     638             :   CombineInfo &CI) {
     639           0 :   MachineBasicBlock *MBB = CI.I->getParent();
     640             :   DebugLoc DL = CI.I->getDebugLoc();
     641           0 :   unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
     642             :                               AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
     643             : 
     644             :   const TargetRegisterClass *SuperRC =
     645           0 :     CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
     646           0 :   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
     647           0 :   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
     648             : 
     649           0 :   BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
     650           0 :       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
     651           0 :       .addImm(MergedOffset) // offset
     652           0 :       .addImm(CI.GLC0)      // glc
     653           0 :       .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
     654             : 
     655           0 :   unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
     656           0 :   unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
     657             : 
     658             :   // Handle descending offsets
     659           0 :   if (CI.Offset0 > CI.Offset1)
     660             :     std::swap(SubRegIdx0, SubRegIdx1);
     661             : 
     662             :   // Copy to the old destination registers.
     663           0 :   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
     664           0 :   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
     665           0 :   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
     666             : 
     667           0 :   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     668             :       .add(*Dest0) // Copy to same destination including flags and sub reg.
     669           0 :       .addReg(DestReg, 0, SubRegIdx0);
     670           0 :   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     671             :                             .add(*Dest1)
     672           0 :                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
     673             : 
     674           0 :   moveInstsAfter(Copy1, CI.InstsToMove);
     675             : 
     676           0 :   MachineBasicBlock::iterator Next = std::next(CI.I);
     677           0 :   CI.I->eraseFromParent();
     678           0 :   CI.Paired->eraseFromParent();
     679           0 :   return Next;
     680             : }
     681             : 
     682           0 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
     683             :   CombineInfo &CI) {
     684           0 :   MachineBasicBlock *MBB = CI.I->getParent();
     685             :   DebugLoc DL = CI.I->getDebugLoc();
     686             :   unsigned Opcode;
     687             : 
     688           0 :   if (CI.InstClass == BUFFER_LOAD_OFFEN) {
     689           0 :     Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
     690             :                        AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
     691             :   } else {
     692           0 :     Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
     693             :                        AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
     694             :   }
     695             : 
     696             :   const TargetRegisterClass *SuperRC =
     697           0 :     CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
     698           0 :   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
     699           0 :   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
     700             : 
     701           0 :   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
     702             : 
     703           0 :   if (CI.InstClass == BUFFER_LOAD_OFFEN)
     704           0 :       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
     705             : 
     706           0 :   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
     707           0 :       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
     708           0 :       .addImm(MergedOffset) // offset
     709           0 :       .addImm(CI.GLC0)      // glc
     710           0 :       .addImm(CI.SLC0)      // slc
     711             :       .addImm(0)            // tfe
     712           0 :       .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
     713             : 
     714           0 :   unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
     715           0 :   unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
     716             : 
     717             :   // Handle descending offsets
     718           0 :   if (CI.Offset0 > CI.Offset1)
     719             :     std::swap(SubRegIdx0, SubRegIdx1);
     720             : 
     721             :   // Copy to the old destination registers.
     722           0 :   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
     723           0 :   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
     724           0 :   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
     725             : 
     726           0 :   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     727             :       .add(*Dest0) // Copy to same destination including flags and sub reg.
     728           0 :       .addReg(DestReg, 0, SubRegIdx0);
     729           0 :   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     730             :                             .add(*Dest1)
     731           0 :                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
     732             : 
     733           0 :   moveInstsAfter(Copy1, CI.InstsToMove);
     734             : 
     735           0 :   MachineBasicBlock::iterator Next = std::next(CI.I);
     736           0 :   CI.I->eraseFromParent();
     737           0 :   CI.Paired->eraseFromParent();
     738           0 :   return Next;
     739             : }
     740             : 
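                      : // Return the wider (x2 or x4) store opcode to use when merging a pair of
                      : // buffer stores headed by I, or 0 if I is not a buffer store this pass can
                      : // widen. IsX2 reports whether I is already an x2 store; IsOffen reports
                      : // whether it uses the OFFEN addressing form.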
     741           0 : unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
     742             :   const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
     743           0 :   IsX2 = false;
     744           0 :   IsOffen = false;
     745             : 
     746           0 :   switch (I.getOpcode()) {
     747           0 :   case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
     748           0 :     IsOffen = true;
     749           0 :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
     750           0 :   case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
     751           0 :     IsOffen = true;
     752           0 :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
     753           0 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
     754           0 :     IsX2 = true;
     755           0 :     IsOffen = true;
     756           0 :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
     757           0 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
     758           0 :     IsX2 = true;
     759           0 :     IsOffen = true;
     760           0 :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
     761             :   case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
     762             :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
     763           0 :   case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
     764           0 :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
     765           0 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
     766           0 :     IsX2 = true;
     767           0 :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
     768           0 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
     769           0 :     IsX2 = true;
     770           0 :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
     771             :   }
     772           0 :   return 0;
     773             : }
     774             : 
     775           0 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
     776             :   CombineInfo &CI) {
     777           0 :   MachineBasicBlock *MBB = CI.I->getParent();
     778             :   DebugLoc DL = CI.I->getDebugLoc();
     779             :   bool Unused1, Unused2;
     780           0 :   unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);
     781             : 
     782           0 :   unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
     783           0 :   unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
     784             : 
     785             :   // Handle descending offsets
     786           0 :   if (CI.Offset0 > CI.Offset1)
     787             :     std::swap(SubRegIdx0, SubRegIdx1);
     788             : 
     789             :   // Copy to the new source register.
     790             :   const TargetRegisterClass *SuperRC =
     791           0 :     CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
     792           0 :   unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
     793             : 
     794           0 :   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
     795           0 :   const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
     796             : 
     797           0 :   BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
     798             :       .add(*Src0)
     799           0 :       .addImm(SubRegIdx0)
     800             :       .add(*Src1)
     801           0 :       .addImm(SubRegIdx1);
     802             : 
     803           0 :   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
     804           0 :       .addReg(SrcReg, RegState::Kill);
     805             : 
     806           0 :   if (CI.InstClass == BUFFER_STORE_OFFEN)
     807           0 :       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
     808             : 
     809           0 :   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
     810           0 :       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
     811           0 :       .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
     812           0 :       .addImm(CI.GLC0)                          // glc
     813           0 :       .addImm(CI.SLC0)                          // slc
     814             :       .addImm(0)                                // tfe
     815           0 :       .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
     816             : 
     817           0 :   moveInstsAfter(MIB, CI.InstsToMove);
     818             : 
     819           0 :   MachineBasicBlock::iterator Next = std::next(CI.I);
     820           0 :   CI.I->eraseFromParent();
     821           0 :   CI.Paired->eraseFromParent();
     822           0 :   return Next;
     823             : }
     824             : 
     825             : // Scan through looking for adjacent LDS operations with constant offsets from
     826             : // the same base register. We rely on the scheduler to do the hard work of
     827             : // clustering nearby loads, and assume these are all adjacent.
     828       21467 : bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
     829             :   bool Modified = false;
     830             : 
     831      460485 :   for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
     832             :     MachineInstr &MI = *I;
     833             : 
     834             :     // Don't combine if volatile.
     835      439018 :     if (MI.hasOrderedMemoryRef()) {
     836             :       ++I;
     837       32479 :       continue;
     838             :     }
     839             : 
     840             :     CombineInfo CI;
     841      423785 :     CI.I = I;
     842      423785 :     unsigned Opc = MI.getOpcode();
     843      423785 :     if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
     844      422327 :         Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
     845             : 
     846        1797 :       CI.InstClass = DS_READ_WRITE;
     847        1797 :       CI.EltSize =
     848        1797 :         (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
     849             : 
     850        1797 :       if (findMatchingInst(CI)) {
     851             :         Modified = true;
     852         966 :         I = mergeRead2Pair(CI);
     853             :       } else {
     854             :         ++I;
     855             :       }
     856             : 
     857        1797 :       continue;
     858      421988 :     } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
     859      840242 :                Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
     860      420121 :                Opc == AMDGPU::DS_WRITE_B64_gfx9) {
     861        2636 :       CI.InstClass = DS_READ_WRITE;
     862             :       CI.EltSize
     863        2636 :         = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
     864             : 
     865        2636 :       if (findMatchingInst(CI)) {
     866             :         Modified = true;
     867        2064 :         I = mergeWrite2Pair(CI);
     868             :       } else {
     869             :         ++I;
     870             :       }
     871             : 
     872        2636 :       continue;
     873             :     }
     874      838704 :     if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
     875      419352 :         Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
     876             :       // EltSize is in units of the offset encoding.
     877         567 :       CI.InstClass = S_BUFFER_LOAD_IMM;
     878         567 :       CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
     879         567 :       CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
     880         567 :       if (findMatchingInst(CI)) {
     881             :         Modified = true;
     882         187 :         I = mergeSBufferLoadImmPair(CI);
     883         187 :         if (!CI.IsX2)
     884         145 :           CreatedX2++;
     885             :       } else {
     886             :         ++I;
     887             :       }
     888         567 :       continue;
     889             :     }
     890      837570 :     if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
     891      418785 :         Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
     892      833164 :         Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
     893      416582 :         Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
     894        3631 :       if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
     895             :           Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
     896        2203 :         CI.InstClass = BUFFER_LOAD_OFFEN;
     897             :       else
     898        1428 :         CI.InstClass = BUFFER_LOAD_OFFSET;
     899             : 
     900        3631 :       CI.EltSize = 4;
     901        3631 :       CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
     902        3631 :                 Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
     903        3631 :       if (findMatchingInst(CI)) {
     904             :         Modified = true;
     905          92 :         I = mergeBufferLoadPair(CI);
     906          92 :         if (!CI.IsX2)
     907          59 :           CreatedX2++;
     908             :       } else {
     909             :         ++I;
     910             :       }
     911        3631 :       continue;
     912             :     }
     913             : 
     914             :     bool StoreIsX2, IsOffen;
     915      415154 :     if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
     916        8615 :       CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
     917        8615 :       CI.EltSize = 4;
     918        8615 :       CI.IsX2 = StoreIsX2;
     919        8615 :       if (findMatchingInst(CI)) {
     920             :         Modified = true;
     921          66 :         I = mergeBufferStorePair(CI);
     922          66 :         if (!CI.IsX2)
     923          48 :           CreatedX2++;
     924             :       } else {
     925             :         ++I;
     926             :       }
     927        8615 :       continue;
     928             :     }
     929             : 
     930             :     ++I;
     931             :   }
     932             : 
     933       21467 :   return Modified;
     934             : }
     935             : 
     936       19240 : bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
     937       19240 :   if (skipFunction(MF.getFunction()))
     938             :     return false;
     939             : 
     940       19237 :   STM = &MF.getSubtarget<GCNSubtarget>();
     941       19237 :   if (!STM->loadStoreOptEnabled())
     942             :     return false;
     943             : 
     944       19236 :   TII = STM->getInstrInfo();
     945       19236 :   TRI = &TII->getRegisterInfo();
     946             : 
     947       19236 :   MRI = &MF.getRegInfo();
     948       19236 :   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
     949             : 
     950             :   assert(MRI->isSSA() && "Must be run on SSA");
     951             : 
     952             :   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
     953             : 
     954             :   bool Modified = false;
     955             : 
     956       40643 :   for (MachineBasicBlock &MBB : MF) {
     957       21407 :     CreatedX2 = 0;
     958       21407 :     Modified |= optimizeBlock(MBB);
     959             : 
     960             :     // Run again to convert x2 to x4.
     961       21407 :     if (CreatedX2 >= 1)
     962          60 :       Modified |= optimizeBlock(MBB);
     963             :   }
     964             : 
     965             :   return Modified;
     966             : }

Generated by: LCOV version 1.13