LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SILoadStoreOptimizer.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2018-06-17 00:07:59
                 Hit    Total   Coverage
Lines:           393      395     99.5 %
Functions:        21       23     91.3 %
Legend: Lines: hit | not hit

          Line data    Source code
       1             : //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // This pass tries to fuse DS instructions with nearby immediate offsets.
      11             : // This will fuse operations such as
      12             : //  ds_read_b32 v0, v2 offset:16
      13             : //  ds_read_b32 v1, v2 offset:32
      14             : // ==>
      15             : //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
      16             : //
      17             : // The same is done for certain SMEM and VMEM opcodes, e.g.:
      18             : //  s_buffer_load_dword s4, s[0:3], 4
      19             : //  s_buffer_load_dword s5, s[0:3], 8
      20             : // ==>
      21             : //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
      22             : //
      23             : //
      24             : // Future improvements:
      25             : //
      26             : // - This currently relies on the scheduler to place loads and stores next to
      27             : //   each other, and then only merges adjacent pairs of instructions. It would
      28             : //   be good to be more flexible with interleaved instructions, and possibly run
      29             : //   before scheduling. It currently misses stores of constants because loading
      30             : //   the constant into the data register is placed between the stores, although
      31             : //   this is arguably a scheduling problem.
      32             : //
      33             : // - Recomputing live intervals seems inefficient. This currently matches only
      34             : //   one pair, recomputes live intervals, and then moves on to the next pair. It
      35             : //   would be better to compute a list of all merges that need to occur.
      36             : //
      37             : // - With a list of instructions to process, we can also merge more. If a
      38             : //   cluster of loads has offsets that are too large to fit in the 8-bit
      39             : //   offset fields, but whose differences do fit in 8 bits, we can add to the
      40             : //   base pointer and use the new, reduced offsets.
      41             : //
      42             : //===----------------------------------------------------------------------===//
      43             : 
      44             : #include "AMDGPU.h"
      45             : #include "AMDGPUSubtarget.h"
      46             : #include "SIInstrInfo.h"
      47             : #include "SIRegisterInfo.h"
      48             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      49             : #include "Utils/AMDGPUBaseInfo.h"
      50             : #include "llvm/ADT/ArrayRef.h"
      51             : #include "llvm/ADT/SmallVector.h"
      52             : #include "llvm/ADT/StringRef.h"
      53             : #include "llvm/Analysis/AliasAnalysis.h"
      54             : #include "llvm/CodeGen/MachineBasicBlock.h"
      55             : #include "llvm/CodeGen/MachineFunction.h"
      56             : #include "llvm/CodeGen/MachineFunctionPass.h"
      57             : #include "llvm/CodeGen/MachineInstr.h"
      58             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      59             : #include "llvm/CodeGen/MachineOperand.h"
      60             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      61             : #include "llvm/IR/DebugLoc.h"
      62             : #include "llvm/Pass.h"
      63             : #include "llvm/Support/Debug.h"
      64             : #include "llvm/Support/MathExtras.h"
      65             : #include "llvm/Support/raw_ostream.h"
      66             : #include <algorithm>
      67             : #include <cassert>
      68             : #include <cstdlib>
      69             : #include <iterator>
      70             : #include <utility>
      71             : 
      72             : using namespace llvm;
      73             : 
      74             : #define DEBUG_TYPE "si-load-store-opt"
      75             : 
      76             : namespace {
      77             : 
      78        1746 : class SILoadStoreOptimizer : public MachineFunctionPass {
      79             :   enum InstClassEnum {
      80             :     DS_READ_WRITE,
      81             :     S_BUFFER_LOAD_IMM,
      82             :     BUFFER_LOAD_OFFEN,
      83             :     BUFFER_LOAD_OFFSET,
      84             :     BUFFER_STORE_OFFEN,
      85             :     BUFFER_STORE_OFFSET,
      86             :   };
      87             : 
      88             :   struct CombineInfo {
      89             :     MachineBasicBlock::iterator I;
      90             :     MachineBasicBlock::iterator Paired;
      91             :     unsigned EltSize;
      92             :     unsigned Offset0;
      93             :     unsigned Offset1;
      94             :     unsigned BaseOff;
      95             :     InstClassEnum InstClass;
      96             :     bool GLC0;
      97             :     bool GLC1;
      98             :     bool SLC0;
      99             :     bool SLC1;
     100             :     bool UseST64;
     101             :     bool IsX2;
     102             :     SmallVector<MachineInstr*, 8> InstsToMove;
     103             :   };
     104             : 
     105             : private:
     106             :   const SISubtarget *STM = nullptr;
     107             :   const SIInstrInfo *TII = nullptr;
     108             :   const SIRegisterInfo *TRI = nullptr;
     109             :   MachineRegisterInfo *MRI = nullptr;
     110             :   AliasAnalysis *AA = nullptr;
     111             :   unsigned CreatedX2;
     112             : 
     113             :   static bool offsetsCanBeCombined(CombineInfo &CI);
     114             : 
     115             :   bool findMatchingInst(CombineInfo &CI);
     116             : 
     117             :   unsigned read2Opcode(unsigned EltSize) const;
     118             :   unsigned read2ST64Opcode(unsigned EltSize) const;
     119             :   MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
     120             : 
     121             :   unsigned write2Opcode(unsigned EltSize) const;
     122             :   unsigned write2ST64Opcode(unsigned EltSize) const;
     123             :   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
     124             :   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
     125             :   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
     126             :   unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
     127             :                                     bool &IsOffen) const;
     128             :   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
     129             : 
     130             : public:
     131             :   static char ID;
     132             : 
     133        1754 :   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
     134        1754 :     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
     135        1754 :   }
     136             : 
     137             :   bool optimizeBlock(MachineBasicBlock &MBB);
     138             : 
     139             :   bool runOnMachineFunction(MachineFunction &MF) override;
     140             : 
     141        1742 :   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
     142             : 
     143        1742 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     144        1742 :     AU.setPreservesCFG();
     145             :     AU.addRequired<AAResultsWrapperPass>();
     146             : 
     147        1742 :     MachineFunctionPass::getAnalysisUsage(AU);
     148        1742 :   }
     149             : };
     150             : 
     151             : } // end anonymous namespace.
     152             : 
     153       76336 : INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
     154             :                       "SI Load Store Optimizer", false, false)
     155       76336 : INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
     156      360592 : INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
     157             :                     "SI Load Store Optimizer", false, false)
     158             : 
     159             : char SILoadStoreOptimizer::ID = 0;
     160             : 
     161             : char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
     162             : 
     163           0 : FunctionPass *llvm::createSILoadStoreOptimizerPass() {
     164           0 :   return new SILoadStoreOptimizer();
     165             : }
     166             : 
     167        3264 : static void moveInstsAfter(MachineBasicBlock::iterator I,
     168             :                            ArrayRef<MachineInstr*> InstsToMove) {
     169        3264 :   MachineBasicBlock *MBB = I->getParent();
     170             :   ++I;
     171        5378 :   for (MachineInstr *MI : InstsToMove) {
     172        1057 :     MI->removeFromParent();
     173             :     MBB->insert(I, MI);
     174             :   }
     175        3264 : }
     176             : 
     177        7924 : static void addDefsUsesToList(const MachineInstr &MI,
     178             :                               DenseSet<unsigned> &RegDefs,
     179             :                               DenseSet<unsigned> &PhysRegUses) {
     180      101382 :   for (const MachineOperand &Op : MI.operands()) {
     181       46729 :     if (Op.isReg()) {
     182       27145 :       if (Op.isDef())
     183        8974 :         RegDefs.insert(Op.getReg());
     184       22655 :       else if (Op.readsReg() &&
     185       22655 :                TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
     186       18402 :         PhysRegUses.insert(Op.getReg());
     187             :     }
     188             :   }
     189        7924 : }
     190             : 
     191       16876 : static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
     192             :                                       MachineBasicBlock::iterator B,
     193             :                                       const SIInstrInfo *TII,
     194             :                                       AliasAnalysis * AA) {
     195             :   // RAW or WAR - cannot reorder
     196             :   // WAW - cannot reorder
     197             :   // RAR - safe to reorder
     198       19342 :   return !(A->mayStore() || B->mayStore()) ||
     199       19342 :     TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
     200             : }
     201             : 
     202             : // Add MI and its defs to the lists if MI reads one of the defs that are
     203             : // already in the list. Returns true in that case.
     204             : static bool
     205       61779 : addToListsIfDependent(MachineInstr &MI,
     206             :                       DenseSet<unsigned> &RegDefs,
     207             :                       DenseSet<unsigned> &PhysRegUses,
     208             :                       SmallVectorImpl<MachineInstr*> &Insts) {
     209      681195 :   for (MachineOperand &Use : MI.operands()) {
     210             :     // If one of the defs is read, then there is a use of Def between I and the
     211             :     // instruction that I will potentially be merged with. We will need to move
     212             :     // this instruction after the merged instructions.
     213             :     //
     214             :     // Similarly, if there is a def which is read by an instruction that is to
     215             :     // be moved for merging, then we need to move the def-instruction as well.
     216             :     // This can only happen for physical registers such as M0; virtual
     217             :     // registers are in SSA form.
     218      312429 :     if (Use.isReg() &&
     219      633351 :         ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
     220      115128 :          (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
     221      313221 :           PhysRegUses.count(Use.getReg())))) {
     222        2721 :       Insts.push_back(&MI);
     223        2721 :       addDefsUsesToList(MI, RegDefs, PhysRegUses);
     224        2721 :       return true;
     225             :     }
     226             :   }
     227             : 
     228             :   return false;
     229             : }
     230             : 
     231             : static bool
     232       19308 : canMoveInstsAcrossMemOp(MachineInstr &MemOp,
     233             :                         ArrayRef<MachineInstr*> InstsToMove,
     234             :                         const SIInstrInfo *TII,
     235             :                         AliasAnalysis *AA) {
     236             :   assert(MemOp.mayLoadOrStore());
     237             : 
     238       69696 :   for (MachineInstr *InstToMove : InstsToMove) {
     239       25241 :     if (!InstToMove->mayLoadOrStore())
     240       25178 :       continue;
     241          63 :     if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
     242             :         return false;
     243             :   }
     244             :   return true;
     245             : }
     246             : 
     247        6986 : bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
     248             :   // XXX - Would the same offset be OK? Is there any reason this would happen or
     249             :   // be useful?
     250        6986 :   if (CI.Offset0 == CI.Offset1)
     251             :     return false;
     252             : 
     253             :   // This won't be valid if the offset isn't aligned.
     254        6962 :   if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
     255             :     return false;
     256             : 
     257        6962 :   unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
     258        6962 :   unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
     259        6962 :   CI.UseST64 = false;
     260        6962 :   CI.BaseOff = 0;
     261             : 
     262             :   // Handle SMEM and VMEM instructions.
     263        6962 :   if (CI.InstClass != DS_READ_WRITE) {
     264        3984 :     unsigned Diff = CI.IsX2 ? 2 : 1;
     265        7663 :     return (EltOffset0 + Diff == EltOffset1 ||
     266        3984 :             EltOffset1 + Diff == EltOffset0) &&
     267        4590 :            CI.GLC0 == CI.GLC1 &&
     268         114 :            (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
     269             :   }
     270             : 
     271             :   // If the offset in elements doesn't fit in 8 bits, we might be able to use
     272             :   // the stride 64 versions.
     273        3281 :   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
     274        3062 :       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
     275          42 :     CI.Offset0 = EltOffset0 / 64;
     276          42 :     CI.Offset1 = EltOffset1 / 64;
     277          42 :     CI.UseST64 = true;
     278          42 :     return true;
     279             :   }
     280             : 
     281             :   // Check if the new offsets fit in the reduced 8-bit range.
     282        2936 :   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
     283        2871 :     CI.Offset0 = EltOffset0;
     284        2871 :     CI.Offset1 = EltOffset1;
     285        2871 :     return true;
     286             :   }
     287             : 
     288             :   // Try to shift base address to decrease offsets.
     289          65 :   unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
     290         130 :   CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
     291             : 
     292          65 :   if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
     293          24 :     CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
     294          24 :     CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
     295          24 :     CI.UseST64 = true;
     296          24 :     return true;
     297             :   }
     298             : 
     299          41 :   if (isUInt<8>(OffsetDiff)) {
     300          32 :     CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
     301          32 :     CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
     302          32 :     return true;
     303             :   }
     304             : 
     305             :   return false;
     306             : }
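
The offset arithmetic above is easiest to see in isolation. Below is an editor's standalone sketch of just the DS_READ_WRITE path; the helper name and the simplified in/out parameters are illustrative and not part of the instrumented source.

    // Sketch of the DS path of offsetsCanBeCombined(): Off0/Off1 are byte
    // offsets, EltSize is 4 or 8. On success, Enc0/Enc1 hold the values that
    // end up in the 8-bit offset0/offset1 fields of the merged instruction.
    static bool combineDSOffsets(unsigned Off0, unsigned Off1, unsigned EltSize,
                                 unsigned &Enc0, unsigned &Enc1, bool &UseST64) {
      if (Off0 == Off1 || Off0 % EltSize != 0 || Off1 % EltSize != 0)
        return false;
      unsigned Elt0 = Off0 / EltSize, Elt1 = Off1 / EltSize;

      // Stride-64 forms (ds_read2st64_* / ds_write2st64_*): both element
      // offsets are multiples of 64 and fit in 8 bits after dividing by 64.
      if (Elt0 % 64 == 0 && Elt1 % 64 == 0 && Elt0 / 64 < 256 && Elt1 / 64 < 256) {
        Enc0 = Elt0 / 64;
        Enc1 = Elt1 / 64;
        UseST64 = true;
        return true;
      }
      // Plain forms: both element offsets already fit in 8 bits.
      if (Elt0 < 256 && Elt1 < 256) {
        Enc0 = Elt0;
        Enc1 = Elt1;
        UseST64 = false;
        return true;
      }
      // Base adjustment: materialize BaseOff = min(Off0, Off1) into a new base
      // register, so only the difference has to be encoded (optionally / 64).
      unsigned BaseOff = (Off0 < Off1) ? Off0 : Off1;
      unsigned Diff = (Elt0 < Elt1) ? Elt1 - Elt0 : Elt0 - Elt1;
      if (Diff % 64 == 0 && Diff / 64 < 256) {
        Enc0 = (Elt0 - BaseOff / EltSize) / 64;
        Enc1 = (Elt1 - BaseOff / EltSize) / 64;
        UseST64 = true;
        return true;
      }
      if (Diff < 256) {
        Enc0 = Elt0 - BaseOff / EltSize;
        Enc1 = Elt1 - BaseOff / EltSize;
        UseST64 = false;
        return true;
      }
      return false;
    }

    // E.g. the header example: byte offsets 16 and 32 with EltSize = 4 give
    // element offsets 4 and 8, so the plain form is chosen and the merged
    // ds_read2_b32 prints offset0:4 offset1:8.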
     307             : 
     308       16955 : bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     309       16955 :   MachineBasicBlock *MBB = CI.I->getParent();
     310             :   MachineBasicBlock::iterator E = MBB->end();
     311       16955 :   MachineBasicBlock::iterator MBBI = CI.I;
     312             : 
     313       16955 :   unsigned AddrOpName[3] = {0};
     314             :   int AddrIdx[3];
     315             :   const MachineOperand *AddrReg[3];
     316             :   unsigned NumAddresses = 0;
     317             : 
     318       16955 :   switch (CI.InstClass) {
     319        4358 :   case DS_READ_WRITE:
     320        4358 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
     321        4358 :     break;
     322         553 :   case S_BUFFER_LOAD_IMM:
     323         553 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
     324         553 :     break;
     325        5498 :   case BUFFER_LOAD_OFFEN:
     326             :   case BUFFER_STORE_OFFEN:
     327        5498 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
     328        5498 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
     329        5498 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
     330        5498 :     break;
     331        6546 :   case BUFFER_LOAD_OFFSET:
     332             :   case BUFFER_STORE_OFFSET:
     333        6546 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
     334        6546 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
     335        6546 :     break;
     336             :   }
     337             : 
     338       27335 :   for (unsigned i = 0; i < NumAddresses; i++) {
     339       53172 :     AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
     340       35448 :     AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
     341             : 
     342             :     // We only ever merge operations with the same base address register, so don't
     343             :     // bother scanning forward if there are no other uses.
     344       34803 :     if (AddrReg[i]->isReg() &&
     345       28711 :         (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
     346       11632 :          MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
     347             :       return false;
     348             :   }
     349             : 
     350             :   ++MBBI;
     351             : 
     352             :   DenseSet<unsigned> RegDefsToMove;
     353             :   DenseSet<unsigned> PhysRegUsesToMove;
     354        4421 :   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
     355             : 
     356       63684 :   for ( ; MBBI != E; ++MBBI) {
     357      242567 :     if (MBBI->getOpcode() != CI.I->getOpcode()) {
     358             :       // This is not a matching DS instruction, but we can keep looking as
     359             :       // long as one of these conditions is met:
     360             :       // 1. It is safe to move I down past MBBI.
     361             :       // 2. It is safe to move MBBI down past the instruction that I will
     362             :       //    be merged into.
     363             : 
     364       55558 :       if (MBBI->hasUnmodeledSideEffects()) {
     365             :         // We can't re-order this instruction with respect to other memory
     366             :         // operations, so we fail both conditions mentioned above.
     367             :         return false;
     368             :       }
     369             : 
     370       69356 :       if (MBBI->mayLoadOrStore() &&
     371       93941 :         (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
     372       80125 :          !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
     373             :         // We fail condition #1, but we may still be able to satisfy condition
     374             :         // #2.  Add this instruction to the move list and then we will check
     375             :         // if condition #2 holds once we have selected the matching instruction.
     376        1564 :         CI.InstsToMove.push_back(&*MBBI);
     377         782 :         addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
     378         782 :         continue;
     379             :       }
     380             : 
     381             :       // When we match I with another DS instruction, we will be moving I down
     382             :       // to the location of the matched instruction; any uses of I will need to
     383             :       // be moved down as well.
     384      109462 :       addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
     385             :                             CI.InstsToMove);
     386       54731 :       continue;
     387             :     }
     388             : 
     389             :     // Don't merge volatiles.
     390        7054 :     if (MBBI->hasOrderedMemoryRef())
     391             :       return false;
     392             : 
     393             :     // Handle a case like
     394             :     //   DS_WRITE_B32 addr, v, idx0
     395             :     //   w = DS_READ_B32 addr, idx0
     396             :     //   DS_WRITE_B32 addr, f(w), idx1
     397             :     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
     398             :     // merging of the two writes.
     399       14096 :     if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
     400             :                               CI.InstsToMove))
     401          32 :       continue;
     402             : 
     403             :     bool Match = true;
     404       23144 :     for (unsigned i = 0; i < NumAddresses; i++) {
     405        8094 :       const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
     406             : 
     407       24282 :       if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
     408        2020 :         if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
     409        1010 :             AddrReg[i]->getImm() != AddrRegNext.getImm()) {
     410             :           Match = false;
     411             :           break;
     412             :         }
     413        1010 :         continue;
     414             :       }
     415             : 
     416             :       // Check same base pointer. Be careful of subregisters, which can occur with
     417             :       // vectors of pointers.
     418       14138 :       if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
     419             :           AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
     420             :         Match = false;
     421             :         break;
     422             :       }
     423             :     }
     424             : 
     425        7016 :     if (Match) {
     426       13972 :       int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
     427             :                                                  AMDGPU::OpName::offset);
     428       13972 :       CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
     429       13972 :       CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
     430        6986 :       CI.Paired = MBBI;
     431             : 
     432        6986 :       if (CI.InstClass == DS_READ_WRITE) {
     433        2994 :         CI.Offset0 &= 0xffff;
     434        2994 :         CI.Offset1 &= 0xffff;
     435             :       } else {
     436        3992 :         CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
     437        3992 :         CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
     438        3992 :         if (CI.InstClass != S_BUFFER_LOAD_IMM) {
     439        1010 :           CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
     440        1010 :           CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
     441             :         }
     442             :       }
     443             : 
     444             :       // Check both offsets fit in the reduced range.
     445             :       // We also need to go through the list of instructions that we plan to
     446             :       // move and make sure they are all safe to move down past the merged
     447             :       // instruction.
     448        6986 :       if (offsetsCanBeCombined(CI))
     449        6540 :         if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
     450             :           return true;
     451             :     }
     452             : 
     453             :     // We've found a load/store that we couldn't merge for some reason.
     454             :     // We could potentially keep looking, but we'd need to make sure that
     455             :     // it was safe to move I and also all the instructions in InstsToMove
     456             :     // down past this instruction.
     457             :     // Check if we can move I across MBBI and if we can move all of I's users.
     458       14988 :     if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
     459       11216 :         !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
     460             :       break;
     461             :   }
     462             :   return false;
     463             : }
     464             : 
     465             : unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
     466         896 :   if (STM->ldsRequiresM0Init())
     467         713 :     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
     468         183 :   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
     469             : }
     470             : 
     471             : unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
     472          38 :   if (STM->ldsRequiresM0Init())
     473          22 :     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
     474             : 
     475          16 :   return (EltSize == 4) ?
     476             :     AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
     477             : }
     478             : 
     479         934 : MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
     480             :   CombineInfo &CI) {
     481         934 :   MachineBasicBlock *MBB = CI.I->getParent();
     482             : 
     483             :   // Be careful, since the addresses could be subregisters themselves in weird
     484             :   // cases, like vectors of pointers.
     485         934 :   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
     486             : 
     487         934 :   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
     488         934 :   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
     489             : 
     490         934 :   unsigned NewOffset0 = CI.Offset0;
     491         934 :   unsigned NewOffset1 = CI.Offset1;
     492        1868 :   unsigned Opc = CI.UseST64 ?
     493         934 :     read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
     494             : 
     495         934 :   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
     496         934 :   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
     497             : 
     498         934 :   if (NewOffset0 > NewOffset1) {
     499             :     // Canonicalize the merged instruction so the smaller offset comes first.
     500             :     std::swap(NewOffset0, NewOffset1);
     501             :     std::swap(SubRegIdx0, SubRegIdx1);
     502             :   }
     503             : 
     504             :   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
     505             :          (NewOffset0 != NewOffset1) &&
     506             :          "Computed offset doesn't fit");
     507             : 
     508         934 :   const MCInstrDesc &Read2Desc = TII->get(Opc);
     509             : 
     510             :   const TargetRegisterClass *SuperRC
     511         934 :     = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
     512        1868 :   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
     513             : 
     514             :   DebugLoc DL = CI.I->getDebugLoc();
     515             : 
     516         934 :   unsigned BaseReg = AddrReg->getReg();
     517             :   unsigned BaseRegFlags = 0;
     518         934 :   if (CI.BaseOff) {
     519          56 :     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     520          84 :     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
     521          28 :       .addImm(CI.BaseOff);
     522             : 
     523          56 :     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     524             :     BaseRegFlags = RegState::Kill;
     525             : 
     526          56 :     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
     527          28 :       .addReg(ImmReg)
     528          28 :       .addReg(AddrReg->getReg());
     529             :   }
     530             : 
     531             :   MachineInstrBuilder Read2 =
     532        1868 :     BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
     533         934 :       .addReg(BaseReg, BaseRegFlags) // addr
     534         934 :       .addImm(NewOffset0)            // offset0
     535         934 :       .addImm(NewOffset1)            // offset1
     536             :       .addImm(0)                     // gds
     537         934 :       .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
     538             : 
     539             :   (void)Read2;
     540             : 
     541         934 :   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
     542             : 
     543             :   // Copy to the old destination registers.
     544        1868 :   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     545             :       .add(*Dest0) // Copy to same destination including flags and sub reg.
     546         934 :       .addReg(DestReg, 0, SubRegIdx0);
     547        1868 :   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     548             :                             .add(*Dest1)
     549         934 :                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
     550             : 
     551         934 :   moveInstsAfter(Copy1, CI.InstsToMove);
     552             : 
     553             :   MachineBasicBlock::iterator Next = std::next(CI.I);
     554         934 :   CI.I->eraseFromParent();
     555         934 :   CI.Paired->eraseFromParent();
     556             : 
     557             :   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
     558        1868 :   return Next;
     559             : }
     560             : 
     561             : unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
     562        2001 :   if (STM->ldsRequiresM0Init())
     563        1403 :     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
     564         598 :   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
     565             : }
     566             : 
     567             : unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
     568          28 :   if (STM->ldsRequiresM0Init())
     569          14 :     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
     570             : 
     571          14 :   return (EltSize == 4) ?
     572             :     AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
     573             : }
     574             : 
     575        2029 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
     576             :   CombineInfo &CI) {
     577        2029 :   MachineBasicBlock *MBB = CI.I->getParent();
     578             : 
     579             :   // Be sure to use .add(), and not .addReg() with these. We want to be
     580             :   // sure we preserve the subregister index and any register flags set on them.
     581        2029 :   const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
     582        2029 :   const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
     583             :   const MachineOperand *Data1
     584        2029 :     = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
     585             : 
     586        2029 :   unsigned NewOffset0 = CI.Offset0;
     587        2029 :   unsigned NewOffset1 = CI.Offset1;
     588        4058 :   unsigned Opc = CI.UseST64 ?
     589        2029 :     write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
     590             : 
     591        2029 :   if (NewOffset0 > NewOffset1) {
     592             :     // Canonicalize the merged instruction so the smaller offset comes first.
     593             :     std::swap(NewOffset0, NewOffset1);
     594             :     std::swap(Data0, Data1);
     595             :   }
     596             : 
     597             :   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
     598             :          (NewOffset0 != NewOffset1) &&
     599             :          "Computed offset doesn't fit");
     600             : 
     601        2029 :   const MCInstrDesc &Write2Desc = TII->get(Opc);
     602             :   DebugLoc DL = CI.I->getDebugLoc();
     603             : 
     604        2029 :   unsigned BaseReg = AddrReg->getReg();
     605             :   unsigned BaseRegFlags = 0;
     606        2029 :   if (CI.BaseOff) {
     607          56 :     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     608          84 :     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
     609          28 :       .addImm(CI.BaseOff);
     610             : 
     611          56 :     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     612             :     BaseRegFlags = RegState::Kill;
     613             : 
     614          56 :     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
     615          28 :       .addReg(ImmReg)
     616          28 :       .addReg(AddrReg->getReg());
     617             :   }
     618             : 
     619             :   MachineInstrBuilder Write2 =
     620        4058 :     BuildMI(*MBB, CI.Paired, DL, Write2Desc)
     621        2029 :       .addReg(BaseReg, BaseRegFlags) // addr
     622             :       .add(*Data0)                   // data0
     623             :       .add(*Data1)                   // data1
     624        2029 :       .addImm(NewOffset0)            // offset0
     625        2029 :       .addImm(NewOffset1)            // offset1
     626             :       .addImm(0)                     // gds
     627        4058 :       .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
     628             : 
     629        2029 :   moveInstsAfter(Write2, CI.InstsToMove);
     630             : 
     631             :   MachineBasicBlock::iterator Next = std::next(CI.I);
     632        2029 :   CI.I->eraseFromParent();
     633        2029 :   CI.Paired->eraseFromParent();
     634             : 
     635             :   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
     636        4058 :   return Next;
     637             : }
     638             : 
     639         187 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
     640             :   CombineInfo &CI) {
     641         187 :   MachineBasicBlock *MBB = CI.I->getParent();
     642             :   DebugLoc DL = CI.I->getDebugLoc();
     643         187 :   unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
     644             :                               AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
     645             : 
     646             :   const TargetRegisterClass *SuperRC =
     647         187 :     CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
     648         374 :   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
     649         374 :   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
     650             : 
     651         374 :   BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
     652         374 :       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
     653         187 :       .addImm(MergedOffset) // offset
     654         187 :       .addImm(CI.GLC0)      // glc
     655         187 :       .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
     656             : 
     657         187 :   unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
     658         187 :   unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
     659             : 
     660             :   // Handle descending offsets
     661         187 :   if (CI.Offset0 > CI.Offset1)
     662             :     std::swap(SubRegIdx0, SubRegIdx1);
     663             : 
     664             :   // Copy to the old destination registers.
     665         187 :   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
     666         187 :   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
     667         187 :   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
     668             : 
     669         374 :   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     670             :       .add(*Dest0) // Copy to same destination including flags and sub reg.
     671         187 :       .addReg(DestReg, 0, SubRegIdx0);
     672         374 :   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     673             :                             .add(*Dest1)
     674         187 :                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
     675             : 
     676         187 :   moveInstsAfter(Copy1, CI.InstsToMove);
     677             : 
     678             :   MachineBasicBlock::iterator Next = std::next(CI.I);
     679         187 :   CI.I->eraseFromParent();
     680         187 :   CI.Paired->eraseFromParent();
     681         374 :   return Next;
     682             : }
     683             : 
     684          74 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
     685             :   CombineInfo &CI) {
     686          74 :   MachineBasicBlock *MBB = CI.I->getParent();
     687             :   DebugLoc DL = CI.I->getDebugLoc();
     688             :   unsigned Opcode;
     689             : 
     690          74 :   if (CI.InstClass == BUFFER_LOAD_OFFEN) {
     691          32 :     Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
     692             :                        AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
     693             :   } else {
     694          42 :     Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
     695             :                        AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
     696             :   }
     697             : 
     698             :   const TargetRegisterClass *SuperRC =
     699          74 :     CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
     700         148 :   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
     701         148 :   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
     702             : 
     703         148 :   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
     704             : 
     705          74 :   if (CI.InstClass == BUFFER_LOAD_OFFEN)
     706          64 :       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
     707             : 
     708         148 :   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
     709         148 :       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
     710          74 :       .addImm(MergedOffset) // offset
     711          74 :       .addImm(CI.GLC0)      // glc
     712          74 :       .addImm(CI.SLC0)      // slc
     713             :       .addImm(0)            // tfe
     714          74 :       .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
     715             : 
     716          74 :   unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
     717          74 :   unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
     718             : 
     719             :   // Handle descending offsets
     720          74 :   if (CI.Offset0 > CI.Offset1)
     721             :     std::swap(SubRegIdx0, SubRegIdx1);
     722             : 
     723             :   // Copy to the old destination registers.
     724          74 :   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
     725          74 :   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
     726          74 :   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
     727             : 
     728         148 :   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     729             :       .add(*Dest0) // Copy to same destination including flags and sub reg.
     730          74 :       .addReg(DestReg, 0, SubRegIdx0);
     731         148 :   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     732             :                             .add(*Dest1)
     733          74 :                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
     734             : 
     735          74 :   moveInstsAfter(Copy1, CI.InstsToMove);
     736             : 
     737             :   MachineBasicBlock::iterator Next = std::next(CI.I);
     738          74 :   CI.I->eraseFromParent();
     739          74 :   CI.Paired->eraseFromParent();
     740         148 :   return Next;
     741             : }
     742             : 
     743      372799 : unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
     744             :   const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
     745      372799 :   IsX2 = false;
     746      372799 :   IsOffen = false;
     747             : 
     748      745598 :   switch (I.getOpcode()) {
     749        3275 :   case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
     750        3275 :     IsOffen = true;
     751             :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
     752          37 :   case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
     753          37 :     IsOffen = true;
     754             :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
     755          30 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
     756          30 :     IsX2 = true;
     757          30 :     IsOffen = true;
     758             :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
     759          28 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
     760          28 :     IsX2 = true;
     761          28 :     IsOffen = true;
     762             :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
     763             :   case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
     764             :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
     765          19 :   case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
     766             :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
     767        1668 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
     768        1668 :     IsX2 = true;
     769             :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
     770          16 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
     771          16 :     IsX2 = true;
     772             :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
     773             :   }
     774             :   return 0;
     775             : }
     776             : 
     777          40 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
     778             :   CombineInfo &CI) {
     779          40 :   MachineBasicBlock *MBB = CI.I->getParent();
     780             :   DebugLoc DL = CI.I->getDebugLoc();
     781             :   bool Unused1, Unused2;
     782          40 :   unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);
     783             : 
     784          40 :   unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
     785          40 :   unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
     786             : 
     787             :   // Handle descending offsets
     788          40 :   if (CI.Offset0 > CI.Offset1)
     789             :     std::swap(SubRegIdx0, SubRegIdx1);
     790             : 
     791             :   // Copy to the new source register.
     792             :   const TargetRegisterClass *SuperRC =
     793          40 :     CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
     794          80 :   unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
     795             : 
     796          80 :   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
     797          40 :   const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
     798             : 
     799          80 :   BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
     800             :       .add(*Src0)
     801          40 :       .addImm(SubRegIdx0)
     802             :       .add(*Src1)
     803          40 :       .addImm(SubRegIdx1);
     804             : 
     805         120 :   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
     806          40 :       .addReg(SrcReg, RegState::Kill);
     807             : 
     808          40 :   if (CI.InstClass == BUFFER_STORE_OFFEN)
     809          32 :       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
     810             : 
     811          80 :   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
     812          80 :       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
     813          80 :       .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
     814          40 :       .addImm(CI.GLC0)      // glc
     815          40 :       .addImm(CI.SLC0)      // slc
     816             :       .addImm(0)            // tfe
     817          40 :       .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
     818             : 
     819          40 :   moveInstsAfter(MIB, CI.InstsToMove);
     820             : 
     821             :   MachineBasicBlock::iterator Next = std::next(CI.I);
     822          40 :   CI.I->eraseFromParent();
     823          40 :   CI.Paired->eraseFromParent();
     824          80 :   return Next;
     825             : }
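
For the BUFFER_STORE path, the effect of the merge above is to feed a single wider store from a REG_SEQUENCE of the two original data registers. A sketch of the OFFSET case, with illustrative registers and offsets:

      buffer_store_dword v0, off, s[0:3], 0 offset:4
      buffer_store_dword v1, off, s[0:3], 0 offset:8
    ==>
      v[2:3] = REG_SEQUENCE v0 (sub0), v1 (sub1)
      buffer_store_dwordx2 v[2:3], off, s[0:3], 0 offset:4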
     826             : 
     827             : // Scan through looking for adjacent LDS operations with constant offsets from
     828             : // the same base register. We rely on the scheduler to do the hard work of
     829             : // clustering nearby loads, and assume these are all adjacent.
     830       19825 : bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
     831             :   bool Modified = false;
     832             : 
     833      434897 :   for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
     834             :     MachineInstr &MI = *I;
     835             : 
     836             :     // Don't combine if volatile.
     837      409292 :     if (MI.hasOrderedMemoryRef()) {
     838             :       ++I;
     839       45045 :       continue;
     840             :     }
     841             : 
     842             :     CombineInfo CI;
     843      381202 :     CI.I = I;
     844      381202 :     unsigned Opc = MI.getOpcode();
     845      382969 :     if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
     846      379766 :         Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
     847             : 
     848        1767 :       CI.InstClass = DS_READ_WRITE;
     849        1767 :       CI.EltSize =
     850        1767 :         (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
     851             : 
     852        1767 :       if (findMatchingInst(CI)) {
     853             :         Modified = true;
     854         934 :         I = mergeRead2Pair(CI);
     855             :       } else {
     856             :         ++I;
     857             :       }
     858             : 
     859        1767 :       continue;
     860      382026 :     } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
     861      755208 :                Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
     862      377604 :                Opc == AMDGPU::DS_WRITE_B64_gfx9) {
     863        2591 :       CI.InstClass = DS_READ_WRITE;
     864             :       CI.EltSize
     865        2591 :         = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
     866             : 
     867        2591 :       if (findMatchingInst(CI)) {
     868             :         Modified = true;
     869        2029 :         I = mergeWrite2Pair(CI);
     870             :       } else {
     871             :         ++I;
     872             :       }
     873             : 
     874        2591 :       continue;
     875             :     }
     876      754241 :     if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
     877      376844 :         Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
     878             :       // EltSize is in units of the offset encoding.
     879         553 :       CI.InstClass = S_BUFFER_LOAD_IMM;
     880         553 :       CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
     881         553 :       CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
     882         553 :       if (findMatchingInst(CI)) {
     883             :         Modified = true;
     884         187 :         I = mergeSBufferLoadImmPair(CI);
     885         187 :         if (!CI.IsX2)
     886         145 :           CreatedX2++;
     887             :       } else {
     888             :         ++I;
     889             :       }
     890         553 :       continue;
     891             :     }
     892      756114 :     if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
     893      376291 :         Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
     894      748294 :         Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
     895      374147 :         Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
     896        3532 :       if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
     897             :           Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
     898        2144 :         CI.InstClass = BUFFER_LOAD_OFFEN;
     899             :       else
     900        1388 :         CI.InstClass = BUFFER_LOAD_OFFSET;
     901             : 
     902        3532 :       CI.EltSize = 4;
     903        3532 :       CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
     904        3532 :                 Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
     905        3532 :       if (findMatchingInst(CI)) {
     906             :         Modified = true;
     907          74 :         I = mergeBufferLoadPair(CI);
     908          74 :         if (!CI.IsX2)
     909          41 :           CreatedX2++;
     910             :       } else {
     911             :         ++I;
     912             :       }
     913        3532 :       continue;
     914             :     }
     915             : 
     916             :     bool StoreIsX2, IsOffen;
     917      381271 :     if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
     918        8512 :       CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
     919        8512 :       CI.EltSize = 4;
     920        8512 :       CI.IsX2 = StoreIsX2;
     921        8512 :       if (findMatchingInst(CI)) {
     922             :         Modified = true;
     923          40 :         I = mergeBufferStorePair(CI);
     924          40 :         if (!CI.IsX2)
     925          30 :           CreatedX2++;
     926             :       } else {
     927             :         ++I;
     928             :       }
     929        8512 :       continue;
     930             :     }
     931             : 
     932             :     ++I;
     933             :   }
     934             : 
     935       19825 :   return Modified;
     936             : }
     937             : 
     938       17667 : bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
     939       17667 :   if (skipFunction(MF.getFunction()))
     940             :     return false;
     941             : 
     942       17665 :   STM = &MF.getSubtarget<SISubtarget>();
     943       17665 :   if (!STM->loadStoreOptEnabled())
     944             :     return false;
     945             : 
     946       17664 :   TII = STM->getInstrInfo();
     947       17664 :   TRI = &TII->getRegisterInfo();
     948             : 
     949       17664 :   MRI = &MF.getRegInfo();
     950       35328 :   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
     951             : 
     952             :   assert(MRI->isSSA() && "Must be run on SSA");
     953             : 
     954             :   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
     955             : 
     956             :   bool Modified = false;
     957             : 
     958       37441 :   for (MachineBasicBlock &MBB : MF) {
     959       19777 :     CreatedX2 = 0;
     960       19777 :     Modified |= optimizeBlock(MBB);
     961             : 
     962             :     // Run again to convert x2 to x4.
     963       19777 :     if (CreatedX2 >= 1)
     964          48 :       Modified |= optimizeBlock(MBB);
     965             :   }
     966             : 
     967             :   return Modified;
     968             : }
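
For reference, the pass can be exercised in isolation on MIR input via llc's -run-pass mechanism, using the pass name registered above ("si-load-store-opt"); the input file name below is illustrative:

    llc -march=amdgcn -verify-machineinstrs -run-pass si-load-store-opt -o - merge-load-store.mir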

Generated by: LCOV version 1.13