LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SILoadStoreOptimizer.cpp (source / functions)
Test: llvm-toolchain.info                        Hit    Total    Coverage
Date: 2018-10-20 13:21:21      Lines:            289      408      70.8 %
                               Functions:         16       25      64.0 %
Legend: Lines: hit | not hit

          Line data    Source code
       1             : //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
       10             : // This pass tries to fuse DS instructions with nearby immediate offsets.
      11             : // This will fuse operations such as
      12             : //  ds_read_b32 v0, v2 offset:16
      13             : //  ds_read_b32 v1, v2 offset:32
      14             : // ==>
       15             : //  ds_read2_b32 v[0:1], v2 offset0:4 offset1:8
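                      : //  (Note: the offset0/offset1 fields of ds_read2 instructions are encoded in
                      : //  units of the element size, 4 bytes here, so byte offsets 16 and 32 become
                      : //  4 and 8.)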
      16             : //
      17             : // The same is done for certain SMEM and VMEM opcodes, e.g.:
      18             : //  s_buffer_load_dword s4, s[0:3], 4
      19             : //  s_buffer_load_dword s5, s[0:3], 8
      20             : // ==>
      21             : //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
      22             : //
      23             : //
      24             : // Future improvements:
      25             : //
      26             : // - This currently relies on the scheduler to place loads and stores next to
      27             : //   each other, and then only merges adjacent pairs of instructions. It would
      28             : //   be good to be more flexible with interleaved instructions, and possibly run
       29             : //   before scheduling. It currently misses stores of constants because loading
      30             : //   the constant into the data register is placed between the stores, although
      31             : //   this is arguably a scheduling problem.
      32             : //
       33             : // - Recomputing live intervals seems inefficient. This currently matches only
       34             : //   one pair, recomputes live intervals, and moves on to the next pair. It
      35             : //   would be better to compute a list of all merges that need to occur.
      36             : //
      37             : // - With a list of instructions to process, we can also merge more. If a
       38             : //   cluster of loads has offsets that are too large to fit in the 8-bit
       39             : //   offset fields, but are close enough together to fit in 8 bits, we can add
       40             : //   to the base pointer and use the new, reduced offsets.
      41             : //
      42             : //===----------------------------------------------------------------------===//
      43             : 
      44             : #include "AMDGPU.h"
      45             : #include "AMDGPUSubtarget.h"
      46             : #include "SIInstrInfo.h"
      47             : #include "SIRegisterInfo.h"
      48             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      49             : #include "Utils/AMDGPUBaseInfo.h"
       50             : #include "llvm/ADT/ArrayRef.h"
                      : #include "llvm/ADT/DenseSet.h"
      51             : #include "llvm/ADT/SmallVector.h"
      52             : #include "llvm/ADT/StringRef.h"
      53             : #include "llvm/Analysis/AliasAnalysis.h"
      54             : #include "llvm/CodeGen/MachineBasicBlock.h"
      55             : #include "llvm/CodeGen/MachineFunction.h"
      56             : #include "llvm/CodeGen/MachineFunctionPass.h"
      57             : #include "llvm/CodeGen/MachineInstr.h"
      58             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      59             : #include "llvm/CodeGen/MachineOperand.h"
      60             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      61             : #include "llvm/IR/DebugLoc.h"
      62             : #include "llvm/Pass.h"
      63             : #include "llvm/Support/Debug.h"
      64             : #include "llvm/Support/MathExtras.h"
      65             : #include "llvm/Support/raw_ostream.h"
      66             : #include <algorithm>
      67             : #include <cassert>
      68             : #include <cstdlib>
      69             : #include <iterator>
      70             : #include <utility>
      71             : 
      72             : using namespace llvm;
      73             : 
      74             : #define DEBUG_TYPE "si-load-store-opt"
      75             : 
      76             : namespace {
      77             : 
      78             : class SILoadStoreOptimizer : public MachineFunctionPass {
      79             :   enum InstClassEnum {
      80             :     DS_READ_WRITE,
      81             :     S_BUFFER_LOAD_IMM,
      82             :     BUFFER_LOAD_OFFEN,
      83             :     BUFFER_LOAD_OFFSET,
      84             :     BUFFER_STORE_OFFEN,
      85             :     BUFFER_STORE_OFFSET,
      86             :   };
      87             : 
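                      :   // Bookkeeping for one candidate merge: the two instructions being paired
                      :   // (I and Paired), their decoded offsets, the cache-policy bits (glc/slc)
                      :   // read from each instruction, and the instructions that must be moved below
                      :   // the merge point so the pair can be placed next to each other.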
      88             :   struct CombineInfo {
      89             :     MachineBasicBlock::iterator I;
      90             :     MachineBasicBlock::iterator Paired;
      91             :     unsigned EltSize;
      92             :     unsigned Offset0;
      93             :     unsigned Offset1;
      94             :     unsigned BaseOff;
      95             :     InstClassEnum InstClass;
      96             :     bool GLC0;
      97             :     bool GLC1;
      98             :     bool SLC0;
      99             :     bool SLC1;
     100             :     bool UseST64;
     101             :     bool IsX2;
     102             :     SmallVector<MachineInstr*, 8> InstsToMove;
      103             :   };
     104             : 
     105             : private:
     106             :   const GCNSubtarget *STM = nullptr;
     107             :   const SIInstrInfo *TII = nullptr;
     108             :   const SIRegisterInfo *TRI = nullptr;
     109             :   MachineRegisterInfo *MRI = nullptr;
     110             :   AliasAnalysis *AA = nullptr;
     111             :   unsigned CreatedX2;
     112             : 
     113             :   static bool offsetsCanBeCombined(CombineInfo &CI);
     114             : 
     115             :   bool findMatchingInst(CombineInfo &CI);
     116             : 
     117             :   unsigned read2Opcode(unsigned EltSize) const;
     118             :   unsigned read2ST64Opcode(unsigned EltSize) const;
     119             :   MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
     120             : 
     121             :   unsigned write2Opcode(unsigned EltSize) const;
     122             :   unsigned write2ST64Opcode(unsigned EltSize) const;
     123             :   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
     124             :   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
     125             :   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
     126             :   unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
     127             :                                     bool &IsOffen) const;
     128             :   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
     129             : 
     130             : public:
     131             :   static char ID;
     132             : 
     133        1923 :   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
     134        1923 :     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
     135        1923 :   }
     136             : 
     137             :   bool optimizeBlock(MachineBasicBlock &MBB);
     138             : 
     139             :   bool runOnMachineFunction(MachineFunction &MF) override;
     140             : 
     141        1909 :   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
     142             : 
     143        1909 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     144        1909 :     AU.setPreservesCFG();
     145             :     AU.addRequired<AAResultsWrapperPass>();
     146             : 
     147        1909 :     MachineFunctionPass::getAnalysisUsage(AU);
     148        1909 :   }
     149             : };
     150             : 
     151             : } // end anonymous namespace.
     152             : 
     153       85105 : INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
     154             :                       "SI Load Store Optimizer", false, false)
     155       85105 : INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
     156      200947 : INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
     157             :                     "SI Load Store Optimizer", false, false)
     158             : 
     159             : char SILoadStoreOptimizer::ID = 0;
     160             : 
     161             : char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
     162             : 
     163           0 : FunctionPass *llvm::createSILoadStoreOptimizerPass() {
     164           0 :   return new SILoadStoreOptimizer();
     165             : }
     166             : 
     167        3432 : static void moveInstsAfter(MachineBasicBlock::iterator I,
     168             :                            ArrayRef<MachineInstr*> InstsToMove) {
     169        3432 :   MachineBasicBlock *MBB = I->getParent();
     170             :   ++I;
     171        4493 :   for (MachineInstr *MI : InstsToMove) {
     172        1061 :     MI->removeFromParent();
     173             :     MBB->insert(I, MI);
     174             :   }
     175        3432 : }
     176             : 
     177        8291 : static void addDefsUsesToList(const MachineInstr &MI,
     178             :                               DenseSet<unsigned> &RegDefs,
     179             :                               DenseSet<unsigned> &PhysRegUses) {
     180       57523 :   for (const MachineOperand &Op : MI.operands()) {
     181       49232 :     if (Op.isReg()) {
     182       28474 :       if (Op.isDef())
     183        4621 :         RegDefs.insert(Op.getReg());
     184       23850 :       else if (Op.readsReg() &&
     185       23850 :                TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
     186        9658 :         PhysRegUses.insert(Op.getReg());
     187             :     }
     188             :   }
     189        8291 : }
     190             : 
     191       17105 : static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
     192             :                                       MachineBasicBlock::iterator B,
     193             :                                       const SIInstrInfo *TII,
     194             :                                       AliasAnalysis * AA) {
     195             :   // RAW or WAR - cannot reorder
     196             :   // WAW - cannot reorder
     197             :   // RAR - safe to reorder
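                      :   // In other words: two loads can always be reordered; anything involving a
                      :   // store is only safe when the accesses can be proven disjoint
                      :   // (areMemAccessesTriviallyDisjoint, backed by alias analysis).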
     198       19703 :   return !(A->mayStore() || B->mayStore()) ||
     199        2598 :     TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
     200             : }
     201             : 
     202             : // Add MI and its defs to the lists if MI reads one of the defs that are
     203             : // already in the list. Returns true in that case.
     204             : static bool
     205       61380 : addToListsIfDependent(MachineInstr &MI,
     206             :                       DenseSet<unsigned> &RegDefs,
     207             :                       DenseSet<unsigned> &PhysRegUses,
     208             :                       SmallVectorImpl<MachineInstr*> &Insts) {
     209      370106 :   for (MachineOperand &Use : MI.operands()) {
     210             :     // If one of the defs is read, then there is a use of Def between I and the
     211             :     // instruction that I will potentially be merged with. We will need to move
     212             :     // this instruction after the merged instructions.
     213             :     //
     214             :     // Similarly, if there is a def which is read by an instruction that is to
     215             :     // be moved for merging, then we need to move the def-instruction as well.
     216             :     // This can only happen for physical registers such as M0; virtual
     217             :     // registers are in SSA form.
     218      311487 :     if (Use.isReg() &&
     219      318708 :         ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
     220      113944 :          (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
     221      309536 :           PhysRegUses.count(Use.getReg())))) {
     222        2761 :       Insts.push_back(&MI);
     223        2761 :       addDefsUsesToList(MI, RegDefs, PhysRegUses);
     224        2761 :       return true;
     225             :     }
     226             :   }
     227             : 
     228             :   return false;
     229             : }
     230             : 
     231             : static bool
     232       19619 : canMoveInstsAcrossMemOp(MachineInstr &MemOp,
     233             :                         ArrayRef<MachineInstr*> InstsToMove,
     234             :                         const SIInstrInfo *TII,
     235             :                         AliasAnalysis *AA) {
     236             :   assert(MemOp.mayLoadOrStore());
     237             : 
     238       43395 :   for (MachineInstr *InstToMove : InstsToMove) {
     239       23824 :     if (!InstToMove->mayLoadOrStore())
     240             :       continue;
     241          74 :     if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
     242             :         return false;
     243             :   }
     244             :   return true;
     245             : }
     246             : 
     247        7166 : bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
     248             :   // XXX - Would the same offset be OK? Is there any reason this would happen or
     249             :   // be useful?
     250        7166 :   if (CI.Offset0 == CI.Offset1)
     251             :     return false;
     252             : 
     253             :   // This won't be valid if the offset isn't aligned.
     254        7142 :   if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
     255             :     return false;
     256             : 
     257        7142 :   unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
     258        7142 :   unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
     259        7142 :   CI.UseST64 = false;
     260        7142 :   CI.BaseOff = 0;
     261             : 
     262             :   // Handle SMEM and VMEM instructions.
     263        7142 :   if (CI.InstClass != DS_READ_WRITE) {
     264        4040 :     unsigned Diff = CI.IsX2 ? 2 : 1;
     265        7727 :     return (EltOffset0 + Diff == EltOffset1 ||
     266        3687 :             EltOffset1 + Diff == EltOffset0) &&
     267        4040 :            CI.GLC0 == CI.GLC1 &&
     268        4040 :            (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
     269             :   }
     270             : 
      271             :   // If the offset in elements doesn't fit in 8 bits, we might be able to use
     272             :   // the stride 64 versions.
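                      :   // For illustration (values assumed, not taken from a real test): with
                      :   // EltSize == 4, byte offsets 0 and 1024 give element offsets 0 and 256;
                      :   // 256 does not fit in 8 bits, but both are multiples of 64, so the ST64
                      :   // form encodes them as 0 and 4.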
     273         280 :   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
     274        3144 :       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
     275          42 :     CI.Offset0 = EltOffset0 / 64;
     276          42 :     CI.Offset1 = EltOffset1 / 64;
     277          42 :     CI.UseST64 = true;
     278          42 :     return true;
     279             :   }
     280             : 
     281             :   // Check if the new offsets fit in the reduced 8-bit range.
     282        3060 :   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
     283        2990 :     CI.Offset0 = EltOffset0;
     284        2990 :     CI.Offset1 = EltOffset1;
     285        2990 :     return true;
     286             :   }
     287             : 
     288             :   // Try to shift base address to decrease offsets.
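                      :   // For example (hypothetical values): with EltSize == 4 and byte offsets
                      :   // 4096 and 4100, element offsets 1024 and 1025 do not fit in 8 bits, but
                      :   // their difference is 1, so rebasing at BaseOff = 4096 leaves offsets 0 and 1.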
     289          70 :   unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
     290          70 :   CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
     291             : 
     292          70 :   if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
     293          24 :     CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
     294          24 :     CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
     295          24 :     CI.UseST64 = true;
     296          24 :     return true;
     297             :   }
     298             : 
     299          46 :   if (isUInt<8>(OffsetDiff)) {
     300          37 :     CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
     301          37 :     CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
     302          37 :     return true;
     303             :   }
     304             : 
     305             :   return false;
     306             : }
     307             : 
     308       17465 : bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     309       17465 :   MachineBasicBlock *MBB = CI.I->getParent();
     310             :   MachineBasicBlock::iterator E = MBB->end();
     311             :   MachineBasicBlock::iterator MBBI = CI.I;
     312             : 
     313       17465 :   unsigned AddrOpName[3] = {0};
     314             :   int AddrIdx[3];
     315             :   const MachineOperand *AddrReg[3];
     316             :   unsigned NumAddresses = 0;
     317             : 
     318       17465 :   switch (CI.InstClass) {
     319        4503 :   case DS_READ_WRITE:
     320        4503 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
     321        4503 :     break;
     322         567 :   case S_BUFFER_LOAD_IMM:
     323         567 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
     324         567 :     break;
     325        5518 :   case BUFFER_LOAD_OFFEN:
     326             :   case BUFFER_STORE_OFFEN:
     327        5518 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
     328        5518 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
     329        5518 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
     330        5518 :     break;
     331        6877 :   case BUFFER_LOAD_OFFSET:
     332             :   case BUFFER_STORE_OFFSET:
     333        6877 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
     334        6877 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
     335        6877 :     break;
     336             :   }
     337             : 
     338       23096 :   for (unsigned i = 0; i < NumAddresses; i++) {
     339       36846 :     AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
     340       36846 :     AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
     341             : 
     342             :     // We only ever merge operations with the same base address register, so don't
     343             :     // bother scanning forward if there are no other uses.
     344       18423 :     if (AddrReg[i]->isReg() &&
     345       29903 :         (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
     346       12242 :          MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
     347       12792 :       return false;
     348             :   }
     349             : 
     350             :   ++MBBI;
     351             : 
     352             :   DenseSet<unsigned> RegDefsToMove;
     353             :   DenseSet<unsigned> PhysRegUsesToMove;
     354        4673 :   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
     355             : 
     356       63443 :   for ( ; MBBI != E; ++MBBI) {
     357      186867 :     if (MBBI->getOpcode() != CI.I->getOpcode()) {
      358             :       // This is not a matching instruction, but we can keep looking as
      359             :       // long as one of these conditions is met:
     360             :       // 1. It is safe to move I down past MBBI.
     361             :       // 2. It is safe to move MBBI down past the instruction that I will
     362             :       //    be merged into.
     363             : 
     364       55051 :       if (MBBI->hasUnmodeledSideEffects()) {
     365             :         // We can't re-order this instruction with respect to other memory
     366             :         // operations, so we fail both conditions mentioned above.
     367             :         return false;
     368             :       }
     369             : 
     370       68268 :       if (MBBI->mayLoadOrStore() &&
     371       38960 :         (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
     372       24868 :          !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
     373             :         // We fail condition #1, but we may still be able to satisfy condition
     374             :         // #2.  Add this instruction to the move list and then we will check
     375             :         // if condition #2 holds once we have selected the matching instruction.
     376         857 :         CI.InstsToMove.push_back(&*MBBI);
     377         857 :         addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
     378         857 :         continue;
     379             :       }
     380             : 
      381             :       // When we match I with another mergeable instruction, we will be moving
      382             :       // I down to the location of the matched instruction; any uses of I will
      383             :       // need to be moved down as well.
     384       54148 :       addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
     385             :                             CI.InstsToMove);
     386       54148 :       continue;
     387             :     }
     388             : 
     389             :     // Don't merge volatiles.
     390        7238 :     if (MBBI->hasOrderedMemoryRef())
     391             :       return false;
     392             : 
     393             :     // Handle a case like
     394             :     //   DS_WRITE_B32 addr, v, idx0
     395             :     //   w = DS_READ_B32 addr, idx0
     396             :     //   DS_WRITE_B32 addr, f(w), idx1
     397             :     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
     398             :     // merging of the two writes.
     399        7232 :     if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
     400             :                               CI.InstsToMove))
     401             :       continue;
     402             : 
     403             :     bool Match = true;
     404       15544 :     for (unsigned i = 0; i < NumAddresses; i++) {
     405        8378 :       const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
     406             : 
     407       16756 :       if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
     408        1066 :         if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
     409        1066 :             AddrReg[i]->getImm() != AddrRegNext.getImm()) {
     410             :           Match = false;
     411             :           break;
     412             :         }
     413             :         continue;
     414             :       }
     415             : 
     416             :       // Check same base pointer. Be careful of subregisters, which can occur with
     417             :       // vectors of pointers.
     418        7312 :       if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
     419             :           AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
     420             :         Match = false;
     421             :         break;
     422             :       }
     423             :     }
     424             : 
     425        7200 :     if (Match) {
     426        7166 :       int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
     427             :                                                  AMDGPU::OpName::offset);
     428        7166 :       CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
     429        7166 :       CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
     430        7166 :       CI.Paired = MBBI;
     431             : 
     432        7166 :       if (CI.InstClass == DS_READ_WRITE) {
     433        3118 :         CI.Offset0 &= 0xffff;
     434        3118 :         CI.Offset1 &= 0xffff;
     435             :       } else {
     436        4048 :         CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
     437        4048 :         CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
     438        4048 :         if (CI.InstClass != S_BUFFER_LOAD_IMM) {
     439        1066 :           CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
     440        1066 :           CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
     441             :         }
     442             :       }
     443             : 
     444             :       // Check both offsets fit in the reduced range.
     445             :       // We also need to go through the list of instructions that we plan to
     446             :       // move and make sure they are all safe to move down past the merged
     447             :       // instruction.
     448        7166 :       if (offsetsCanBeCombined(CI))
     449        6876 :         if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
     450             :           return true;
     451             :     }
     452             : 
     453             :     // We've found a load/store that we couldn't merge for some reason.
     454             :     // We could potentially keep looking, but we'd need to make sure that
      455             :     // it was safe to move I and also all the instructions in InstsToMove
      456             :     // down past this instruction.
      457             :     // Check if we can move I across MBBI and if we can move all of I's users.
     458       11283 :     if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
     459       11262 :         !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
     460             :       break;
     461             :   }
     462             :   return false;
     463             : }
     464             : 
     465           0 : unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
     466         949 :   if (STM->ldsRequiresM0Init())
     467         756 :     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
     468         193 :   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
     469             : }
     470             : 
     471           0 : unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
     472          38 :   if (STM->ldsRequiresM0Init())
     473          22 :     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
     474             : 
     475          16 :   return (EltSize == 4) ?
     476             :     AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
     477             : }
     478             : 
      479         987 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
     480             :   CombineInfo &CI) {
     481         987 :   MachineBasicBlock *MBB = CI.I->getParent();
     482             : 
     483             :   // Be careful, since the addresses could be subregisters themselves in weird
     484             :   // cases, like vectors of pointers.
     485         987 :   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
     486             : 
     487         987 :   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
     488         987 :   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
     489             : 
     490         987 :   unsigned NewOffset0 = CI.Offset0;
     491         987 :   unsigned NewOffset1 = CI.Offset1;
     492         987 :   unsigned Opc = CI.UseST64 ?
     493         987 :     read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
     494             : 
     495         987 :   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
     496         987 :   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
     497             : 
     498         987 :   if (NewOffset0 > NewOffset1) {
     499             :     // Canonicalize the merged instruction so the smaller offset comes first.
     500             :     std::swap(NewOffset0, NewOffset1);
     501             :     std::swap(SubRegIdx0, SubRegIdx1);
     502             :   }
     503             : 
     504             :   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
     505             :          (NewOffset0 != NewOffset1) &&
     506             :          "Computed offset doesn't fit");
     507             : 
     508         987 :   const MCInstrDesc &Read2Desc = TII->get(Opc);
     509             : 
     510             :   const TargetRegisterClass *SuperRC
     511         987 :     = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
     512        1974 :   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
     513             : 
     514             :   DebugLoc DL = CI.I->getDebugLoc();
     515             : 
     516         987 :   unsigned BaseReg = AddrReg->getReg();
     517             :   unsigned BaseSubReg = AddrReg->getSubReg();
     518             :   unsigned BaseRegFlags = 0;
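                      :   // If offsetsCanBeCombined() rebased the offsets (CI.BaseOff != 0), materialize
                      :   // the new base address first: roughly an S_MOV_B32 of BaseOff into an SGPR
                      :   // followed by a VALU add with the original address (via getAddNoCarry); the
                      :   // read2 built below then uses the small rebased offset0/offset1.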
     519         987 :   if (CI.BaseOff) {
     520          62 :     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     521          62 :     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
     522          31 :       .addImm(CI.BaseOff);
     523             : 
     524          62 :     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     525             :     BaseRegFlags = RegState::Kill;
     526             : 
     527          62 :     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
     528          31 :       .addReg(ImmReg)
     529          31 :       .addReg(AddrReg->getReg(), 0, BaseSubReg);
     530             :     BaseSubReg = 0;
     531             :   }
     532             : 
     533         987 :   MachineInstrBuilder Read2 = BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
     534         987 :                         .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
     535         987 :                         .addImm(NewOffset0)                        // offset0
     536         987 :                         .addImm(NewOffset1)                        // offset1
     537             :                         .addImm(0)                                 // gds
     538        1974 :                         .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
     539             : 
     540             :   (void)Read2;
     541             : 
     542         987 :   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
     543             : 
     544             :   // Copy to the old destination registers.
     545         987 :   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     546             :       .add(*Dest0) // Copy to same destination including flags and sub reg.
     547         987 :       .addReg(DestReg, 0, SubRegIdx0);
     548         987 :   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     549             :                             .add(*Dest1)
     550         987 :                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
     551             : 
     552         987 :   moveInstsAfter(Copy1, CI.InstsToMove);
     553             : 
     554         987 :   MachineBasicBlock::iterator Next = std::next(CI.I);
     555         987 :   CI.I->eraseFromParent();
     556         987 :   CI.Paired->eraseFromParent();
     557             : 
     558             :   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
     559         987 :   return Next;
     560             : }
     561             : 
     562           0 : unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
     563        2072 :   if (STM->ldsRequiresM0Init())
     564        1463 :     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
     565         609 :   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
     566             : }
     567             : 
     568           0 : unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
     569          28 :   if (STM->ldsRequiresM0Init())
     570          14 :     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
     571             : 
     572          14 :   return (EltSize == 4) ?
     573             :     AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
     574             : }
     575             : 
     576        2100 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
     577             :   CombineInfo &CI) {
     578        2100 :   MachineBasicBlock *MBB = CI.I->getParent();
     579             : 
      580             :   // Be sure to use .add() and not .addReg() with these. We want to be sure we
      581             :   // preserve the subregister index and any register flags set on them.
     582        2100 :   const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
     583        2100 :   const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
     584             :   const MachineOperand *Data1
     585        2100 :     = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
     586             : 
     587        2100 :   unsigned NewOffset0 = CI.Offset0;
     588        2100 :   unsigned NewOffset1 = CI.Offset1;
     589        2100 :   unsigned Opc = CI.UseST64 ?
     590        2100 :     write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
     591             : 
     592        2100 :   if (NewOffset0 > NewOffset1) {
     593             :     // Canonicalize the merged instruction so the smaller offset comes first.
     594             :     std::swap(NewOffset0, NewOffset1);
     595             :     std::swap(Data0, Data1);
     596             :   }
     597             : 
     598             :   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
     599             :          (NewOffset0 != NewOffset1) &&
     600             :          "Computed offset doesn't fit");
     601             : 
     602        2100 :   const MCInstrDesc &Write2Desc = TII->get(Opc);
     603             :   DebugLoc DL = CI.I->getDebugLoc();
     604             : 
     605        2100 :   unsigned BaseReg = AddrReg->getReg();
     606             :   unsigned BaseSubReg = AddrReg->getSubReg();
     607             :   unsigned BaseRegFlags = 0;
     608        2100 :   if (CI.BaseOff) {
     609          60 :     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     610          60 :     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
     611          30 :       .addImm(CI.BaseOff);
     612             : 
     613          60 :     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     614             :     BaseRegFlags = RegState::Kill;
     615             : 
     616          60 :     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
     617          30 :       .addReg(ImmReg)
     618          30 :       .addReg(AddrReg->getReg(), 0, BaseSubReg);
     619             :     BaseSubReg = 0;
     620             :   }
     621             : 
     622        2100 :   MachineInstrBuilder Write2 = BuildMI(*MBB, CI.Paired, DL, Write2Desc)
     623        2100 :                         .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
     624             :                         .add(*Data0)                               // data0
     625             :                         .add(*Data1)                               // data1
     626        2100 :                         .addImm(NewOffset0)                        // offset0
     627        2100 :                         .addImm(NewOffset1)                        // offset1
     628             :                         .addImm(0)                                 // gds
     629        4200 :                         .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
     630             : 
     631        2100 :   moveInstsAfter(Write2, CI.InstsToMove);
     632             : 
     633        2100 :   MachineBasicBlock::iterator Next = std::next(CI.I);
     634        2100 :   CI.I->eraseFromParent();
     635        2100 :   CI.Paired->eraseFromParent();
     636             : 
     637             :   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
     638        2100 :   return Next;
     639             : }
     640             : 
     641           0 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
     642             :   CombineInfo &CI) {
     643           0 :   MachineBasicBlock *MBB = CI.I->getParent();
     644             :   DebugLoc DL = CI.I->getDebugLoc();
     645           0 :   unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
     646             :                               AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
     647             : 
     648             :   const TargetRegisterClass *SuperRC =
     649           0 :     CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
     650           0 :   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
     651           0 :   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
     652             : 
     653           0 :   BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
     654           0 :       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
     655           0 :       .addImm(MergedOffset) // offset
     656           0 :       .addImm(CI.GLC0)      // glc
     657           0 :       .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
     658             : 
     659           0 :   unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
     660           0 :   unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
     661             : 
     662             :   // Handle descending offsets
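                      :   // (The merged load always reads from the lower offset, so if I had the
                      :   // higher offset, its old destination must be copied from the upper
                      :   // subregister instead.)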
     663           0 :   if (CI.Offset0 > CI.Offset1)
     664             :     std::swap(SubRegIdx0, SubRegIdx1);
     665             : 
     666             :   // Copy to the old destination registers.
     667           0 :   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
     668           0 :   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
     669           0 :   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
     670             : 
     671           0 :   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     672             :       .add(*Dest0) // Copy to same destination including flags and sub reg.
     673           0 :       .addReg(DestReg, 0, SubRegIdx0);
     674           0 :   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     675             :                             .add(*Dest1)
     676           0 :                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
     677             : 
     678           0 :   moveInstsAfter(Copy1, CI.InstsToMove);
     679             : 
     680           0 :   MachineBasicBlock::iterator Next = std::next(CI.I);
     681           0 :   CI.I->eraseFromParent();
     682           0 :   CI.Paired->eraseFromParent();
     683           0 :   return Next;
     684             : }
     685             : 
     686           0 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
     687             :   CombineInfo &CI) {
     688           0 :   MachineBasicBlock *MBB = CI.I->getParent();
     689             :   DebugLoc DL = CI.I->getDebugLoc();
     690             :   unsigned Opcode;
     691             : 
     692           0 :   if (CI.InstClass == BUFFER_LOAD_OFFEN) {
     693           0 :     Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
     694             :                        AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
     695             :   } else {
     696           0 :     Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
     697             :                        AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
     698             :   }
     699             : 
     700             :   const TargetRegisterClass *SuperRC =
     701           0 :     CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
     702           0 :   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
     703           0 :   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
     704             : 
     705           0 :   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
     706             : 
     707           0 :   if (CI.InstClass == BUFFER_LOAD_OFFEN)
     708           0 :       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
     709             : 
     710           0 :   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
     711           0 :       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
     712           0 :       .addImm(MergedOffset) // offset
     713           0 :       .addImm(CI.GLC0)      // glc
     714           0 :       .addImm(CI.SLC0)      // slc
     715             :       .addImm(0)            // tfe
     716           0 :       .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
     717             : 
     718           0 :   unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
     719           0 :   unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
     720             : 
     721             :   // Handle descending offsets
     722           0 :   if (CI.Offset0 > CI.Offset1)
     723             :     std::swap(SubRegIdx0, SubRegIdx1);
     724             : 
     725             :   // Copy to the old destination registers.
     726           0 :   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
     727           0 :   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
     728           0 :   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
     729             : 
     730           0 :   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     731             :       .add(*Dest0) // Copy to same destination including flags and sub reg.
     732           0 :       .addReg(DestReg, 0, SubRegIdx0);
     733           0 :   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     734             :                             .add(*Dest1)
     735           0 :                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
     736             : 
     737           0 :   moveInstsAfter(Copy1, CI.InstsToMove);
     738             : 
     739           0 :   MachineBasicBlock::iterator Next = std::next(CI.I);
     740           0 :   CI.I->eraseFromParent();
     741           0 :   CI.Paired->eraseFromParent();
     742           0 :   return Next;
     743             : }
     744             : 
     745           0 : unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
     746             :   const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
     747           0 :   IsX2 = false;
     748           0 :   IsOffen = false;
     749             : 
     750           0 :   switch (I.getOpcode()) {
     751           0 :   case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
     752           0 :     IsOffen = true;
     753           0 :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
     754           0 :   case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
     755           0 :     IsOffen = true;
     756           0 :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
     757           0 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
     758           0 :     IsX2 = true;
     759           0 :     IsOffen = true;
     760           0 :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
     761           0 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
     762           0 :     IsX2 = true;
     763           0 :     IsOffen = true;
     764           0 :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
     765             :   case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
     766             :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
     767           0 :   case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
     768           0 :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
     769           0 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
     770           0 :     IsX2 = true;
     771           0 :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
     772           0 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
     773           0 :     IsX2 = true;
     774           0 :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
     775             :   }
     776           0 :   return 0;
     777             : }
     778             : 
     779           0 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
     780             :   CombineInfo &CI) {
     781           0 :   MachineBasicBlock *MBB = CI.I->getParent();
     782             :   DebugLoc DL = CI.I->getDebugLoc();
     783             :   bool Unused1, Unused2;
     784           0 :   unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);
     785             : 
     786           0 :   unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
     787           0 :   unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
     788             : 
     789             :   // Handle descending offsets
     790           0 :   if (CI.Offset0 > CI.Offset1)
     791             :     std::swap(SubRegIdx0, SubRegIdx1);
     792             : 
     793             :   // Copy to the new source register.
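                      :   // A REG_SEQUENCE glues the two stored values into one wide register so the
                      :   // merged store can write them as a single dwordx2/dwordx4 payload.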
     794             :   const TargetRegisterClass *SuperRC =
     795           0 :     CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
     796           0 :   unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
     797             : 
     798           0 :   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
     799           0 :   const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
     800             : 
     801           0 :   BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
     802             :       .add(*Src0)
     803           0 :       .addImm(SubRegIdx0)
     804             :       .add(*Src1)
     805           0 :       .addImm(SubRegIdx1);
     806             : 
     807           0 :   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
     808           0 :       .addReg(SrcReg, RegState::Kill);
     809             : 
     810           0 :   if (CI.InstClass == BUFFER_STORE_OFFEN)
     811           0 :       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
     812             : 
     813           0 :   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
     814           0 :       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
     815           0 :       .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
     816           0 :       .addImm(CI.GLC0)                          // glc
     817           0 :       .addImm(CI.SLC0)                          // slc
     818             :       .addImm(0)                                // tfe
     819           0 :       .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
     820             : 
     821           0 :   moveInstsAfter(MIB, CI.InstsToMove);
     822             : 
     823           0 :   MachineBasicBlock::iterator Next = std::next(CI.I);
     824           0 :   CI.I->eraseFromParent();
     825           0 :   CI.Paired->eraseFromParent();
     826           0 :   return Next;
     827             : }
     828             : 
      829             : // Scan through looking for adjacent memory operations with constant offsets
      830             : // from the same base register. We rely on the scheduler to do the hard work
      831             : // of clustering nearby loads, and assume these are all adjacent.
     832       22025 : bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
     833             :   bool Modified = false;
     834             : 
     835      468070 :   for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
     836             :     MachineInstr &MI = *I;
     837             : 
     838             :     // Don't combine if volatile.
     839      446045 :     if (MI.hasOrderedMemoryRef()) {
     840             :       ++I;
     841       33113 :       continue;
     842             :     }
     843             : 
     844             :     CombineInfo CI;
     845      430397 :     CI.I = I;
     846      430397 :     unsigned Opc = MI.getOpcode();
     847      430397 :     if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
     848      428910 :         Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
     849             : 
     850        1827 :       CI.InstClass = DS_READ_WRITE;
     851        1827 :       CI.EltSize =
     852        1827 :         (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
     853             : 
     854        1827 :       if (findMatchingInst(CI)) {
     855             :         Modified = true;
     856         987 :         I = mergeRead2Pair(CI);
     857             :       } else {
     858             :         ++I;
     859             :       }
     860             : 
     861        1827 :       continue;
     862      428570 :     } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
     863      853326 :                Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
     864      426663 :                Opc == AMDGPU::DS_WRITE_B64_gfx9) {
     865        2676 :       CI.InstClass = DS_READ_WRITE;
     866             :       CI.EltSize
     867        2676 :         = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
     868             : 
     869        2676 :       if (findMatchingInst(CI)) {
     870             :         Modified = true;
     871        2100 :         I = mergeWrite2Pair(CI);
     872             :       } else {
     873             :         ++I;
     874             :       }
     875             : 
     876        2676 :       continue;
     877             :     }
     878      851788 :     if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
     879      425894 :         Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
     880             :       // EltSize is in units of the offset encoding.
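                      :       // (Assumed behavior: getSMRDEncodedOffset(*STM, 4) returns 1 on targets
                      :       // that encode SMRD offsets in dwords and 4 where they are byte offsets,
                      :       // which is why it serves as the element size here.)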
     881         567 :       CI.InstClass = S_BUFFER_LOAD_IMM;
     882         567 :       CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
     883         567 :       CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
     884         567 :       if (findMatchingInst(CI)) {
     885             :         Modified = true;
     886         187 :         I = mergeSBufferLoadImmPair(CI);
     887         187 :         if (!CI.IsX2)
     888         145 :           CreatedX2++;
     889             :       } else {
     890             :         ++I;
     891             :       }
     892         567 :       continue;
     893             :     }
     894      850654 :     if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
     895      425327 :         Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
     896      846256 :         Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
     897      423128 :         Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
     898        3627 :       if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
     899             :           Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
     900        2199 :         CI.InstClass = BUFFER_LOAD_OFFEN;
     901             :       else
     902        1428 :         CI.InstClass = BUFFER_LOAD_OFFSET;
     903             : 
     904        3627 :       CI.EltSize = 4;
     905        3627 :       CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
     906        3627 :                 Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
     907        3627 :       if (findMatchingInst(CI)) {
     908             :         Modified = true;
     909          92 :         I = mergeBufferLoadPair(CI);
     910          92 :         if (!CI.IsX2)
     911          59 :           CreatedX2++;
     912             :       } else {
     913             :         ++I;
     914             :       }
     915        3627 :       continue;
     916             :     }
     917             : 
     918             :     bool StoreIsX2, IsOffen;
     919      421700 :     if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
     920        8768 :       CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
     921        8768 :       CI.EltSize = 4;
     922        8768 :       CI.IsX2 = StoreIsX2;
     923        8768 :       if (findMatchingInst(CI)) {
     924             :         Modified = true;
     925          66 :         I = mergeBufferStorePair(CI);
     926          66 :         if (!CI.IsX2)
     927          48 :           CreatedX2++;
     928             :       } else {
     929             :         ++I;
     930             :       }
     931        8768 :       continue;
     932             :     }
     933             : 
     934             :     ++I;
     935             :   }
     936             : 
     937       22025 :   return Modified;
     938             : }
     939             : 
     940       19540 : bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
     941       19540 :   if (skipFunction(MF.getFunction()))
     942             :     return false;
     943             : 
     944       19537 :   STM = &MF.getSubtarget<GCNSubtarget>();
     945       19537 :   if (!STM->loadStoreOptEnabled())
     946             :     return false;
     947             : 
     948       19536 :   TII = STM->getInstrInfo();
     949       19536 :   TRI = &TII->getRegisterInfo();
     950             : 
     951       19536 :   MRI = &MF.getRegInfo();
     952       19536 :   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
     953             : 
     954             :   assert(MRI->isSSA() && "Must be run on SSA");
     955             : 
     956             :   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
     957             : 
     958             :   bool Modified = false;
     959             : 
     960       41501 :   for (MachineBasicBlock &MBB : MF) {
     961       21965 :     CreatedX2 = 0;
     962       21965 :     Modified |= optimizeBlock(MBB);
     963             : 
     964             :     // Run again to convert x2 to x4.
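                      :     // (E.g. four adjacent s_buffer_load_dword merge pairwise into two
                      :     // s_buffer_load_dwordx2 on the first pass; a second pass over the block
                      :     // can then merge those into a single s_buffer_load_dwordx4.)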
     965       21965 :     if (CreatedX2 >= 1)
     966          60 :       Modified |= optimizeBlock(MBB);
     967             :   }
     968             : 
     969             :   return Modified;
     970             : }

Generated by: LCOV version 1.13