LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SILoadStoreOptimizer.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2018-02-23 15:42:53
                Hit    Total   Coverage
Lines:          390    392     99.5 %
Functions:       21     23     91.3 %

          Line data    Source code
       1             : //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
       10             : // This pass tries to fuse DS instructions with nearby immediate offsets.
      11             : // This will fuse operations such as
      12             : //  ds_read_b32 v0, v2 offset:16
      13             : //  ds_read_b32 v1, v2 offset:32
      14             : // ==>
      15             : //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
      16             : //
      17             : // The same is done for certain SMEM and VMEM opcodes, e.g.:
      18             : //  s_buffer_load_dword s4, s[0:3], 4
      19             : //  s_buffer_load_dword s5, s[0:3], 8
      20             : // ==>
      21             : //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
      22             : //
      23             : //
      24             : // Future improvements:
      25             : //
      26             : // - This currently relies on the scheduler to place loads and stores next to
      27             : //   each other, and then only merges adjacent pairs of instructions. It would
      28             : //   be good to be more flexible with interleaved instructions, and possibly run
       29             : //   before scheduling. It currently misses stores of constants because loading
      30             : //   the constant into the data register is placed between the stores, although
      31             : //   this is arguably a scheduling problem.
      32             : //
       33             : // - Live interval recomputation seems inefficient. This currently matches only
       34             : //   one pair, recomputes live intervals, and then moves on to the next pair. It
       35             : //   would be better to compute a list of all merges that need to occur.
      36             : //
       37             : // - With a list of instructions to process, we can also merge more. If a
       38             : //   cluster of loads has offsets that are too large to fit in the 8-bit
       39             : //   offset fields, but are close enough to one another, we can add to the
       40             : //   base pointer and use the new, reduced offsets.
      41             : //
      42             : //===----------------------------------------------------------------------===//
      43             : 
      44             : #include "AMDGPU.h"
      45             : #include "AMDGPUSubtarget.h"
      46             : #include "SIInstrInfo.h"
      47             : #include "SIRegisterInfo.h"
      48             : #include "Utils/AMDGPUBaseInfo.h"
      49             : #include "llvm/ADT/ArrayRef.h"
      50             : #include "llvm/ADT/SmallVector.h"
      51             : #include "llvm/ADT/StringRef.h"
      52             : #include "llvm/Analysis/AliasAnalysis.h"
      53             : #include "llvm/CodeGen/MachineBasicBlock.h"
      54             : #include "llvm/CodeGen/MachineFunction.h"
      55             : #include "llvm/CodeGen/MachineFunctionPass.h"
      56             : #include "llvm/CodeGen/MachineInstr.h"
      57             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      58             : #include "llvm/CodeGen/MachineOperand.h"
      59             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      60             : #include "llvm/IR/DebugLoc.h"
      61             : #include "llvm/Pass.h"
      62             : #include "llvm/Support/Debug.h"
      63             : #include "llvm/Support/MathExtras.h"
      64             : #include "llvm/Support/raw_ostream.h"
      65             : #include <algorithm>
      66             : #include <cassert>
      67             : #include <cstdlib>
      68             : #include <iterator>
      69             : #include <utility>
      70             : 
      71             : using namespace llvm;
      72             : 
      73             : #define DEBUG_TYPE "si-load-store-opt"
      74             : 
      75             : namespace {
      76             : 
      77        1637 : class SILoadStoreOptimizer : public MachineFunctionPass {
      78             :   enum InstClassEnum {
      79             :     DS_READ_WRITE,
      80             :     S_BUFFER_LOAD_IMM,
      81             :     BUFFER_LOAD_OFFEN,
      82             :     BUFFER_LOAD_OFFSET,
      83             :     BUFFER_STORE_OFFEN,
      84             :     BUFFER_STORE_OFFSET,
      85             :   };
      86             : 
      87             :   struct CombineInfo {
      88             :     MachineBasicBlock::iterator I;
      89             :     MachineBasicBlock::iterator Paired;
      90             :     unsigned EltSize;
      91             :     unsigned Offset0;
      92             :     unsigned Offset1;
      93             :     unsigned BaseOff;
      94             :     InstClassEnum InstClass;
      95             :     bool GLC0;
      96             :     bool GLC1;
      97             :     bool SLC0;
      98             :     bool SLC1;
      99             :     bool UseST64;
     100             :     bool IsX2;
     101             :     SmallVector<MachineInstr*, 8> InstsToMove;
     102             :    };
     103             : 
     104             : private:
     105             :   const SISubtarget *STM = nullptr;
     106             :   const SIInstrInfo *TII = nullptr;
     107             :   const SIRegisterInfo *TRI = nullptr;
     108             :   MachineRegisterInfo *MRI = nullptr;
     109             :   AliasAnalysis *AA = nullptr;
     110             :   unsigned CreatedX2;
     111             : 
     112             :   static bool offsetsCanBeCombined(CombineInfo &CI);
     113             : 
     114             :   bool findMatchingInst(CombineInfo &CI);
     115             : 
     116             :   unsigned read2Opcode(unsigned EltSize) const;
     117             :   unsigned read2ST64Opcode(unsigned EltSize) const;
     118             :   MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
     119             : 
     120             :   unsigned write2Opcode(unsigned EltSize) const;
     121             :   unsigned write2ST64Opcode(unsigned EltSize) const;
     122             :   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
     123             :   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
     124             :   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
     125             :   unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
     126             :                                     bool &IsOffen) const;
     127             :   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
     128             : 
     129             : public:
     130             :   static char ID;
     131             : 
     132        1645 :   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
     133        1645 :     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
     134        1645 :   }
     135             : 
     136             :   bool optimizeBlock(MachineBasicBlock &MBB);
     137             : 
     138             :   bool runOnMachineFunction(MachineFunction &MF) override;
     139             : 
     140        1635 :   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
     141             : 
     142        1635 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     143        1635 :     AU.setPreservesCFG();
     144             :     AU.addRequired<AAResultsWrapperPass>();
     145             : 
     146        1635 :     MachineFunctionPass::getAnalysisUsage(AU);
     147        1635 :   }
     148             : };
     149             : 
     150             : } // end anonymous namespace.
     151             : 
     152       59438 : INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
     153             :                       "SI Load Store Optimizer", false, false)
     154       59438 : INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
     155      283018 : INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
     156             :                     "SI Load Store Optimizer", false, false)
     157             : 
     158             : char SILoadStoreOptimizer::ID = 0;
     159             : 
     160             : char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
     161             : 
     162           0 : FunctionPass *llvm::createSILoadStoreOptimizerPass() {
     163           0 :   return new SILoadStoreOptimizer();
     164             : }
     165             : 
     166        3026 : static void moveInstsAfter(MachineBasicBlock::iterator I,
     167             :                            ArrayRef<MachineInstr*> InstsToMove) {
     168        3026 :   MachineBasicBlock *MBB = I->getParent();
     169             :   ++I;
     170        5188 :   for (MachineInstr *MI : InstsToMove) {
     171        1081 :     MI->removeFromParent();
     172             :     MBB->insert(I, MI);
     173             :   }
     174        3026 : }
     175             : 
     176        7077 : static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
     177       87757 :   for (const MachineOperand &Def : MI.operands()) {
     178       63979 :     if (Def.isReg() && Def.isDef())
     179        7586 :       Defs.insert(Def.getReg());
     180             :   }
     181        7077 : }
     182             : 
     183       15571 : static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
     184             :                                       MachineBasicBlock::iterator B,
     185             :                                       const SIInstrInfo *TII,
     186             :                                       AliasAnalysis * AA) {
     187             :   // RAW or WAR - cannot reorder
     188             :   // WAW - cannot reorder
     189             :   // RAR - safe to reorder
     190       17485 :   return !(A->mayStore() || B->mayStore()) ||
     191       17485 :     TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
     192             : }
     193             : 
     194             : // Add MI and its defs to the lists if MI reads one of the defs that are
     195             : // already in the list. Returns true in that case.
     196             : static bool
     197       31633 : addToListsIfDependent(MachineInstr &MI,
     198             :                       DenseSet<unsigned> &Defs,
     199             :                       SmallVectorImpl<MachineInstr*> &Insts) {
     200      265121 :   for (MachineOperand &Use : MI.operands()) {
     201             :     // If one of the defs is read, then there is a use of Def between I and the
     202             :     // instruction that I will potentially be merged with. We will need to move
     203             :     // this instruction after the merged instructions.
     204             : 
     205      281690 :     if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
     206        2165 :       Insts.push_back(&MI);
     207        2165 :       addDefsToList(MI, Defs);
     208        2165 :       return true;
     209             :     }
     210             :   }
     211             : 
     212             :   return false;
     213             : }
     214             : 
     215             : static bool
     216       17787 : canMoveInstsAcrossMemOp(MachineInstr &MemOp,
     217             :                         ArrayRef<MachineInstr*> InstsToMove,
     218             :                         const SIInstrInfo *TII,
     219             :                         AliasAnalysis *AA) {
     220             :   assert(MemOp.mayLoadOrStore());
     221             : 
     222       65619 :   for (MachineInstr *InstToMove : InstsToMove) {
     223       23943 :     if (!InstToMove->mayLoadOrStore())
     224       23900 :       continue;
     225          43 :     if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
     226             :         return false;
     227             :   }
     228             :   return true;
     229             : }
     230             : 
     231             : static bool
     232             : hasPhysRegDef(MachineInstr &MI) {
     233       83328 :   for (const MachineOperand &Def : MI.defs()) {
     234       54374 :     if (Def.isReg() &&
     235       27187 :         TargetRegisterInfo::isPhysicalRegister(Def.getReg()))
     236             :       return true;
     237             :   }
     238             :   return false;
     239             : }
     240             : 
     241        6747 : bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
     242             :   // XXX - Would the same offset be OK? Is there any reason this would happen or
     243             :   // be useful?
     244        6747 :   if (CI.Offset0 == CI.Offset1)
     245             :     return false;
     246             : 
     247             :   // This won't be valid if the offset isn't aligned.
     248        6723 :   if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
     249             :     return false;
     250             : 
     251        6723 :   unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
     252        6723 :   unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
     253        6723 :   CI.UseST64 = false;
     254        6723 :   CI.BaseOff = 0;
     255             : 
     256             :   // Handle SMEM and VMEM instructions.
     257        6723 :   if (CI.InstClass != DS_READ_WRITE) {
     258        3982 :     unsigned Diff = CI.IsX2 ? 2 : 1;
     259        7660 :     return (EltOffset0 + Diff == EltOffset1 ||
     260        3982 :             EltOffset1 + Diff == EltOffset0) &&
     261        4586 :            CI.GLC0 == CI.GLC1 &&
     262         116 :            (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
     263             :   }
     264             : 
      265             :   // If the offset in elements doesn't fit in 8 bits, we might be able to use
     266             :   // the stride 64 versions.
     267        3032 :   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
     268        2825 :       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
     269          42 :     CI.Offset0 = EltOffset0 / 64;
     270          42 :     CI.Offset1 = EltOffset1 / 64;
     271          42 :     CI.UseST64 = true;
     272          42 :     return true;
     273             :   }
     274             : 
     275             :   // Check if the new offsets fit in the reduced 8-bit range.
     276        2699 :   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
     277        2634 :     CI.Offset0 = EltOffset0;
     278        2634 :     CI.Offset1 = EltOffset1;
     279        2634 :     return true;
     280             :   }
     281             : 
     282             :   // Try to shift base address to decrease offsets.
     283          65 :   unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
     284         130 :   CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
     285             : 
     286          65 :   if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
     287          24 :     CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
     288          24 :     CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
     289          24 :     CI.UseST64 = true;
     290          24 :     return true;
     291             :   }
     292             : 
     293          41 :   if (isUInt<8>(OffsetDiff)) {
     294          32 :     CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
     295          32 :     CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
     296          32 :     return true;
     297             :   }
     298             : 
     299             :   return false;
     300             : }
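                      : // A worked example of the arithmetic above (illustrative numbers, not taken
                      : // from the original source or any test): with EltSize = 4 and byte offsets
                      : // 0x100 and 0x200, the element offsets are 64 and 128. Both are multiples of
                      : // 64 and the scaled values 1 and 2 fit in 8 bits, so the ST64 form is used
                      : // with Offset0 = 1, Offset1 = 2, and BaseOff left at 0. With byte offsets
                      : // 1200 and 1232 instead (element offsets 300 and 308), neither 8-bit form
                      : // fits directly, so BaseOff becomes 1200 and the re-based offsets 0 and 8
                      : // are encoded.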
     301             : 
     302       16446 : bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     303       16446 :   MachineBasicBlock *MBB = CI.I->getParent();
     304             :   MachineBasicBlock::iterator E = MBB->end();
     305       16446 :   MachineBasicBlock::iterator MBBI = CI.I;
     306             : 
     307       16446 :   unsigned AddrOpName[3] = {0};
     308             :   int AddrIdx[3];
     309             :   const MachineOperand *AddrReg[3];
     310             :   unsigned NumAddresses = 0;
     311             : 
     312       16446 :   switch (CI.InstClass) {
     313        3902 :   case DS_READ_WRITE:
     314        3902 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
     315        3902 :     break;
     316         584 :   case S_BUFFER_LOAD_IMM:
     317         584 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
     318         584 :     break;
     319        5464 :   case BUFFER_LOAD_OFFEN:
     320             :   case BUFFER_STORE_OFFEN:
     321        5464 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
     322        5464 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
     323        5464 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
     324        5464 :     break;
     325        6496 :   case BUFFER_LOAD_OFFSET:
     326             :   case BUFFER_STORE_OFFSET:
     327        6496 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
     328        6496 :     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
     329        6496 :     break;
     330             :   }
     331             : 
     332       26300 :   for (unsigned i = 0; i < NumAddresses; i++) {
     333       51663 :     AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
     334       34442 :     AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
     335             : 
     336             :     // We only ever merge operations with the same base address register, so don't
     337             :     // bother scanning forward if there are no other uses.
     338       33791 :     if (AddrReg[i]->isReg() &&
     339       27724 :         (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
     340       11154 :          MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
     341             :       return false;
     342             :   }
     343             : 
     344             :   ++MBBI;
     345             : 
     346             :   DenseSet<unsigned> DefsToMove;
     347        4152 :   addDefsToList(*CI.I, DefsToMove);
     348             : 
     349       33485 :   for ( ; MBBI != E; ++MBBI) {
     350      123174 :     if (MBBI->getOpcode() != CI.I->getOpcode()) {
     351             :       // This is not a matching DS instruction, but we can keep looking as
     352             :       // long as one of these conditions are met:
     353             :       // 1. It is safe to move I down past MBBI.
     354             :       // 2. It is safe to move MBBI down past the instruction that I will
     355             :       //    be merged into.
     356             : 
     357       25973 :       if (MBBI->hasUnmodeledSideEffects()) {
     358             :         // We can't re-order this instruction with respect to other memory
     359             :         // operations, so we fail both conditions mentioned above.
     360             :         return false;
     361             :       }
     362             : 
     363       25945 :       if (hasPhysRegDef(*MBBI)) {
     364             :         // We could re-order this instruction in theory, but it would require
     365             :         // tracking physreg defs and uses. This should only affect M0 in
     366             :         // practice.
     367             :         return false;
     368             :       }
     369             : 
     370       38128 :       if (MBBI->mayLoadOrStore() &&
     371       60169 :         (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
     372       47639 :          !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
     373             :         // We fail condition #1, but we may still be able to satisfy condition
     374             :         // #2.  Add this instruction to the move list and then we will check
     375             :         // if condition #2 holds once we have selected the matching instruction.
     376        1520 :         CI.InstsToMove.push_back(&*MBBI);
     377         760 :         addDefsToList(*MBBI, DefsToMove);
     378         760 :         continue;
     379             :       }
     380             : 
      381             :       // When we match I with another instruction we will be moving I down
      382             :       // to the location of the matched instruction; any uses of I will need
      383             :       // to be moved down as well.
     384       49662 :       addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
     385       24831 :       continue;
     386             :     }
     387             : 
     388             :     // Don't merge volatiles.
     389        6808 :     if (MBBI->hasOrderedMemoryRef())
     390             :       return false;
     391             : 
     392             :     // Handle a case like
     393             :     //   DS_WRITE_B32 addr, v, idx0
     394             :     //   w = DS_READ_B32 addr, idx0
     395             :     //   DS_WRITE_B32 addr, f(w), idx1
     396             :     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
     397             :     // merging of the two writes.
     398       13604 :     if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
     399          25 :       continue;
     400             : 
     401             :     bool Match = true;
     402       22429 :     for (unsigned i = 0; i < NumAddresses; i++) {
     403        7856 :       const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
     404             : 
     405       23568 :       if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
     406        2022 :         if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
     407        1011 :             AddrReg[i]->getImm() != AddrRegNext.getImm()) {
     408             :           Match = false;
     409             :           break;
     410             :         }
     411        1011 :         continue;
     412             :       }
     413             : 
     414             :       // Check same base pointer. Be careful of subregisters, which can occur with
     415             :       // vectors of pointers.
     416       13660 :       if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
     417             :           AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
     418             :         Match = false;
     419             :         break;
     420             :       }
     421             :     }
     422             : 
     423        6777 :     if (Match) {
     424       13494 :       int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
     425             :                                                  AMDGPU::OpName::offset);
     426       13494 :       CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
     427       13494 :       CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
     428        6747 :       CI.Paired = MBBI;
     429             : 
     430        6747 :       if (CI.InstClass == DS_READ_WRITE) {
     431        2757 :         CI.Offset0 &= 0xffff;
     432        2757 :         CI.Offset1 &= 0xffff;
     433             :       } else {
     434        3990 :         CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
     435        3990 :         CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
     436        3990 :         if (CI.InstClass != S_BUFFER_LOAD_IMM) {
     437        1011 :           CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
     438        1011 :           CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
     439             :         }
     440             :       }
     441             : 
      442             :       // Check that both offsets fit in the reduced range.
     443             :       // We also need to go through the list of instructions that we plan to
     444             :       // move and make sure they are all safe to move down past the merged
     445             :       // instruction.
     446        6747 :       if (offsetsCanBeCombined(CI))
     447        6064 :         if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
     448             :           return true;
     449             :     }
     450             : 
      451             :     // We've found a load/store that we couldn't merge for some reason.
      452             :     // We could potentially keep looking, but we'd need to make sure that
      453             :     // it was safe to move I and also all the instructions in InstsToMove
      454             :     // down past this instruction. Check whether we can move I across MBBI
      455             :     // and whether we can move all of I's users as well.
     456       14984 :     if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
     457       11233 :         !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA) ||
     458             :         hasPhysRegDef(*MBBI))
     459             :       break;
     460             :   }
     461             :   return false;
     462             : }
     463             : 
     464             : unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
     465         761 :   if (STM->ldsRequiresM0Init())
     466         585 :     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
     467         176 :   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
     468             : }
     469             : 
     470             : unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
     471          38 :   if (STM->ldsRequiresM0Init())
     472          22 :     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
     473             : 
     474          16 :   return (EltSize == 4) ?
     475             :     AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
     476             : }
     477             : 
     478         799 : MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
     479             :   CombineInfo &CI) {
     480         799 :   MachineBasicBlock *MBB = CI.I->getParent();
     481             : 
     482             :   // Be careful, since the addresses could be subregisters themselves in weird
     483             :   // cases, like vectors of pointers.
     484         799 :   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
     485             : 
     486         799 :   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
     487         799 :   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
     488             : 
     489         799 :   unsigned NewOffset0 = CI.Offset0;
     490         799 :   unsigned NewOffset1 = CI.Offset1;
     491        1598 :   unsigned Opc = CI.UseST64 ?
     492         799 :     read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
     493             : 
     494         799 :   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
     495         799 :   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
     496             : 
     497         799 :   if (NewOffset0 > NewOffset1) {
     498             :     // Canonicalize the merged instruction so the smaller offset comes first.
     499             :     std::swap(NewOffset0, NewOffset1);
     500             :     std::swap(SubRegIdx0, SubRegIdx1);
     501             :   }
     502             : 
     503             :   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
     504             :          (NewOffset0 != NewOffset1) &&
     505             :          "Computed offset doesn't fit");
     506             : 
     507         799 :   const MCInstrDesc &Read2Desc = TII->get(Opc);
     508             : 
     509             :   const TargetRegisterClass *SuperRC
     510         799 :     = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
     511         799 :   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
     512             : 
     513             :   DebugLoc DL = CI.I->getDebugLoc();
     514             : 
     515         799 :   unsigned BaseReg = AddrReg->getReg();
     516             :   unsigned BaseRegFlags = 0;
     517         799 :   if (CI.BaseOff) {
     518          28 :     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     519          84 :     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
     520          28 :       .addImm(CI.BaseOff);
     521             : 
     522          28 :     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     523             :     BaseRegFlags = RegState::Kill;
     524             : 
     525          56 :     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
     526          28 :       .addReg(ImmReg)
     527          28 :       .addReg(AddrReg->getReg());
     528             :   }
     529             : 
     530             :   MachineInstrBuilder Read2 =
     531        1598 :     BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
     532         799 :       .addReg(BaseReg, BaseRegFlags) // addr
     533         799 :       .addImm(NewOffset0)            // offset0
     534         799 :       .addImm(NewOffset1)            // offset1
     535             :       .addImm(0)                     // gds
     536         799 :       .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
     537             : 
     538             :   (void)Read2;
     539             : 
     540         799 :   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
     541             : 
     542             :   // Copy to the old destination registers.
     543        1598 :   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     544             :       .add(*Dest0) // Copy to same destination including flags and sub reg.
     545         799 :       .addReg(DestReg, 0, SubRegIdx0);
     546        1598 :   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     547             :                             .add(*Dest1)
     548         799 :                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
     549             : 
     550         799 :   moveInstsAfter(Copy1, CI.InstsToMove);
     551             : 
     552             :   MachineBasicBlock::iterator Next = std::next(CI.I);
     553         799 :   CI.I->eraseFromParent();
     554         799 :   CI.Paired->eraseFromParent();
     555             : 
     556             :   DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
     557        1598 :   return Next;
     558             : }
     559             : 
     560             : unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
     561        1899 :   if (STM->ldsRequiresM0Init())
     562        1307 :     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
     563         592 :   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
     564             : }
     565             : 
     566             : unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
     567          28 :   if (STM->ldsRequiresM0Init())
     568          14 :     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
     569             : 
     570          14 :   return (EltSize == 4) ?
     571             :     AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
     572             : }
     573             : 
     574        1927 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
     575             :   CombineInfo &CI) {
     576        1927 :   MachineBasicBlock *MBB = CI.I->getParent();
     577             : 
      578             :   // Be sure to use .add() with the original operands rather than .addReg();
      579             :   // we want to preserve the subregister index and any register flags set on them.
     580        1927 :   const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
     581        1927 :   const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
     582             :   const MachineOperand *Data1
     583        1927 :     = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
     584             : 
     585        1927 :   unsigned NewOffset0 = CI.Offset0;
     586        1927 :   unsigned NewOffset1 = CI.Offset1;
     587        3854 :   unsigned Opc = CI.UseST64 ?
     588        1927 :     write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
     589             : 
     590        1927 :   if (NewOffset0 > NewOffset1) {
     591             :     // Canonicalize the merged instruction so the smaller offset comes first.
     592             :     std::swap(NewOffset0, NewOffset1);
     593             :     std::swap(Data0, Data1);
     594             :   }
     595             : 
     596             :   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
     597             :          (NewOffset0 != NewOffset1) &&
     598             :          "Computed offset doesn't fit");
     599             : 
     600        1927 :   const MCInstrDesc &Write2Desc = TII->get(Opc);
     601             :   DebugLoc DL = CI.I->getDebugLoc();
     602             : 
     603        1927 :   unsigned BaseReg = AddrReg->getReg();
     604             :   unsigned BaseRegFlags = 0;
     605        1927 :   if (CI.BaseOff) {
     606          28 :     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     607          84 :     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
     608          28 :       .addImm(CI.BaseOff);
     609             : 
     610          28 :     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     611             :     BaseRegFlags = RegState::Kill;
     612             : 
     613          56 :     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
     614          28 :       .addReg(ImmReg)
     615          28 :       .addReg(AddrReg->getReg());
     616             :   }
     617             : 
     618             :   MachineInstrBuilder Write2 =
     619        3854 :     BuildMI(*MBB, CI.Paired, DL, Write2Desc)
     620        1927 :       .addReg(BaseReg, BaseRegFlags) // addr
     621             :       .add(*Data0)                   // data0
     622             :       .add(*Data1)                   // data1
     623        1927 :       .addImm(NewOffset0)            // offset0
     624        1927 :       .addImm(NewOffset1)            // offset1
     625             :       .addImm(0)                     // gds
     626        3854 :       .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
     627             : 
     628        1927 :   moveInstsAfter(Write2, CI.InstsToMove);
     629             : 
     630             :   MachineBasicBlock::iterator Next = std::next(CI.I);
     631        1927 :   CI.I->eraseFromParent();
     632        1927 :   CI.Paired->eraseFromParent();
     633             : 
     634             :   DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
     635        3854 :   return Next;
     636             : }
     637             : 
     638         184 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
     639             :   CombineInfo &CI) {
     640         184 :   MachineBasicBlock *MBB = CI.I->getParent();
     641             :   DebugLoc DL = CI.I->getDebugLoc();
     642         184 :   unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
     643             :                               AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
     644             : 
     645             :   const TargetRegisterClass *SuperRC =
     646         184 :     CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
     647         184 :   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
     648         368 :   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
     649             : 
     650         368 :   BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
     651         368 :       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
     652         184 :       .addImm(MergedOffset) // offset
     653         184 :       .addImm(CI.GLC0)      // glc
     654         184 :       .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
     655             : 
     656         184 :   unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
     657         184 :   unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
     658             : 
     659             :   // Handle descending offsets
     660         184 :   if (CI.Offset0 > CI.Offset1)
     661             :     std::swap(SubRegIdx0, SubRegIdx1);
     662             : 
     663             :   // Copy to the old destination registers.
     664         184 :   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
     665         184 :   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
     666         184 :   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
     667             : 
     668         368 :   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     669             :       .add(*Dest0) // Copy to same destination including flags and sub reg.
     670         184 :       .addReg(DestReg, 0, SubRegIdx0);
     671         368 :   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     672             :                             .add(*Dest1)
     673         184 :                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
     674             : 
     675         184 :   moveInstsAfter(Copy1, CI.InstsToMove);
     676             : 
     677             :   MachineBasicBlock::iterator Next = std::next(CI.I);
     678         184 :   CI.I->eraseFromParent();
     679         184 :   CI.Paired->eraseFromParent();
     680         368 :   return Next;
     681             : }
     682             : 
     683          76 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
     684             :   CombineInfo &CI) {
     685          76 :   MachineBasicBlock *MBB = CI.I->getParent();
     686             :   DebugLoc DL = CI.I->getDebugLoc();
     687             :   unsigned Opcode;
     688             : 
     689          76 :   if (CI.InstClass == BUFFER_LOAD_OFFEN) {
     690          32 :     Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
     691             :                        AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
     692             :   } else {
     693          44 :     Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
     694             :                        AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
     695             :   }
     696             : 
     697             :   const TargetRegisterClass *SuperRC =
     698          76 :     CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
     699          76 :   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
     700         152 :   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
     701             : 
     702         152 :   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
     703             : 
     704          76 :   if (CI.InstClass == BUFFER_LOAD_OFFEN)
     705          64 :       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
     706             : 
     707         152 :   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
     708         152 :       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
     709          76 :       .addImm(MergedOffset) // offset
     710          76 :       .addImm(CI.GLC0)      // glc
     711          76 :       .addImm(CI.SLC0)      // slc
     712             :       .addImm(0)            // tfe
     713          76 :       .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
     714             : 
     715          76 :   unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
     716          76 :   unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
     717             : 
     718             :   // Handle descending offsets
     719          76 :   if (CI.Offset0 > CI.Offset1)
     720             :     std::swap(SubRegIdx0, SubRegIdx1);
     721             : 
     722             :   // Copy to the old destination registers.
     723          76 :   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
     724          76 :   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
     725          76 :   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
     726             : 
     727         152 :   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     728             :       .add(*Dest0) // Copy to same destination including flags and sub reg.
     729          76 :       .addReg(DestReg, 0, SubRegIdx0);
     730         152 :   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
     731             :                             .add(*Dest1)
     732          76 :                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
     733             : 
     734          76 :   moveInstsAfter(Copy1, CI.InstsToMove);
     735             : 
     736             :   MachineBasicBlock::iterator Next = std::next(CI.I);
     737          76 :   CI.I->eraseFromParent();
     738          76 :   CI.Paired->eraseFromParent();
     739         152 :   return Next;
     740             : }
     741             : 
     742      347220 : unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
     743             :   const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
     744      347220 :   IsX2 = false;
     745      347220 :   IsOffen = false;
     746             : 
     747      694440 :   switch (I.getOpcode()) {
     748        3256 :   case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
     749        3256 :     IsOffen = true;
     750             :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
     751          34 :   case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
     752          34 :     IsOffen = true;
     753             :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
     754          30 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
     755          30 :     IsX2 = true;
     756          30 :     IsOffen = true;
     757             :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
     758          28 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
     759          28 :     IsX2 = true;
     760          28 :     IsOffen = true;
     761             :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
     762             :   case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
     763             :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
     764          18 :   case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
     765             :     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
     766        1659 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
     767        1659 :     IsX2 = true;
     768             :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
     769          16 :   case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
     770          16 :     IsX2 = true;
     771             :     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
     772             :   }
     773             :   return 0;
     774             : }
     775             : 
     776          40 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
     777             :   CombineInfo &CI) {
     778          40 :   MachineBasicBlock *MBB = CI.I->getParent();
     779             :   DebugLoc DL = CI.I->getDebugLoc();
     780             :   bool Unused1, Unused2;
     781          40 :   unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);
     782             : 
     783          40 :   unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
     784          40 :   unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
     785             : 
     786             :   // Handle descending offsets
     787          40 :   if (CI.Offset0 > CI.Offset1)
     788             :     std::swap(SubRegIdx0, SubRegIdx1);
     789             : 
     790             :   // Copy to the new source register.
     791             :   const TargetRegisterClass *SuperRC =
     792          40 :     CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
     793          40 :   unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
     794             : 
     795          80 :   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
     796          40 :   const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
     797             : 
     798          80 :   BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
     799             :       .add(*Src0)
     800          40 :       .addImm(SubRegIdx0)
     801             :       .add(*Src1)
     802          40 :       .addImm(SubRegIdx1);
     803             : 
     804         120 :   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
     805          40 :       .addReg(SrcReg, RegState::Kill);
     806             : 
     807          40 :   if (CI.InstClass == BUFFER_STORE_OFFEN)
     808          32 :       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
     809             : 
     810          80 :   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
     811          80 :       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
     812          80 :       .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
     813          40 :       .addImm(CI.GLC0)      // glc
     814          40 :       .addImm(CI.SLC0)      // slc
     815             :       .addImm(0)            // tfe
     816          40 :       .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
     817             : 
     818          40 :   moveInstsAfter(MIB, CI.InstsToMove);
     819             : 
     820             :   MachineBasicBlock::iterator Next = std::next(CI.I);
     821          40 :   CI.I->eraseFromParent();
     822          40 :   CI.Paired->eraseFromParent();
     823          80 :   return Next;
     824             : }
     825             : 
     826             : // Scan through looking for adjacent LDS operations with constant offsets from
     827             : // the same base register. We rely on the scheduler to do the hard work of
     828             : // clustering nearby loads, and assume these are all adjacent.
     829       18484 : bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
     830             :   bool Modified = false;
     831             : 
     832      405715 :   for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
     833             :     MachineInstr &MI = *I;
     834             : 
     835             :     // Don't combine if volatile.
     836      382340 :     if (MI.hasOrderedMemoryRef()) {
     837             :       ++I;
     838       43632 :       continue;
     839             :     }
     840             : 
     841             :     CombineInfo CI;
     842      355154 :     CI.I = I;
     843      355154 :     unsigned Opc = MI.getOpcode();
     844      356682 :     if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
     845      353908 :         Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
     846             : 
     847        1528 :       CI.InstClass = DS_READ_WRITE;
     848        1528 :       CI.EltSize =
     849        1528 :         (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
     850             : 
     851        1528 :       if (findMatchingInst(CI)) {
     852             :         Modified = true;
     853         799 :         I = mergeRead2Pair(CI);
     854             :       } else {
     855             :         ++I;
     856             :       }
     857             : 
     858        1528 :       continue;
     859      356000 :     } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
     860      703910 :                Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
     861      351955 :                Opc == AMDGPU::DS_WRITE_B64_gfx9) {
     862        2374 :       CI.InstClass = DS_READ_WRITE;
     863             :       CI.EltSize
     864        2374 :         = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
     865             : 
     866        2374 :       if (findMatchingInst(CI)) {
     867             :         Modified = true;
     868        1927 :         I = mergeWrite2Pair(CI);
     869             :       } else {
     870             :         ++I;
     871             :       }
     872             : 
     873        2374 :       continue;
     874             :     }
     875      703088 :     if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
     876      351252 :         Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
     877             :       // EltSize is in units of the offset encoding.
     878         584 :       CI.InstClass = S_BUFFER_LOAD_IMM;
     879         584 :       CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
     880         584 :       CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
     881         584 :       if (findMatchingInst(CI)) {
     882             :         Modified = true;
     883         184 :         I = mergeSBufferLoadImmPair(CI);
     884         184 :         if (!CI.IsX2)
     885         142 :           CreatedX2++;
     886             :       } else {
     887             :         ++I;
     888             :       }
     889         584 :       continue;
     890             :     }
     891      704824 :     if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
     892      350668 :         Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
     893      697072 :         Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
     894      348536 :         Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
     895        3488 :       if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
     896             :           Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
     897        2132 :         CI.InstClass = BUFFER_LOAD_OFFEN;
     898             :       else
     899        1356 :         CI.InstClass = BUFFER_LOAD_OFFSET;
     900             : 
     901        3488 :       CI.EltSize = 4;
     902        3488 :       CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
     903        3488 :                 Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
     904        3488 :       if (findMatchingInst(CI)) {
     905             :         Modified = true;
     906          76 :         I = mergeBufferLoadPair(CI);
     907          76 :         if (!CI.IsX2)
     908          41 :           CreatedX2++;
     909             :       } else {
     910             :         ++I;
     911             :       }
     912        3488 :       continue;
     913             :     }
     914             : 
     915             :     bool StoreIsX2, IsOffen;
     916      355652 :     if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
     917        8472 :       CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
     918        8472 :       CI.EltSize = 4;
     919        8472 :       CI.IsX2 = StoreIsX2;
     920        8472 :       if (findMatchingInst(CI)) {
     921             :         Modified = true;
     922          40 :         I = mergeBufferStorePair(CI);
     923          40 :         if (!CI.IsX2)
     924          30 :           CreatedX2++;
     925             :       } else {
     926             :         ++I;
     927             :       }
     928        8472 :       continue;
     929             :     }
     930             : 
     931             :     ++I;
     932             :   }
     933             : 
     934       18484 :   return Modified;
     935             : }
     936             : 
     937       16395 : bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
     938       16395 :   if (skipFunction(MF.getFunction()))
     939             :     return false;
     940             : 
     941       16393 :   STM = &MF.getSubtarget<SISubtarget>();
     942       16393 :   if (!STM->loadStoreOptEnabled())
     943             :     return false;
     944             : 
     945       16392 :   TII = STM->getInstrInfo();
     946       16392 :   TRI = &TII->getRegisterInfo();
     947             : 
     948       16392 :   MRI = &MF.getRegInfo();
     949       32784 :   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
     950             : 
     951             :   assert(MRI->isSSA() && "Must be run on SSA");
     952             : 
     953             :   DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
     954             : 
     955             :   bool Modified = false;
     956             : 
     957       34831 :   for (MachineBasicBlock &MBB : MF) {
     958       18439 :     CreatedX2 = 0;
     959       18439 :     Modified |= optimizeBlock(MBB);
     960             : 
     961             :     // Run again to convert x2 to x4.
     962       18439 :     if (CreatedX2 >= 1)
     963          45 :       Modified |= optimizeBlock(MBB);
     964             :   }
     965             : 
     966             :   return Modified;
     967             : }

Generated by: LCOV version 1.13