Bug Summary

File: llvm/include/llvm/Support/MathExtras.h
Warning: line 252, column 30
The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'
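
For context, here is a minimal standalone sketch of why a shift count of 33 is undefined at the flagged line. The maskTrailingOnes/maskLeadingOnes bodies below are a paraphrase of what llvm/include/llvm/Support/MathExtras.h is assumed to contain around line 252, not a verbatim copy; note that the reproduction command below passes -D NDEBUG, so the range assert inside maskTrailingOnes is compiled out and the out-of-range shift is reachable.

#include <cassert>
#include <climits>
#include <cstdint>

// Paraphrased (assumed) shape of the helpers in llvm/include/llvm/Support/MathExtras.h.
template <typename T> T maskTrailingOnes(unsigned N) {
  const unsigned Bits = CHAR_BIT * sizeof(T);
  assert(N <= Bits && "Invalid bit index");  // compiled out under -D NDEBUG
  return N == 0 ? 0 : (T(-1) >> (Bits - N)); // the right shift flagged above
}

template <typename T> T maskLeadingOnes(unsigned N) {
  return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
}

int main() {
  // With N == 33 (one more than the 32-bit width), maskLeadingOnes<uint32_t>
  // forwards 32u - 33u == 0xffffffff to maskTrailingOnes; there Bits - N wraps
  // back to 33, so the shift count is 33 >= 32 and the shift is undefined.
  // The annotated source below (mostAlignedValueInRange) is where the analyzer
  // believes the argument 33 can be produced.
  return static_cast<int>(maskLeadingOnes<uint32_t>(33) != 0);
}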

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name SILoadStoreOptimizer.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AMDGPU -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-09-04-040900-46481-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass tries to fuse DS instructions with close by immediate offsets.
10// This will fuse operations such as
11// ds_read_b32 v0, v2 offset:16
12// ds_read_b32 v1, v2 offset:32
13// ==>
14// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17// s_buffer_load_dword s4, s[0:3], 4
18// s_buffer_load_dword s5, s[0:3], 8
19// ==>
20// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
22// This pass also tries to promote constant offset to the immediate by
23// adjusting the base. It tries to use a base from the nearby instructions that
24// allows it to have a 13-bit constant offset and then promotes the 13-bit offset
25// to the immediate.
26// E.g.
27// s_movk_i32 s0, 0x1800
28// v_add_co_u32_e32 v0, vcc, s0, v2
29// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31// s_movk_i32 s0, 0x1000
32// v_add_co_u32_e32 v5, vcc, s0, v2
33// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34// global_load_dwordx2 v[5:6], v[5:6], off
35// global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37// s_movk_i32 s0, 0x1000
38// v_add_co_u32_e32 v5, vcc, s0, v2
39// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40// global_load_dwordx2 v[5:6], v[5:6], off
41// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42//
43// Future improvements:
44//
45// - This is currently missing stores of constants because loading
46// the constant into the data register is placed between the stores, although
47// this is arguably a scheduling problem.
48//
49// - Live interval recomputing seems inefficient. This currently only matches
50// one pair, and recomputes live intervals and moves on to the next pair. It
51// would be better to compute a list of all merges that need to occur.
52//
53// - With a list of instructions to process, we can also merge more. If a
54// cluster of loads has offsets that are too large to fit in the 8-bit
55// offsets, but are close enough to fit in the 8 bits, we can add to the base
56// pointer and use the new reduced offsets.
57//
58//===----------------------------------------------------------------------===//
59
60#include "AMDGPU.h"
61#include "GCNSubtarget.h"
62#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63#include "llvm/Analysis/AliasAnalysis.h"
64#include "llvm/CodeGen/MachineFunctionPass.h"
65#include "llvm/InitializePasses.h"
66
67using namespace llvm;
68
69#define DEBUG_TYPE "si-load-store-opt"
70
71namespace {
72enum InstClassEnum {
73 UNKNOWN,
74 DS_READ,
75 DS_WRITE,
76 S_BUFFER_LOAD_IMM,
77 BUFFER_LOAD,
78 BUFFER_STORE,
79 MIMG,
80 TBUFFER_LOAD,
81 TBUFFER_STORE,
82};
83
84struct AddressRegs {
85 unsigned char NumVAddrs = 0;
86 bool SBase = false;
87 bool SRsrc = false;
88 bool SOffset = false;
89 bool VAddr = false;
90 bool Addr = false;
91 bool SSamp = false;
92};
93
94// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
95const unsigned MaxAddressRegs = 12 + 1 + 1;
96
97class SILoadStoreOptimizer : public MachineFunctionPass {
98 struct CombineInfo {
99 MachineBasicBlock::iterator I;
100 unsigned EltSize;
101 unsigned Offset;
102 unsigned Width;
103 unsigned Format;
104 unsigned BaseOff;
105 unsigned DMask;
106 InstClassEnum InstClass;
107 unsigned CPol = 0;
108 bool UseST64;
109 int AddrIdx[MaxAddressRegs];
110 const MachineOperand *AddrReg[MaxAddressRegs];
111 unsigned NumAddresses;
112 unsigned Order;
113
114 bool hasSameBaseAddress(const MachineInstr &MI) {
115 for (unsigned i = 0; i < NumAddresses; i++) {
116 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
117
118 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
119 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
120 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
121 return false;
122 }
123 continue;
124 }
125
126 // Check same base pointer. Be careful of subregisters, which can occur
127 // with vectors of pointers.
128 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
129 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
130 return false;
131 }
132 }
133 return true;
134 }
135
136 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
137 for (unsigned i = 0; i < NumAddresses; ++i) {
138 const MachineOperand *AddrOp = AddrReg[i];
139 // Immediates are always OK.
140 if (AddrOp->isImm())
141 continue;
142
143 // Don't try to merge addresses that aren't either immediates or registers.
144 // TODO: Should be possible to merge FrameIndexes and maybe some other
145 // non-register
146 if (!AddrOp->isReg())
147 return false;
148
149 // TODO: We should be able to merge physical reg addresses.
150 if (AddrOp->getReg().isPhysical())
151 return false;
152
153 // If an address has only one use then there will be no other
154 // instructions with the same address, so we can't merge this one.
155 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
156 return false;
157 }
158 return true;
159 }
160
161 void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
162 const GCNSubtarget &STM);
163 };
164
165 struct BaseRegisters {
166 Register LoReg;
167 Register HiReg;
168
169 unsigned LoSubReg = 0;
170 unsigned HiSubReg = 0;
171 };
172
173 struct MemAddress {
174 BaseRegisters Base;
175 int64_t Offset = 0;
176 };
177
178 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
179
180private:
181 const GCNSubtarget *STM = nullptr;
182 const SIInstrInfo *TII = nullptr;
183 const SIRegisterInfo *TRI = nullptr;
184 MachineRegisterInfo *MRI = nullptr;
185 AliasAnalysis *AA = nullptr;
186 bool OptimizeAgain;
187
188 static bool dmasksCanBeCombined(const CombineInfo &CI,
189 const SIInstrInfo &TII,
190 const CombineInfo &Paired);
191 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
192 CombineInfo &Paired, bool Modify = false);
193 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
194 const CombineInfo &Paired);
195 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
196 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
197 const CombineInfo &Paired);
198 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
199 const CombineInfo &Paired);
200 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
201
202 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
203 SmallVectorImpl<MachineInstr *> &InstsToMove);
204
205 unsigned read2Opcode(unsigned EltSize) const;
206 unsigned read2ST64Opcode(unsigned EltSize) const;
207 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
208 CombineInfo &Paired,
209 const SmallVectorImpl<MachineInstr *> &InstsToMove);
210
211 unsigned write2Opcode(unsigned EltSize) const;
212 unsigned write2ST64Opcode(unsigned EltSize) const;
213 MachineBasicBlock::iterator
214 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
215 const SmallVectorImpl<MachineInstr *> &InstsToMove);
216 MachineBasicBlock::iterator
217 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
218 const SmallVectorImpl<MachineInstr *> &InstsToMove);
219 MachineBasicBlock::iterator
220 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
221 const SmallVectorImpl<MachineInstr *> &InstsToMove);
222 MachineBasicBlock::iterator
223 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
224 const SmallVectorImpl<MachineInstr *> &InstsToMove);
225 MachineBasicBlock::iterator
226 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
227 const SmallVectorImpl<MachineInstr *> &InstsToMove);
228 MachineBasicBlock::iterator
229 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
230 const SmallVectorImpl<MachineInstr *> &InstsToMove);
231 MachineBasicBlock::iterator
232 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
233 const SmallVectorImpl<MachineInstr *> &InstsToMove);
234
235 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
236 int32_t NewOffset) const;
237 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
238 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
239 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
240 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
241 /// Promotes constant offset to the immediate by adjusting the base. It
242 /// tries to use a base from the nearby instructions that allows it to have
243 /// a 13-bit constant offset which gets promoted to the immediate.
244 bool promoteConstantOffsetToImm(MachineInstr &CI,
245 MemInfoMap &Visited,
246 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
247 void addInstToMergeableList(const CombineInfo &CI,
248 std::list<std::list<CombineInfo> > &MergeableInsts) const;
249
250 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
251 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
252 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
253 std::list<std::list<CombineInfo>> &MergeableInsts) const;
254
255public:
256 static char ID;
257
258 SILoadStoreOptimizer() : MachineFunctionPass(ID) {
259 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
260 }
261
262 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
263 bool &OptimizeListAgain);
264 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
265
266 bool runOnMachineFunction(MachineFunction &MF) override;
267
268 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
269
270 void getAnalysisUsage(AnalysisUsage &AU) const override {
271 AU.setPreservesCFG();
272 AU.addRequired<AAResultsWrapperPass>();
273
274 MachineFunctionPass::getAnalysisUsage(AU);
275 }
276
277 MachineFunctionProperties getRequiredProperties() const override {
278 return MachineFunctionProperties()
279 .set(MachineFunctionProperties::Property::IsSSA);
280 }
281};
282
283static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
284 const unsigned Opc = MI.getOpcode();
285
286 if (TII.isMUBUF(Opc)) {
287 // FIXME: Handle d16 correctly
288 return AMDGPU::getMUBUFElements(Opc);
289 }
290 if (TII.isMIMG(MI)) {
291 uint64_t DMaskImm =
292 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
293 return countPopulation(DMaskImm);
294 }
295 if (TII.isMTBUF(Opc)) {
296 return AMDGPU::getMTBUFElements(Opc);
297 }
298
299 switch (Opc) {
300 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
301 return 1;
302 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
303 return 2;
304 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
305 return 4;
306 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
307 return 8;
308 case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
309 case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
310 case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
311 case AMDGPU::DS_WRITE_B32_gfx9:
312 return 1;
313 case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH;
314 case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
315 case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH;
316 case AMDGPU::DS_WRITE_B64_gfx9:
317 return 2;
318 default:
319 return 0;
320 }
321}
322
323/// Maps instruction opcode to enum InstClassEnum.
324static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
325 switch (Opc) {
326 default:
327 if (TII.isMUBUF(Opc)) {
328 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
329 default:
330 return UNKNOWN;
331 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
332 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
333 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
334 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
335 return BUFFER_LOAD;
336 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
337 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
338 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
339 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
340 return BUFFER_STORE;
341 }
342 }
343 if (TII.isMIMG(Opc)) {
344 // Ignore instructions encoded without vaddr.
345 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
346 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
347 return UNKNOWN;
348 // Ignore BVH instructions
349 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
350 return UNKNOWN;
351 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
352 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
353 TII.isGather4(Opc))
354 return UNKNOWN;
355 return MIMG;
356 }
357 if (TII.isMTBUF(Opc)) {
358 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
359 default:
360 return UNKNOWN;
361 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
362 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
363 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
364 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
365 return TBUFFER_LOAD;
366 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
367 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
368 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
369 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
370 return TBUFFER_STORE;
371 }
372 }
373 return UNKNOWN;
374 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
375 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
376 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
377 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
378 return S_BUFFER_LOAD_IMM;
379 case AMDGPU::DS_READ_B32:
380 case AMDGPU::DS_READ_B32_gfx9:
381 case AMDGPU::DS_READ_B64:
382 case AMDGPU::DS_READ_B64_gfx9:
383 return DS_READ;
384 case AMDGPU::DS_WRITE_B32:
385 case AMDGPU::DS_WRITE_B32_gfx9:
386 case AMDGPU::DS_WRITE_B64:
387 case AMDGPU::DS_WRITE_B64_gfx9:
388 return DS_WRITE;
389 }
390}
391
392/// Determines instruction subclass from opcode. Only instructions
393/// of the same subclass can be merged together.
394static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
395 switch (Opc) {
396 default:
397 if (TII.isMUBUF(Opc))
398 return AMDGPU::getMUBUFBaseOpcode(Opc);
399 if (TII.isMIMG(Opc)) {
400 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
401 assert(Info);
402 return Info->BaseOpcode;
403 }
404 if (TII.isMTBUF(Opc))
405 return AMDGPU::getMTBUFBaseOpcode(Opc);
406 return -1;
407 case AMDGPU::DS_READ_B32:
408 case AMDGPU::DS_READ_B32_gfx9:
409 case AMDGPU::DS_READ_B64:
410 case AMDGPU::DS_READ_B64_gfx9:
411 case AMDGPU::DS_WRITE_B32:
412 case AMDGPU::DS_WRITE_B32_gfx9:
413 case AMDGPU::DS_WRITE_B64:
414 case AMDGPU::DS_WRITE_B64_gfx9:
415 return Opc;
416 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
417 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
418 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
419 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
420 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
421 }
422}
423
424static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
425 AddressRegs Result;
426
427 if (TII.isMUBUF(Opc)) {
428 if (AMDGPU::getMUBUFHasVAddr(Opc))
429 Result.VAddr = true;
430 if (AMDGPU::getMUBUFHasSrsrc(Opc))
431 Result.SRsrc = true;
432 if (AMDGPU::getMUBUFHasSoffset(Opc))
433 Result.SOffset = true;
434
435 return Result;
436 }
437
438 if (TII.isMIMG(Opc)) {
439 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
440 if (VAddr0Idx >= 0) {
441 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
442 Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
443 } else {
444 Result.VAddr = true;
445 }
446 Result.SRsrc = true;
447 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
448 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
449 Result.SSamp = true;
450
451 return Result;
452 }
453 if (TII.isMTBUF(Opc)) {
454 if (AMDGPU::getMTBUFHasVAddr(Opc))
455 Result.VAddr = true;
456 if (AMDGPU::getMTBUFHasSrsrc(Opc))
457 Result.SRsrc = true;
458 if (AMDGPU::getMTBUFHasSoffset(Opc))
459 Result.SOffset = true;
460
461 return Result;
462 }
463
464 switch (Opc) {
465 default:
466 return Result;
467 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
468 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
469 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
470 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
471 Result.SBase = true;
472 return Result;
473 case AMDGPU::DS_READ_B32:
474 case AMDGPU::DS_READ_B64:
475 case AMDGPU::DS_READ_B32_gfx9:
476 case AMDGPU::DS_READ_B64_gfx9:
477 case AMDGPU::DS_WRITE_B32:
478 case AMDGPU::DS_WRITE_B64:
479 case AMDGPU::DS_WRITE_B32_gfx9:
480 case AMDGPU::DS_WRITE_B64_gfx9:
481 Result.Addr = true;
482 return Result;
483 }
484}
485
486void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
487 const SIInstrInfo &TII,
488 const GCNSubtarget &STM) {
489 I = MI;
490 unsigned Opc = MI->getOpcode();
491 InstClass = getInstClass(Opc, TII);
492
493 if (InstClass == UNKNOWN)
494 return;
495
496 switch (InstClass) {
497 case DS_READ:
498 EltSize =
499 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
500 : 4;
501 break;
502 case DS_WRITE:
503 EltSize =
504 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
505 : 4;
506 break;
507 case S_BUFFER_LOAD_IMM:
508 EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4);
509 break;
510 default:
511 EltSize = 4;
512 break;
513 }
514
515 if (InstClass == MIMG) {
516 DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
517 // Offset is not considered for MIMG instructions.
518 Offset = 0;
519 } else {
520 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
521 Offset = I->getOperand(OffsetIdx).getImm();
522 }
523
524 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
525 Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
526
527 Width = getOpcodeWidth(*I, TII);
528
529 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
530 Offset &= 0xffff;
531 } else if (InstClass != MIMG) {
532 CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
533 }
534
535 AddressRegs Regs = getRegs(Opc, TII);
536
537 NumAddresses = 0;
538 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
539 AddrIdx[NumAddresses++] =
540 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
541 if (Regs.Addr)
542 AddrIdx[NumAddresses++] =
543 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
544 if (Regs.SBase)
545 AddrIdx[NumAddresses++] =
546 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
547 if (Regs.SRsrc)
548 AddrIdx[NumAddresses++] =
549 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
550 if (Regs.SOffset)
551 AddrIdx[NumAddresses++] =
552 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
553 if (Regs.VAddr)
554 AddrIdx[NumAddresses++] =
555 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
556 if (Regs.SSamp)
557 AddrIdx[NumAddresses++] =
558 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
559 assert(NumAddresses <= MaxAddressRegs);
560
561 for (unsigned J = 0; J < NumAddresses; J++)
562 AddrReg[J] = &I->getOperand(AddrIdx[J]);
563}
564
565} // end anonymous namespace.
566
567INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
568 "SI Load Store Optimizer", false, false)
569INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
570INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
571 false, false)
572
573char SILoadStoreOptimizer::ID = 0;
574
575char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
576
577FunctionPass *llvm::createSILoadStoreOptimizerPass() {
578 return new SILoadStoreOptimizer();
579}
580
581static void moveInstsAfter(MachineBasicBlock::iterator I,
582 ArrayRef<MachineInstr *> InstsToMove) {
583 MachineBasicBlock *MBB = I->getParent();
584 ++I;
585 for (MachineInstr *MI : InstsToMove) {
586 MI->removeFromParent();
587 MBB->insert(I, MI);
588 }
589}
590
591static void addDefsUsesToList(const MachineInstr &MI,
592 DenseSet<Register> &RegDefs,
593 DenseSet<Register> &PhysRegUses) {
594 for (const MachineOperand &Op : MI.operands()) {
595 if (Op.isReg()) {
596 if (Op.isDef())
597 RegDefs.insert(Op.getReg());
598 else if (Op.readsReg() && Op.getReg().isPhysical())
599 PhysRegUses.insert(Op.getReg());
600 }
601 }
602}
603
604static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
605 MachineBasicBlock::iterator B,
606 AliasAnalysis *AA) {
607 // RAW or WAR - cannot reorder
608 // WAW - cannot reorder
609 // RAR - safe to reorder
610 return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
611}
612
613// Add MI and its defs to the lists if MI reads one of the defs that are
614// already in the list. Returns true in that case.
615static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
616 DenseSet<Register> &PhysRegUses,
617 SmallVectorImpl<MachineInstr *> &Insts) {
618 for (MachineOperand &Use : MI.operands()) {
619 // If one of the defs is read, then there is a use of Def between I and the
620 // instruction that I will potentially be merged with. We will need to move
621 // this instruction after the merged instructions.
622 //
623 // Similarly, if there is a def which is read by an instruction that is to
624 // be moved for merging, then we need to move the def-instruction as well.
625 // This can only happen for physical registers such as M0; virtual
626 // registers are in SSA form.
627 if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
628 (Use.isDef() && RegDefs.count(Use.getReg())) ||
629 (Use.isDef() && Use.getReg().isPhysical() &&
630 PhysRegUses.count(Use.getReg())))) {
631 Insts.push_back(&MI);
632 addDefsUsesToList(MI, RegDefs, PhysRegUses);
633 return true;
634 }
635 }
636
637 return false;
638}
639
640static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
641 ArrayRef<MachineInstr *> InstsToMove,
642 AliasAnalysis *AA) {
643 assert(MemOp.mayLoadOrStore());
644
645 for (MachineInstr *InstToMove : InstsToMove) {
646 if (!InstToMove->mayLoadOrStore())
647 continue;
648 if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
649 return false;
650 }
651 return true;
652}
653
654// This function assumes that \p A and \p B are identical except for
655// size and offset, and that they reference adjacent memory.
656static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
657 const MachineMemOperand *A,
658 const MachineMemOperand *B) {
659 unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
660 unsigned Size = A->getSize() + B->getSize();
661 // This function adds the offset parameter to the existing offset for A,
662 // so we pass 0 here as the offset and then manually set it to the correct
663 // value after the call.
664 MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
665 MMO->setOffset(MinOffset);
666 return MMO;
667}
668
669bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
670 const SIInstrInfo &TII,
671 const CombineInfo &Paired) {
672 assert(CI.InstClass == MIMG);
673
674 // Ignore instructions with tfe/lwe set.
675 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
676 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
677
678 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
679 return false;
680
681 // Check other optional immediate operands for equality.
682 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
683 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
684 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
685
686 for (auto op : OperandsToMatch) {
687 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
688 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
689 return false;
690 if (Idx != -1 &&
691 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
692 return false;
693 }
694
695 // Check DMask for overlaps.
696 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
697 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
698
699 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
700 if ((1u << AllowedBitsForMin) <= MinMask)
701 return false;
702
703 return true;
704}
705
706static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
707 unsigned ComponentCount,
708 const GCNSubtarget &STI) {
709 if (ComponentCount > 4)
710 return 0;
711
712 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
713 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
714 if (!OldFormatInfo)
715 return 0;
716
717 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
718 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
719 ComponentCount,
720 OldFormatInfo->NumFormat, STI);
721
722 if (!NewFormatInfo)
723 return 0;
724
725 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
726 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
727
728 return NewFormatInfo->Format;
729}
730
731// Return the value in the inclusive range [Lo,Hi] that is aligned to the
732// highest power of two. Note that the result is well defined for all inputs
733// including corner cases like:
734// - if Lo == Hi, return that value
735// - if Lo == 0, return 0 (even though the "- 1" below underflows)
736// - if Lo > Hi, return 0 (as if the range wrapped around)
737static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
738 return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
1. Calling 'maskLeadingOnes<unsigned int>'
739}
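
A sketch of how the call above can reach maskLeadingOnes<unsigned int> with an argument of 33, under the analyzer's assumption (hypothetical values, consistent with the wrap-around corner case described in the comment above):

// If Lo > Hi with Hi == Lo - 1 (for example Lo = 1, Hi = 0), then
// (Lo - 1) ^ Hi == 0, countLeadingZeros(0u) == 32 for a 32-bit value, and the
// argument passed to maskLeadingOnes<uint32_t> is 32 + 1 == 33, one more than
// the bit width, which is what ultimately triggers the 33-bit shift reported
// in MathExtras.h (see the sketch after the Bug Summary).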
740
741bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
742 const GCNSubtarget &STI,
743 CombineInfo &Paired,
744 bool Modify) {
745 assert(CI.InstClass != MIMG);
746
747 // XXX - Would the same offset be OK? Is there any reason this would happen or
748 // be useful?
749 if (CI.Offset == Paired.Offset)
750 return false;
751
752 // This won't be valid if the offset isn't aligned.
753 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
754 return false;
755
756 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
757
758 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
759 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
760 if (!Info0)
761 return false;
762 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
763 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
764 if (!Info1)
765 return false;
766
767 if (Info0->BitsPerComp != Info1->BitsPerComp ||
768 Info0->NumFormat != Info1->NumFormat)
769 return false;
770
771 // TODO: Should be possible to support more formats, but if format loads
772 // are not dword-aligned, the merged load might not be valid.
773 if (Info0->BitsPerComp != 32)
774 return false;
775
776 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
777 return false;
778 }
779
780 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
781 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
782 CI.UseST64 = false;
783 CI.BaseOff = 0;
784
785 // Handle all non-DS instructions.
786 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
787 return (EltOffset0 + CI.Width == EltOffset1 ||
788 EltOffset1 + Paired.Width == EltOffset0) &&
789 CI.CPol == Paired.CPol &&
790 (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol);
791 }
792
793 // If the offset in elements doesn't fit in 8-bits, we might be able to use
794 // the stride 64 versions.
795 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
796 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
797 if (Modify) {
798 CI.Offset = EltOffset0 / 64;
799 Paired.Offset = EltOffset1 / 64;
800 CI.UseST64 = true;
801 }
802 return true;
803 }
804
805 // Check if the new offsets fit in the reduced 8-bit range.
806 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
807 if (Modify) {
808 CI.Offset = EltOffset0;
809 Paired.Offset = EltOffset1;
810 }
811 return true;
812 }
813
814 // Try to shift base address to decrease offsets.
815 uint32_t Min = std::min(EltOffset0, EltOffset1);
816 uint32_t Max = std::max(EltOffset0, EltOffset1);
817
818 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
819 if (((Max - Min) & ~Mask) == 0) {
820 if (Modify) {
821 // From the range of values we could use for BaseOff, choose the one that
822 // is aligned to the highest power of two, to maximise the chance that
823 // the same offset can be reused for other load/store pairs.
824 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
825 // Copy the low bits of the offsets, so that when we adjust them by
826 // subtracting BaseOff they will be multiples of 64.
827 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
828 CI.BaseOff = BaseOff * CI.EltSize;
829 CI.Offset = (EltOffset0 - BaseOff) / 64;
830 Paired.Offset = (EltOffset1 - BaseOff) / 64;
831 CI.UseST64 = true;
832 }
833 return true;
834 }
835
836 if (isUInt<8>(Max - Min)) {
837 if (Modify) {
838 // From the range of values we could use for BaseOff, choose the one that
839 // is aligned to the highest power of two, to maximise the chance that
840 // the same offset can be reused for other load/store pairs.
841 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
842 CI.BaseOff = BaseOff * CI.EltSize;
843 CI.Offset = EltOffset0 - BaseOff;
844 Paired.Offset = EltOffset1 - BaseOff;
845 }
846 return true;
847 }
848
849 return false;
850}
851
852bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
853 const CombineInfo &CI,
854 const CombineInfo &Paired) {
855 const unsigned Width = (CI.Width + Paired.Width);
856 switch (CI.InstClass) {
857 default:
858 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
859 case S_BUFFER_LOAD_IMM:
860 switch (Width) {
861 default:
862 return false;
863 case 2:
864 case 4:
865 case 8:
866 return true;
867 }
868 }
869}
870
871const TargetRegisterClass *
872SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
873 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
874 return TRI->getRegClassForReg(*MRI, Dst->getReg());
875 }
876 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
877 return TRI->getRegClassForReg(*MRI, Src->getReg());
878 }
879 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
880 return TRI->getRegClassForReg(*MRI, Src->getReg());
881 }
882 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
883 return TRI->getRegClassForReg(*MRI, Dst->getReg());
884 }
885 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
886 return TRI->getRegClassForReg(*MRI, Src->getReg());
887 }
888 return nullptr;
889}
890
891/// This function assumes that CI comes before Paired in a basic block.
892bool SILoadStoreOptimizer::checkAndPrepareMerge(
893 CombineInfo &CI, CombineInfo &Paired,
894 SmallVectorImpl<MachineInstr *> &InstsToMove) {
895
896 // Check both offsets (or masks for MIMG) can be combined and fit in the
897 // reduced range.
898 if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
899 return false;
900
901 if (CI.InstClass != MIMG &&
902 (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
903 return false;
904
905 const unsigned Opc = CI.I->getOpcode();
906 const InstClassEnum InstClass = getInstClass(Opc, *TII);
907
908 if (InstClass == UNKNOWN) {
909 return false;
910 }
911 const unsigned InstSubclass = getInstSubclass(Opc, *TII);
912
913 // Do not merge VMEM buffer instructions with "swizzled" bit set.
914 int Swizzled =
915 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
916 if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
917 return false;
918
919 DenseSet<Register> RegDefsToMove;
920 DenseSet<Register> PhysRegUsesToMove;
921 addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
922
923 const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
924 bool IsAGPR = TRI->hasAGPRs(DataRC);
925
926 MachineBasicBlock::iterator E = std::next(Paired.I);
927 MachineBasicBlock::iterator MBBI = std::next(CI.I);
928 MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
929 for (; MBBI != E; ++MBBI) {
930
931 if (MBBI == MBBE) {
932 // CombineInfo::Order is a hint on the instruction ordering within the
933 // basic block. This hint suggests that CI precedes Paired, which is
934 // true most of the time. However, moveInstsAfter() processing a
935 // previous list may have changed this order in a situation when it
936 // moves an instruction which exists in some other merge list.
937 // In this case it must be dependent.
938 return false;
939 }
940
941 if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
942 (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
943 // This is not a matching instruction, but we can keep looking as
944 // long as one of these conditions is met:
945 // 1. It is safe to move I down past MBBI.
946 // 2. It is safe to move MBBI down past the instruction that I will
947 // be merged into.
948
949 if (MBBI->hasUnmodeledSideEffects()) {
950 // We can't re-order this instruction with respect to other memory
951 // operations, so we fail both conditions mentioned above.
952 return false;
953 }
954
955 if (MBBI->mayLoadOrStore() &&
956 (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
957 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
958 // We fail condition #1, but we may still be able to satisfy condition
959 // #2. Add this instruction to the move list and then we will check
960 // if condition #2 holds once we have selected the matching instruction.
961 InstsToMove.push_back(&*MBBI);
962 addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
963 continue;
964 }
965
966 // When we match I with another DS instruction we will be moving I down
967 // to the location of the matched instruction; any uses of I will need to
968 // be moved down as well.
969 addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
970 InstsToMove);
971 continue;
972 }
973
974 // Don't merge volatiles.
975 if (MBBI->hasOrderedMemoryRef())
976 return false;
977
978 int Swizzled =
979 AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
980 if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
981 return false;
982
983 // Handle a case like
984 // DS_WRITE_B32 addr, v, idx0
985 // w = DS_READ_B32 addr, idx0
986 // DS_WRITE_B32 addr, f(w), idx1
987 // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
988 // merging of the two writes.
989 if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
990 InstsToMove))
991 continue;
992
993 if (&*MBBI == &*Paired.I) {
994 if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
995 return false;
996 // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
997 // operands. However we are reporting that ds_write2 shall have
998 // only VGPR data so that machine copy propagation does not
999 // create an illegal instruction with VGPR and AGPR sources.
1000 // Consequently, if we create such an instruction the verifier
1001 // will complain.
1002 if (IsAGPR && CI.InstClass == DS_WRITE)
1003 return false;
1004
1005 // We need to go through the list of instructions that we plan to
1006 // move and make sure they are all safe to move down past the merged
1007 // instruction.
1008 if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {
1009
1010 // Call offsetsCanBeCombined with modify = true so that the offsets are
1011 // correct for the new instruction. This should return true, because
1012 // this function should only be called on CombineInfo objects that
1013 // have already been confirmed to be mergeable.
1014 if (CI.InstClass != MIMG)
1015 offsetsCanBeCombined(CI, *STM, Paired, true);
1016 return true;
1017 }
1018 return false;
1019 }
1020
1021 // We've found a load/store that we couldn't merge for some reason.
1022 // We could potentially keep looking, but we'd need to make sure that
1023 // it was safe to move I and also all the instructions in InstsToMove
1024 // down past this instruction.
1025 // Check if we can move I across MBBI and if we can move all I's users.
1026 if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
1027 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
1028 break;
1029 }
1030 return false;
1031}
1032
1033unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1034 if (STM->ldsRequiresM0Init())
1035 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1036 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1037}
1038
1039unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1040 if (STM->ldsRequiresM0Init())
1041 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1042
1043 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1044 : AMDGPU::DS_READ2ST64_B64_gfx9;
1045}
1046
1047MachineBasicBlock::iterator
1048SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1049 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1050 MachineBasicBlock *MBB = CI.I->getParent();
1051
1052 // Be careful, since the addresses could be subregisters themselves in weird
1053 // cases, like vectors of pointers.
1054 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1055
1056 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1057 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1058
1059 unsigned NewOffset0 = CI.Offset;
1060 unsigned NewOffset1 = Paired.Offset;
1061 unsigned Opc =
1062 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1063
1064 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1065 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1066
1067 if (NewOffset0 > NewOffset1) {
1068 // Canonicalize the merged instruction so the smaller offset comes first.
1069 std::swap(NewOffset0, NewOffset1);
1070 std::swap(SubRegIdx0, SubRegIdx1);
1071 }
1072
1073 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1074 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1075
1076 const MCInstrDesc &Read2Desc = TII->get(Opc);
1077
1078 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1079 Register DestReg = MRI->createVirtualRegister(SuperRC);
1080
1081 DebugLoc DL = CI.I->getDebugLoc();
1082
1083 Register BaseReg = AddrReg->getReg();
1084 unsigned BaseSubReg = AddrReg->getSubReg();
1085 unsigned BaseRegFlags = 0;
1086 if (CI.BaseOff) {
1087 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1088 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1089 .addImm(CI.BaseOff);
1090
1091 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1092 BaseRegFlags = RegState::Kill;
1093
1094 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1095 .addReg(ImmReg)
1096 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1097 .addImm(0); // clamp bit
1098 BaseSubReg = 0;
1099 }
1100
1101 MachineInstrBuilder Read2 =
1102 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg)
1103 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1104 .addImm(NewOffset0) // offset0
1105 .addImm(NewOffset1) // offset1
1106 .addImm(0) // gds
1107 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1108
1109 (void)Read2;
1110
1111 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1112
1113 // Copy to the old destination registers.
1114 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1115 .add(*Dest0) // Copy to same destination including flags and sub reg.
1116 .addReg(DestReg, 0, SubRegIdx0);
1117 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1118 .add(*Dest1)
1119 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1120
1121 moveInstsAfter(Copy1, InstsToMove);
1122
1123 CI.I->eraseFromParent();
1124 Paired.I->eraseFromParent();
1125
1126 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1127 return Read2;
1128}
1129
1130unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1131 if (STM->ldsRequiresM0Init())
1132 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1133 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1134 : AMDGPU::DS_WRITE2_B64_gfx9;
1135}
1136
1137unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1138 if (STM->ldsRequiresM0Init())
1139 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1140 : AMDGPU::DS_WRITE2ST64_B64;
1141
1142 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1143 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1144}
1145
1146MachineBasicBlock::iterator
1147SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
1148 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1149 MachineBasicBlock *MBB = CI.I->getParent();
1150
1151 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1152 // sure we preserve the subregister index and any register flags set on them.
1153 const MachineOperand *AddrReg =
1154 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1155 const MachineOperand *Data0 =
1156 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1157 const MachineOperand *Data1 =
1158 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1159
1160 unsigned NewOffset0 = CI.Offset;
1161 unsigned NewOffset1 = Paired.Offset;
1162 unsigned Opc =
1163 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1164
1165 if (NewOffset0 > NewOffset1) {
1166 // Canonicalize the merged instruction so the smaller offset comes first.
1167 std::swap(NewOffset0, NewOffset1);
1168 std::swap(Data0, Data1);
1169 }
1170
1171 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1172 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1173
1174 const MCInstrDesc &Write2Desc = TII->get(Opc);
1175 DebugLoc DL = CI.I->getDebugLoc();
1176
1177 Register BaseReg = AddrReg->getReg();
1178 unsigned BaseSubReg = AddrReg->getSubReg();
1179 unsigned BaseRegFlags = 0;
1180 if (CI.BaseOff) {
1181 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1182 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1183 .addImm(CI.BaseOff);
1184
1185 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1186 BaseRegFlags = RegState::Kill;
1187
1188 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1189 .addReg(ImmReg)
1190 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1191 .addImm(0); // clamp bit
1192 BaseSubReg = 0;
1193 }
1194
1195 MachineInstrBuilder Write2 =
1196 BuildMI(*MBB, Paired.I, DL, Write2Desc)
1197 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1198 .add(*Data0) // data0
1199 .add(*Data1) // data1
1200 .addImm(NewOffset0) // offset0
1201 .addImm(NewOffset1) // offset1
1202 .addImm(0) // gds
1203 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1204
1205 moveInstsAfter(Write2, InstsToMove);
1206
1207 CI.I->eraseFromParent();
1208 Paired.I->eraseFromParent();
1209
1210 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1211 return Write2;
1212}
1213
1214MachineBasicBlock::iterator
1215SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1216 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1217 MachineBasicBlock *MBB = CI.I->getParent();
1218 DebugLoc DL = CI.I->getDebugLoc();
1219 const unsigned Opcode = getNewOpcode(CI, Paired);
1220
1221 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1222
1223 Register DestReg = MRI->createVirtualRegister(SuperRC);
1224 unsigned MergedDMask = CI.DMask | Paired.DMask;
1225 unsigned DMaskIdx =
1226 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1227
1228 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1229 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1230 if (I == DMaskIdx)
1231 MIB.addImm(MergedDMask);
1232 else
1233 MIB.add((*CI.I).getOperand(I));
1234 }
1235
1236 // It shouldn't be possible to get this far if the two instructions
1237 // don't have a single memoperand, because MachineInstr::mayAlias()
1238 // will return true if this is the case.
1239 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1240
1241 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1242 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1243
1244 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1245
1246 unsigned SubRegIdx0, SubRegIdx1;
1247 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1248
1249 // Copy to the old destination registers.
1250 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1251 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1252 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1253
1254 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1255 .add(*Dest0) // Copy to same destination including flags and sub reg.
1256 .addReg(DestReg, 0, SubRegIdx0);
1257 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1258 .add(*Dest1)
1259 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1260
1261 moveInstsAfter(Copy1, InstsToMove);
1262
1263 CI.I->eraseFromParent();
1264 Paired.I->eraseFromParent();
1265 return New;
1266}
1267
1268MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1269 CombineInfo &CI, CombineInfo &Paired,
1270 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1271 MachineBasicBlock *MBB = CI.I->getParent();
1272 DebugLoc DL = CI.I->getDebugLoc();
1273 const unsigned Opcode = getNewOpcode(CI, Paired);
1274
1275 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1276
1277 Register DestReg = MRI->createVirtualRegister(SuperRC);
1278 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1279
1280 // It shouldn't be possible to get this far if the two instructions
1281 // don't have a single memoperand, because MachineInstr::mayAlias()
1282 // will return true if this is the case.
1283 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1284
1285 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1286 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1287
1288 MachineInstr *New =
1289 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
1290 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1291 .addImm(MergedOffset) // offset
1292 .addImm(CI.CPol) // cpol
1293 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1294
1295 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1296 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1297 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1298
1299 // Copy to the old destination registers.
1300 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1301 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1302 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1303
1304 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1305 .add(*Dest0) // Copy to same destination including flags and sub reg.
1306 .addReg(DestReg, 0, SubRegIdx0);
1307 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1308 .add(*Dest1)
1309 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1310
1311 moveInstsAfter(Copy1, InstsToMove);
1312
1313 CI.I->eraseFromParent();
1314 Paired.I->eraseFromParent();
1315 return New;
1316}
1317
1318MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1319 CombineInfo &CI, CombineInfo &Paired,
1320 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1321 MachineBasicBlock *MBB = CI.I->getParent();
1322 DebugLoc DL = CI.I->getDebugLoc();
1323
1324 const unsigned Opcode = getNewOpcode(CI, Paired);
1325
1326 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1327
1328 // Copy to the new source register.
1329 Register DestReg = MRI->createVirtualRegister(SuperRC);
1330 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1331
1332 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1333
1334 AddressRegs Regs = getRegs(Opcode, *TII);
1335
1336 if (Regs.VAddr)
1337 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1338
1339 // It shouldn't be possible to get this far if the two instructions
1340 // don't have a single memoperand, because MachineInstr::mayAlias()
1341 // will return true if this is the case.
1342 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1343
1344 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1345 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1346
1347 MachineInstr *New =
1348 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1349 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1350 .addImm(MergedOffset) // offset
1351 .addImm(CI.CPol) // cpol
1352 .addImm(0) // tfe
1353 .addImm(0) // swz
1354 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1355
1356 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1357 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1358 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1359
1360 // Copy to the old destination registers.
1361 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1362 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1363 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1364
1365 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1366 .add(*Dest0) // Copy to same destination including flags and sub reg.
1367 .addReg(DestReg, 0, SubRegIdx0);
1368 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1369 .add(*Dest1)
1370 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1371
1372 moveInstsAfter(Copy1, InstsToMove);
1373
1374 CI.I->eraseFromParent();
1375 Paired.I->eraseFromParent();
1376 return New;
1377}
1378
1379MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1380 CombineInfo &CI, CombineInfo &Paired,
1381 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1382 MachineBasicBlock *MBB = CI.I->getParent();
1383 DebugLoc DL = CI.I->getDebugLoc();
1384
1385 const unsigned Opcode = getNewOpcode(CI, Paired);
1386
1387 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1388
1389 // Copy to the new source register.
1390 Register DestReg = MRI->createVirtualRegister(SuperRC);
1391 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1392
1393 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1394
1395 AddressRegs Regs = getRegs(Opcode, *TII);
1396
1397 if (Regs.VAddr)
1398 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1399
1400 unsigned JoinedFormat =
1401 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1402
1403 // It shouldn't be possible to get this far if the two instructions
1404 // don't have a single memoperand, because MachineInstr::mayAlias()
1405 // will return true if this is the case.
1406 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1407
1408 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1409 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1410
1411 MachineInstr *New =
1412 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1413 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1414 .addImm(MergedOffset) // offset
1415 .addImm(JoinedFormat) // format
1416 .addImm(CI.CPol) // cpol
1417 .addImm(0) // tfe
1418 .addImm(0) // swz
1419 .addMemOperand(
1420 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1421
1422 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1423 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1424 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1425
1426 // Copy to the old destination registers.
1427 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1428 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1429 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1430
1431 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1432 .add(*Dest0) // Copy to same destination including flags and sub reg.
1433 .addReg(DestReg, 0, SubRegIdx0);
1434 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1435 .add(*Dest1)
1436 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1437
1438 moveInstsAfter(Copy1, InstsToMove);
1439
1440 CI.I->eraseFromParent();
1441 Paired.I->eraseFromParent();
1442 return New;
1443}
1444
1445MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1446 CombineInfo &CI, CombineInfo &Paired,
1447 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1448 MachineBasicBlock *MBB = CI.I->getParent();
1449 DebugLoc DL = CI.I->getDebugLoc();
1450
1451 const unsigned Opcode = getNewOpcode(CI, Paired);
1452
1453 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1454 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1455 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1456
1457 // Copy to the new source register.
1458 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1459 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1460
1461 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1462 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1463
1464 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1465 .add(*Src0)
1466 .addImm(SubRegIdx0)
1467 .add(*Src1)
1468 .addImm(SubRegIdx1);
1469
1470 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
1471 .addReg(SrcReg, RegState::Kill);
1472
1473 AddressRegs Regs = getRegs(Opcode, *TII);
1474
1475 if (Regs.VAddr)
1476 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1477
1478 unsigned JoinedFormat =
1479 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1480
1481 // It shouldn't be possible to get this far if the two instructions
1482 // don't have a single memoperand, because MachineInstr::mayAlias()
1483 // will return true if this is the case.
1484 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand())(static_cast<void> (0));
1485
1486 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1487 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1488
1489 MachineInstr *New =
1490 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1491 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1492 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1493 .addImm(JoinedFormat) // format
1494 .addImm(CI.CPol) // cpol
1495 .addImm(0) // tfe
1496 .addImm(0) // swz
1497 .addMemOperand(
1498 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1499
1500 moveInstsAfter(MIB, InstsToMove);
1501
1502 CI.I->eraseFromParent();
1503 Paired.I->eraseFromParent();
1504 return New;
1505}
1506
1507unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1508 const CombineInfo &Paired) {
1509 const unsigned Width = CI.Width + Paired.Width;
1510
1511 switch (CI.InstClass) {
1512 default:
1513 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE)(static_cast<void> (0));
1514 // FIXME: Handle d16 correctly
1515 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1516 Width);
1517 case TBUFFER_LOAD:
1518 case TBUFFER_STORE:
1519 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1520 Width);
1521
1522 case UNKNOWN:
1523 llvm_unreachable("Unknown instruction class")__builtin_unreachable();
1524 case S_BUFFER_LOAD_IMM:
1525 switch (Width) {
1526 default:
1527 return 0;
1528 case 2:
1529 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1530 case 4:
1531 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1532 case 8:
1533 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1534 }
1535 case MIMG:
1536 assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
1537 "No overlaps");
1538 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1539 }
1540}
1541
1542std::pair<unsigned, unsigned>
1543SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1544 const CombineInfo &Paired) {
1545
1546 assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero")(static_cast<void> (0));
1547
1548 bool ReverseOrder;
1549 if (CI.InstClass == MIMG) {
1550 assert(
1551 (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
1552 "No overlaps");
1553 ReverseOrder = CI.DMask > Paired.DMask;
1554 } else
1555 ReverseOrder = CI.Offset > Paired.Offset;
1556
1557 unsigned Idx0;
1558 unsigned Idx1;
1559
1560 if (CI.Width + Paired.Width > 4) {
1561 assert(CI.Width == 4 && Paired.Width == 4)(static_cast<void> (0));
1562
1563 if (ReverseOrder) {
1564 Idx1 = AMDGPU::sub0_sub1_sub2_sub3;
1565 Idx0 = AMDGPU::sub4_sub5_sub6_sub7;
1566 } else {
1567 Idx0 = AMDGPU::sub0_sub1_sub2_sub3;
1568 Idx1 = AMDGPU::sub4_sub5_sub6_sub7;
1569 }
1570 } else {
1571 static const unsigned Idxs[4][4] = {
1572 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1573 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
1574 {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
1575 {AMDGPU::sub3, 0, 0, 0},
1576 };
1577
1578 assert(CI.Width >= 1 && CI.Width <= 3)(static_cast<void> (0));
1579 assert(Paired.Width >= 1 && Paired.Width <= 3)(static_cast<void> (0));
1580
1581 if (ReverseOrder) {
1582 Idx1 = Idxs[0][Paired.Width - 1];
1583 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1584 } else {
1585 Idx0 = Idxs[0][CI.Width - 1];
1586 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1587 }
1588 }
1589
1590 return std::make_pair(Idx0, Idx1);
1591}
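A standalone sketch (not part of the pass) of how the Idxs table above combines two widths, with strings standing in for the AMDGPU subregister enums; pickSubRegs is an illustrative name:

#include <cassert>
#include <string>
#include <utility>

// Mirrors the non-reversed lookup above: the first operand starts at sub0,
// the second starts right after the first operand's last dword.
static std::pair<std::string, std::string> pickSubRegs(unsigned W0, unsigned W1) {
  static const char *const Idxs[4][4] = {
      {"sub0", "sub0_sub1", "sub0_sub1_sub2", "sub0_sub1_sub2_sub3"},
      {"sub1", "sub1_sub2", "sub1_sub2_sub3", ""},
      {"sub2", "sub2_sub3", "", ""},
      {"sub3", "", "", ""},
  };
  assert(W0 >= 1 && W0 <= 3 && W1 >= 1 && W1 <= 3);
  return {Idxs[0][W0 - 1], Idxs[W0][W1 - 1]};
}

int main() {
  // Merging a 2-dword access with a 2-dword access: Idxs[0][1] and Idxs[2][1].
  auto P = pickSubRegs(2, 2);
  assert(P.first == "sub0_sub1" && P.second == "sub2_sub3");
  return 0;
}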
1592
1593const TargetRegisterClass *
1594SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1595 const CombineInfo &Paired) {
1596 if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1597 switch (CI.Width + Paired.Width) {
1598 default:
1599 return nullptr;
1600 case 2:
1601 return &AMDGPU::SReg_64_XEXECRegClass;
1602 case 4:
1603 return &AMDGPU::SGPR_128RegClass;
1604 case 8:
1605 return &AMDGPU::SGPR_256RegClass;
1606 case 16:
1607 return &AMDGPU::SGPR_512RegClass;
1608 }
1609 }
1610
1611 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1612 return TRI->hasAGPRs(getDataRegClass(*CI.I))
1613 ? TRI->getAGPRClassForBitWidth(BitWidth)
1614 : TRI->getVGPRClassForBitWidth(BitWidth);
1615}
1616
1617MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1618 CombineInfo &CI, CombineInfo &Paired,
1619 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1620 MachineBasicBlock *MBB = CI.I->getParent();
1621 DebugLoc DL = CI.I->getDebugLoc();
1622
1623 const unsigned Opcode = getNewOpcode(CI, Paired);
1624
1625 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1626 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1627 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1628
1629 // Copy to the new source register.
1630 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1631 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1632
1633 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1634 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1635
1636 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1637 .add(*Src0)
1638 .addImm(SubRegIdx0)
1639 .add(*Src1)
1640 .addImm(SubRegIdx1);
1641
1642 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
1643 .addReg(SrcReg, RegState::Kill);
1644
1645 AddressRegs Regs = getRegs(Opcode, *TII);
1646
1647 if (Regs.VAddr)
1648 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1649
1650
1651 // It shouldn't be possible to get this far if the two instructions
1652 // don't have a single memoperand, because MachineInstr::mayAlias()
1653 // will return true if this is the case.
1654 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand())(static_cast<void> (0));
1655
1656 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1657 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1658
1659 MachineInstr *New =
1660 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1661 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1662 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1663 .addImm(CI.CPol) // cpol
1664 .addImm(0) // tfe
1665 .addImm(0) // swz
1666 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1667
1668 moveInstsAfter(MIB, InstsToMove);
1669
1670 CI.I->eraseFromParent();
1671 Paired.I->eraseFromParent();
1672 return New;
1673}
1674
1675MachineOperand
1676SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1677 APInt V(32, Val, true);
1678 if (TII->isInlineConstant(V))
1679 return MachineOperand::CreateImm(Val);
1680
1681 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1682 MachineInstr *Mov =
1683 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1684 TII->get(AMDGPU::S_MOV_B32), Reg)
1685 .addImm(Val);
1686 (void)Mov;
1687 LLVM_DEBUG(dbgs() << " "; Mov->dump())do { } while (false);
1688 return MachineOperand::CreateReg(Reg, false);
1689}
1690
1691// Compute base address using Addr and return the final register.
1692Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1693 const MemAddress &Addr) const {
1694 MachineBasicBlock *MBB = MI.getParent();
1695 MachineBasicBlock::iterator MBBI = MI.getIterator();
1696 DebugLoc DL = MI.getDebugLoc();
1697
1698 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1699 Addr.Base.LoSubReg) &&
1700 "Expected 32-bit Base-Register-Low!!");
1701
1702 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1703 Addr.Base.HiSubReg) &&
1704 "Expected 32-bit Base-Register-Hi!!");
1705
1706 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n")do { } while (false);
1707 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1708 MachineOperand OffsetHi =
1709 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1710
1711 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1712 Register CarryReg = MRI->createVirtualRegister(CarryRC);
1713 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1714
1715 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1716 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1717 MachineInstr *LoHalf =
1718 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1719 .addReg(CarryReg, RegState::Define)
1720 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1721 .add(OffsetLo)
1722 .addImm(0); // clamp bit
1723 (void)LoHalf;
1724 LLVM_DEBUG(dbgs() << " "; LoHalf->dump();)do { } while (false);
1725
1726 MachineInstr *HiHalf =
1727 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1728 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1729 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1730 .add(OffsetHi)
1731 .addReg(CarryReg, RegState::Kill)
1732 .addImm(0); // clamp bit
1733 (void)HiHalf;
1734 LLVM_DEBUG(dbgs() << " "; HiHalf->dump();)do { } while (false);
1735
1736 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1737 MachineInstr *FullBase =
1738 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1739 .addReg(DestSub0)
1740 .addImm(AMDGPU::sub0)
1741 .addReg(DestSub1)
1742 .addImm(AMDGPU::sub1);
1743 (void)FullBase;
1744 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";)do { } while (false);
1745
1746 return FullDestReg;
1747}
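A minimal sketch of the lo/hi add-with-carry that computeBase() emits as V_ADD_CO_U32_e64 / V_ADDC_U32_e64, modeled with plain integers instead of MIR; addBaseOffset is an illustrative name:

#include <cassert>
#include <cstdint>

static uint64_t addBaseOffset(uint32_t BaseLo, uint32_t BaseHi, uint64_t Offset) {
  uint32_t OffLo = static_cast<uint32_t>(Offset);
  uint32_t OffHi = static_cast<uint32_t>(Offset >> 32);
  uint32_t Lo = BaseLo + OffLo;            // V_ADD_CO_U32: low 32-bit add
  uint32_t Carry = Lo < BaseLo ? 1u : 0u;  // carry-out, i.e. CarryReg
  uint32_t Hi = BaseHi + OffHi + Carry;    // V_ADDC_U32: high add consumes it
  return (static_cast<uint64_t>(Hi) << 32) | Lo;  // REG_SEQUENCE of sub0/sub1
}

int main() {
  // 0x1FFFFFFFF + 1 carries into the high half.
  assert(addBaseOffset(0xFFFFFFFFu, 0x1u, 1) == 0x200000000ull);
  return 0;
}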
1748
1749// Update base and offset with the NewBase and NewOffset in MI.
1750void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1751 Register NewBase,
1752 int32_t NewOffset) const {
1753 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1754 Base->setReg(NewBase);
1755 Base->setIsKill(false);
1756 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1757}
1758
1759Optional<int32_t>
1760SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1761 if (Op.isImm())
1762 return Op.getImm();
1763
1764 if (!Op.isReg())
1765 return None;
1766
1767 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1768 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1769 !Def->getOperand(1).isImm())
1770 return None;
1771
1772 return Def->getOperand(1).getImm();
1773}
1774
1775// Analyze Base and extract:
1776// - 32-bit base registers and subregisters
1777// - a 64-bit constant offset
1778// Expecting base computation as:
1779// %OFFSET0:sgpr_32 = S_MOV_B32 8000
1780// %LO:vgpr_32, %c:sreg_64_xexec =
1781// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1782// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1783// %Base:vreg_64 =
1784// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1785void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1786 MemAddress &Addr) const {
1787 if (!Base.isReg())
1788 return;
1789
1790 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1791 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1792 || Def->getNumOperands() != 5)
1793 return;
1794
1795 MachineOperand BaseLo = Def->getOperand(1);
1796 MachineOperand BaseHi = Def->getOperand(3);
1797 if (!BaseLo.isReg() || !BaseHi.isReg())
1798 return;
1799
1800 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1801 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1802
1803 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1804 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1805 return;
1806
1807 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1808 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1809
1810 auto Offset0P = extractConstOffset(*Src0);
1811 if (Offset0P)
1812 BaseLo = *Src1;
1813 else {
1814 if (!(Offset0P = extractConstOffset(*Src1)))
1815 return;
1816 BaseLo = *Src0;
1817 }
1818
1819 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1820 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1821
1822 if (Src0->isImm())
1823 std::swap(Src0, Src1);
1824
1825 if (!Src1->isImm())
1826 return;
1827
1828 uint64_t Offset1 = Src1->getImm();
1829 BaseHi = *Src0;
1830
1831 Addr.Base.LoReg = BaseLo.getReg();
1832 Addr.Base.HiReg = BaseHi.getReg();
1833 Addr.Base.LoSubReg = BaseLo.getSubReg();
1834 Addr.Base.HiSubReg = BaseHi.getSubReg();
1835 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1836}
1837
1838bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1839 MachineInstr &MI,
1840 MemInfoMap &Visited,
1841 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1842
1843 if (!(MI.mayLoad() ^ MI.mayStore()))
1844 return false;
1845
1846 // TODO: Support flat and scratch.
1847 if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1848 return false;
1849
1850 if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1851 return false;
1852
1853 if (AnchorList.count(&MI))
1854 return false;
1855
1856 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump())do { } while (false);
1857
1858 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1859 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";)do { } while (false);
1860 return false;
1861 }
1862
1863 // Step1: Find the base-registers and a 64bit constant offset.
1864 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1865 MemAddress MAddr;
1866 if (Visited.find(&MI) == Visited.end()) {
1867 processBaseWithConstOffset(Base, MAddr);
1868 Visited[&MI] = MAddr;
1869 } else
1870 MAddr = Visited[&MI];
1871
1872 if (MAddr.Offset == 0) {
1873 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1874 " constant offsets that can be promoted.\n";);
1875 return false;
1876 }
1877
1878 LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
1879 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1880
1881 // Step2: Traverse through MI's basic block and find an anchor (one that has
1882 // the same base registers) with the highest 13-bit distance from MI's offset.
1883 // E.g. (64bit loads)
1884 // bb:
1885 // addr1 = &a + 4096; load1 = load(addr1, 0)
1886 // addr2 = &a + 6144; load2 = load(addr2, 0)
1887 // addr3 = &a + 8192; load3 = load(addr3, 0)
1888 // addr4 = &a + 10240; load4 = load(addr4, 0)
1889 // addr5 = &a + 12288; load5 = load(addr5, 0)
1890 //
1891 // Starting from the first load, the optimization will try to find a new base
1892 // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
1893 // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192
1894 // as the new base (anchor) because the maximum distance can presumably
1895 // accommodate more intermediate bases.
1896 //
1897 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1898 // (&a + 8192) for load1, load2, load4.
1899 // addr = &a + 8192
1900 // load1 = load(addr, -4096)
1901 // load2 = load(addr, -2048)
1902 // load3 = load(addr, 0)
1903 // load4 = load(addr, 2048)
1904 // addr5 = &a + 12288; load5 = load(addr5, 0)
1905 //
1906 MachineInstr *AnchorInst = nullptr;
1907 MemAddress AnchorAddr;
1908 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1909 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1910
1911 MachineBasicBlock *MBB = MI.getParent();
1912 MachineBasicBlock::iterator E = MBB->end();
1913 MachineBasicBlock::iterator MBBI = MI.getIterator();
1914 ++MBBI;
1915 const SITargetLowering *TLI =
1916 static_cast<const SITargetLowering *>(STM->getTargetLowering());
1917
1918 for ( ; MBBI != E; ++MBBI) {
1919 MachineInstr &MINext = *MBBI;
1920 // TODO: Support finding an anchor (with the same base) from store addresses
1921 // or any other load addresses where the opcodes are different.
1922 if (MINext.getOpcode() != MI.getOpcode() ||
1923 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1924 continue;
1925
1926 const MachineOperand &BaseNext =
1927 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1928 MemAddress MAddrNext;
1929 if (Visited.find(&MINext) == Visited.end()) {
1930 processBaseWithConstOffset(BaseNext, MAddrNext);
1931 Visited[&MINext] = MAddrNext;
1932 } else
1933 MAddrNext = Visited[&MINext];
1934
1935 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1936 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1937 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1938 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1939 continue;
1940
1941 InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1942
1943 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1944 TargetLoweringBase::AddrMode AM;
1945 AM.HasBaseReg = true;
1946 AM.BaseOffs = Dist;
1947 if (TLI->isLegalGlobalAddressingMode(AM) &&
1948 (uint32_t)std::abs(Dist) > MaxDist) {
1949 MaxDist = std::abs(Dist);
1950
1951 AnchorAddr = MAddrNext;
1952 AnchorInst = &MINext;
1953 }
1954 }
1955
1956 if (AnchorInst) {
1957 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1958 AnchorInst->dump());
1959 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1960 << AnchorAddr.Offset << "\n\n");
1961
1962 // Instead of moving up, just re-compute anchor-instruction's base address.
1963 Register Base = computeBase(MI, AnchorAddr);
1964
1965 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1966 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();)do { } while (false);
1967
1968 for (auto P : InstsWCommonBase) {
1969 TargetLoweringBase::AddrMode AM;
1970 AM.HasBaseReg = true;
1971 AM.BaseOffs = P.second - AnchorAddr.Offset;
1972
1973 if (TLI->isLegalGlobalAddressingMode(AM)) {
1974 LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
1975 dbgs() << ")"; P.first->dump());
1976 updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1977 LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump())do { } while (false);
1978 }
1979 }
1980 AnchorList.insert(AnchorInst);
1981 return true;
1982 }
1983
1984 return false;
1985}
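A rough sketch of the Step2 anchor selection above, run on the offsets from the comment's example; the 13-bit signed immediate range is an assumed stand-in for TLI->isLegalGlobalAddressingMode(), and pickAnchor/fitsImm are illustrative names:

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <vector>

static bool fitsImm(int64_t Off) { return Off >= -4096 && Off <= 4095; }

static int64_t pickAnchor(int64_t MIOffset, const std::vector<int64_t> &Others) {
  int64_t Anchor = MIOffset;
  uint64_t MaxDist = 0;
  for (int64_t Off : Others) {
    int64_t Dist = MIOffset - Off;
    // Keep the candidate with the largest distance that still fits in the
    // immediate field, mirroring the MaxDist update in the loop above.
    if (fitsImm(Dist) && static_cast<uint64_t>(std::llabs(Dist)) > MaxDist) {
      MaxDist = static_cast<uint64_t>(std::llabs(Dist));
      Anchor = Off;
    }
  }
  return Anchor;
}

int main() {
  // MI at &a + 4096, candidates at &a + {6144, 8192, 10240, 12288}:
  // &a + 8192 wins with the largest still-encodable distance of 4096.
  assert(pickAnchor(4096, {6144, 8192, 10240, 12288}) == 8192);
  return 0;
}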
1986
1987void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1988 std::list<std::list<CombineInfo> > &MergeableInsts) const {
1989 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1990 if (AddrList.front().InstClass == CI.InstClass &&
1991 AddrList.front().hasSameBaseAddress(*CI.I)) {
1992 AddrList.emplace_back(CI);
1993 return;
1994 }
1995 }
1996
1997 // Base address not found, so add a new list.
1998 MergeableInsts.emplace_back(1, CI);
1999}
2000
2001std::pair<MachineBasicBlock::iterator, bool>
2002SILoadStoreOptimizer::collectMergeableInsts(
2003 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2004 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2005 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2006 bool Modified = false;
2007
2008 // Sort potential mergeable instructions into lists. One list per base address.
2009 unsigned Order = 0;
2010 MachineBasicBlock::iterator BlockI = Begin;
2011 for (; BlockI != End; ++BlockI) {
2012 MachineInstr &MI = *BlockI;
2013
2014 // We run this before checking if an address is mergeable, because it can produce
2015 // better code even if the instructions aren't mergeable.
2016 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2017 Modified = true;
2018
2019 // Don't combine if volatile. We also won't be able to merge across this, so
2020 // break the search. We can look after this barrier for separate merges.
2021 if (MI.hasOrderedMemoryRef()) {
2022 LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI)do { } while (false);
2023
2024 // Search will resume after this instruction in a separate merge list.
2025 ++BlockI;
2026 break;
2027 }
2028
2029 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2030 if (InstClass == UNKNOWN)
2031 continue;
2032
2033 CombineInfo CI;
2034 CI.setMI(MI, *TII, *STM);
2035 CI.Order = Order++;
2036
2037 if (!CI.hasMergeableAddress(*MRI))
2038 continue;
2039
2040 LLVM_DEBUG(dbgs() << "Mergeable: " << MI)do { } while (false);
2041
2042 addInstToMergeableList(CI, MergeableInsts);
2043 }
2044
2045 // At this point we have lists of Mergeable instructions.
2046 //
2047 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2048 // list try to find an instruction that can be merged with I. If an instruction
2049 // is found, it is stored in the Paired field. If no instructions are found, then
2050 // the CombineInfo object is deleted from the list.
2051
2052 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2053 E = MergeableInsts.end(); I != E;) {
2054
2055 std::list<CombineInfo> &MergeList = *I;
2056 if (MergeList.size() <= 1) {
2057 // This means we have found only one instruction with a given address
2058 // that can be merged, and we need at least 2 instructions to do a merge,
2059 // so this list can be discarded.
2060 I = MergeableInsts.erase(I);
2061 continue;
2062 }
2063
2064 // Sort the lists by offsets, this way mergeable instructions will be
2065 // adjacent to each other in the list, which will make it easier to find
2066 // matches.
2067 MergeList.sort(
2068 [](const CombineInfo &A, const CombineInfo &B) {
2069 return A.Offset < B.Offset;
2070 });
2071 ++I;
2072 }
2073
2074 return std::make_pair(BlockI, Modified);
2075}
2076
2077// Scan through looking for adjacent LDS operations with constant offsets from
2078// the same base register. We rely on the scheduler to do the hard work of
2079// clustering nearby loads, and assume these are all adjacent.
2080bool SILoadStoreOptimizer::optimizeBlock(
2081 std::list<std::list<CombineInfo> > &MergeableInsts) {
2082 bool Modified = false;
2083
2084 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2085 E = MergeableInsts.end(); I != E;) {
2086 std::list<CombineInfo> &MergeList = *I;
2087
2088 bool OptimizeListAgain = false;
2089 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2090 // We weren't able to make any changes, so delete the list so we don't
2091 // process the same instructions the next time we try to optimize this
2092 // block.
2093 I = MergeableInsts.erase(I);
2094 continue;
2095 }
2096
2097 Modified = true;
2098
2099 // We made changes, but also determined that there were no more optimization
2100 // opportunities, so we don't need to reprocess the list.
2101 if (!OptimizeListAgain) {
2102 I = MergeableInsts.erase(I);
2103 continue;
2104 }
2105 OptimizeAgain = true;
2106 }
2107 return Modified;
2108}
2109
2110bool
2111SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2112 std::list<CombineInfo> &MergeList,
2113 bool &OptimizeListAgain) {
2114 if (MergeList.empty())
2115 return false;
2116
2117 bool Modified = false;
2118
2119 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2120 Next = std::next(I)) {
2121
2122 auto First = I;
2123 auto Second = Next;
2124
2125 if ((*First).Order > (*Second).Order)
2126 std::swap(First, Second);
2127 CombineInfo &CI = *First;
2128 CombineInfo &Paired = *Second;
2129
2130 SmallVector<MachineInstr *, 8> InstsToMove;
2131 if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
2132 ++I;
2133 continue;
2134 }
2135
2136 Modified = true;
2137
2138 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I)do { } while (false);
2139
2140 switch (CI.InstClass) {
2141 default:
2142 llvm_unreachable("unknown InstClass")__builtin_unreachable();
2143 break;
2144 case DS_READ: {
2145 MachineBasicBlock::iterator NewMI =
2146 mergeRead2Pair(CI, Paired, InstsToMove);
2147 CI.setMI(NewMI, *TII, *STM);
2148 break;
2149 }
2150 case DS_WRITE: {
2151 MachineBasicBlock::iterator NewMI =
2152 mergeWrite2Pair(CI, Paired, InstsToMove);
2153 CI.setMI(NewMI, *TII, *STM);
2154 break;
2155 }
2156 case S_BUFFER_LOAD_IMM: {
2157 MachineBasicBlock::iterator NewMI =
2158 mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
2159 CI.setMI(NewMI, *TII, *STM);
2160 OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
2161 break;
2162 }
2163 case BUFFER_LOAD: {
2164 MachineBasicBlock::iterator NewMI =
2165 mergeBufferLoadPair(CI, Paired, InstsToMove);
2166 CI.setMI(NewMI, *TII, *STM);
2167 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2168 break;
2169 }
2170 case BUFFER_STORE: {
2171 MachineBasicBlock::iterator NewMI =
2172 mergeBufferStorePair(CI, Paired, InstsToMove);
2173 CI.setMI(NewMI, *TII, *STM);
2174 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2175 break;
2176 }
2177 case MIMG: {
2178 MachineBasicBlock::iterator NewMI =
2179 mergeImagePair(CI, Paired, InstsToMove);
2180 CI.setMI(NewMI, *TII, *STM);
2181 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2182 break;
2183 }
2184 case TBUFFER_LOAD: {
2185 MachineBasicBlock::iterator NewMI =
2186 mergeTBufferLoadPair(CI, Paired, InstsToMove);
2187 CI.setMI(NewMI, *TII, *STM);
2188 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2189 break;
2190 }
2191 case TBUFFER_STORE: {
2192 MachineBasicBlock::iterator NewMI =
2193 mergeTBufferStorePair(CI, Paired, InstsToMove);
2194 CI.setMI(NewMI, *TII, *STM);
2195 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2196 break;
2197 }
2198 }
2199 CI.Order = Paired.Order;
2200 if (I == Second)
2201 I = Next;
2202
2203 MergeList.erase(Second);
2204 }
2205
2206 return Modified;
2207}
2208
2209bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2210 if (skipFunction(MF.getFunction()))
2211 return false;
2212
2213 STM = &MF.getSubtarget<GCNSubtarget>();
2214 if (!STM->loadStoreOptEnabled())
2215 return false;
2216
2217 TII = STM->getInstrInfo();
2218 TRI = &TII->getRegisterInfo();
2219
2220 MRI = &MF.getRegInfo();
2221 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2222
2223 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n")do { } while (false);
2224
2225 bool Modified = false;
2226
2227 // Contains the list of instructions for which constant offsets are being
2228 // promoted to the IMM. This is tracked for an entire block at a time.
2229 SmallPtrSet<MachineInstr *, 4> AnchorList;
2230 MemInfoMap Visited;
2231
2232 for (MachineBasicBlock &MBB : MF) {
2233 MachineBasicBlock::iterator SectionEnd;
2234 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2235 I = SectionEnd) {
2236 bool CollectModified;
2237 std::list<std::list<CombineInfo>> MergeableInsts;
2238
2239 // First pass: Collect list of all instructions we know how to merge in a
2240 // subset of the block.
2241 std::tie(SectionEnd, CollectModified) =
2242 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2243
2244 Modified |= CollectModified;
2245
2246 do {
2247 OptimizeAgain = false;
2248 Modified |= optimizeBlock(MergeableInsts);
2249 } while (OptimizeAgain);
2250 }
2251
2252 Visited.clear();
2253 AnchorList.clear();
2254 }
2255
2256 return Modified;
2257}

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
117 return 32;
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
161}
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index")(static_cast<void> (0));
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));

2.1: 'N' is not equal to 0
3: '?' condition is false
4: The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'
253}
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);

2: Calling 'maskTrailingOnes<unsigned int>'
259}
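Reading the path above with T = unsigned int (Bits == 32): the shift count of 33 means Bits - N evaluated to 33, i.e. maskTrailingOnes received N == 0xFFFFFFFF, which is what CHAR_BIT * sizeof(T) - N produces when maskLeadingOnes is handed an N of 33 (or any value equal to 33 modulo 2^32). The assert(N <= Bits && ...) on line 251 expands to a no-op in this NDEBUG build, so nothing rejects the out-of-range count. A caller-side guard along these lines (an assumption for illustration, not the upstream fix) avoids the undefined shift:

#include <climits>
#include <cstdint>
#include <type_traits>

// Clamped variant: never forwards a wrapped count into the shift.
template <typename T> T maskLeadingOnesChecked(unsigned N) {
  static_assert(std::is_unsigned<T>::value, "Invalid type!");
  const unsigned Bits = CHAR_BIT * sizeof(T);
  if (N >= Bits)      // covers N == Bits and out-of-range callers like N == 33
    return T(-1);
  if (N == 0)
    return T(0);
  return static_cast<T>(T(-1) << (Bits - N));  // shift is now in [1, Bits - 1]
}

int main() {
  return maskLeadingOnesChecked<uint32_t>(33) == 0xFFFFFFFFu ? 0 : 1;
}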
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
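A quick standalone check (not from this header) that the XOR used above matches plain subtraction: for a non-zero 32-bit value the leading-zero count is at most 31, and any value in [0, 31] XORed with 31 equals 31 minus that value:

#include <cassert>

int main() {
  for (unsigned LZ = 0; LZ <= 31; ++LZ)
    assert((LZ ^ 31u) == 31u - LZ);  // both operands fit in five bits
  return 0;
}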
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315#if __has_builtin(__builtin_bitreverse8)
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1) << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if an unsigned integer is an N-bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
425}
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range")(static_cast<void> (0));
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX >> (64 - N);
436}
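A small check of the branch-free form above: shifting UINT64_MAX right by (64 - N) is defined for every N in [1, 64], including N == 64 where the naive (1 << N) - 1 would shift by the full width:

#include <cassert>
#include <cstdint>

int main() {
  assert((UINT64_MAX >> (64 - 8)) == 0xFFu);        // maxUIntN(8)
  assert((UINT64_MAX >> (64 - 64)) == UINT64_MAX);  // maxUIntN(64), shift by 0
  return 0;
}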
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range")(static_cast<void> (0));
441
442 return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range")(static_cast<void> (0));
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1) << (N - 1)) - 1;
452}
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if a signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
573
574/// Compile time Log2.
575/// Valid only for positive powers of two.
576template <size_t kValue> constexpr inline size_t CTLog2() {
577 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
578 "Value is not a valid power of 2");
579 return 1 + CTLog2<kValue / 2>();
580}
581
582template <> constexpr inline size_t CTLog2<1>() { return 0; }
583
584/// Return the log base 2 of the specified value.
585inline double Log2(double Value) {
586#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
587 return __builtin_log(Value) / __builtin_log(2.0);
588#else
589 return log2(Value);
590#endif
591}
592
593/// Return the floor log base 2 of the specified value, -1 if the value is zero.
594/// (32 bit edition.)
595/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
596inline unsigned Log2_32(uint32_t Value) {
597 return 31 - countLeadingZeros(Value);
598}
599
600/// Return the floor log base 2 of the specified value, -1 if the value is zero.
601/// (64 bit edition.)
602inline unsigned Log2_64(uint64_t Value) {
603 return 63 - countLeadingZeros(Value);
604}
605
606/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
607/// (32 bit edition).
608/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
609inline unsigned Log2_32_Ceil(uint32_t Value) {
610 return 32 - countLeadingZeros(Value - 1);
611}
612
613/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
614/// (64 bit edition.)
615inline unsigned Log2_64_Ceil(uint64_t Value) {
616 return 64 - countLeadingZeros(Value - 1);
617}
618
619/// Return the greatest common divisor of the values using Euclid's algorithm.
620template <typename T>
621inline T greatestCommonDivisor(T A, T B) {
622 while (B) {
623 T Tmp = B;
624 B = A % B;
625 A = Tmp;
626 }
627 return A;
628}
629
630inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
631 return greatestCommonDivisor<uint64_t>(A, B);
632}
633
634/// This function takes a 64-bit integer and returns the bit equivalent double.
635inline double BitsToDouble(uint64_t Bits) {
636 double D;
637 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
638 memcpy(&D, &Bits, sizeof(Bits));
639 return D;
640}
641
642/// This function takes a 32-bit integer and returns the bit equivalent float.
643inline float BitsToFloat(uint32_t Bits) {
644 float F;
645 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
646 memcpy(&F, &Bits, sizeof(Bits));
647 return F;
648}
649
650/// This function takes a double and returns the bit equivalent 64-bit integer.
651/// Note that copying doubles around changes the bits of NaNs on some hosts,
652/// notably x86, so this routine cannot be used if these bits are needed.
653inline uint64_t DoubleToBits(double Double) {
654 uint64_t Bits;
655 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
656 memcpy(&Bits, &Double, sizeof(Double));
657 return Bits;
658}
659
660/// This function takes a float and returns the bit equivalent 32-bit integer.
661/// Note that copying floats around changes the bits of NaNs on some hosts,
662/// notably x86, so this routine cannot be used if these bits are needed.
663inline uint32_t FloatToBits(float Float) {
664 uint32_t Bits;
665 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
666 memcpy(&Bits, &Float, sizeof(Float));
667 return Bits;
668}
669
670/// A and B are either alignments or offsets. Return the minimum alignment that
671/// may be assumed after adding the two together.
672constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
673 // The largest power of 2 that divides both A and B.
674 //
675 // Replace "-Value" by "1+~Value" in the following commented code to avoid
676 // MSVC warning C4146
677 // return (A | B) & -(A | B);
678 return (A | B) & (1 + ~(A | B));
679}
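A worked example of the isolate-lowest-set-bit trick above: the lowest set bit of (A | B) is the largest power of two dividing both inputs:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t A = 8, B = 12;                   // 0b1000 and 0b1100
  uint64_t Min = (A | B) & (1 + ~(A | B));  // lowest set bit of 0b1100
  assert(Min == 4);                         // MinAlign(8, 12) == 4
  return 0;
}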
680
681/// Returns the next power of two (in 64-bits) that is strictly greater than A.
682/// Returns zero on overflow.
683inline uint64_t NextPowerOf2(uint64_t A) {
684 A |= (A >> 1);
685 A |= (A >> 2);
686 A |= (A >> 4);
687 A |= (A >> 8);
688 A |= (A >> 16);
689 A |= (A >> 32);
690 return A + 1;
691}
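A worked example of the smear-then-increment trick above, traced for an input of 5:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t A = 5;                            // 0b101
  A |= A >> 1;  A |= A >> 2;  A |= A >> 4;   // copy the top bit downward
  A |= A >> 8;  A |= A >> 16; A |= A >> 32;  // A is now 0b111 == 7
  assert(A + 1 == 8);                        // NextPowerOf2(5) == 8
  return 0;
}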
692
693/// Returns the power of two which is less than or equal to the given value.
694/// Essentially, it is a floor operation across the domain of powers of two.
695inline uint64_t PowerOf2Floor(uint64_t A) {
696 if (!A) return 0;
697 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
698}
699
700/// Returns the power of two which is greater than or equal to the given value.
701/// Essentially, it is a ceil operation across the domain of powers of two.
702inline uint64_t PowerOf2Ceil(uint64_t A) {
703 if (!A)
704 return 0;
705 return NextPowerOf2(A - 1);
706}
707
708/// Returns the next integer (mod 2**64) that is greater than or equal to
709/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
710///
711/// If non-zero \p Skew is specified, the return value will be a minimal
712/// integer that is greater than or equal to \p Value and equal to
713/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
714/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
715///
716/// Examples:
717/// \code
718/// alignTo(5, 8) = 8
719/// alignTo(17, 8) = 24
720/// alignTo(~0LL, 8) = 0
721/// alignTo(321, 255) = 510
722///
723/// alignTo(5, 8, 7) = 7
724/// alignTo(17, 8, 1) = 17
725/// alignTo(~0LL, 8, 3) = 3
726/// alignTo(321, 255, 42) = 552
727/// \endcode
728inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
729 assert(Align != 0u && "Align can't be 0.")(static_cast<void> (0));
730 Skew %= Align;
731 return (Value + Align - 1 - Skew) / Align * Align + Skew;
732}
733
734/// Returns the next integer (mod 2**64) that is greater than or equal to
735/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
736template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
737 static_assert(Align != 0u, "Align must be non-zero");
738 return (Value + Align - 1) / Align * Align;
739}
740
741/// Returns the integer ceil(Numerator / Denominator).
742inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
743 return alignTo(Numerator, Denominator) / Denominator;
744}
745
746/// Returns the integer nearest(Numerator / Denominator).
747inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
748 return (Numerator + (Denominator / 2)) / Denominator;
749}
750
751/// Returns the largest uint64_t less than or equal to \p Value that is
752/// \p Skew mod \p Align. \p Align must be non-zero.
753inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
754 assert(Align != 0u && "Align can't be 0.");
755 Skew %= Align;
756 return (Value - Skew) / Align * Align + Skew;
757}
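
// A minimal sketch (not part of MathExtras.h; the example function and values
// are illustrative): alignDown mirrors alignTo but truncates instead of
// rounding up, returning the largest value <= Value that is Skew mod Align.
#include "llvm/Support/MathExtras.h"
#include <cassert>

void alignDownExample() {
  assert(llvm::alignDown(17, 8) == 16);
  assert(llvm::alignDown(16, 8) == 16);     // already aligned
  assert(llvm::alignDown(17, 8, 3) == 11);  // 11 = 8*1 + 3 is the largest
                                            // value <= 17 that is 3 mod 8
}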
758
759/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
760/// Requires 0 < B <= 32.
761template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 32, "Bit width out of range.");
764 return int32_t(X << (32 - B)) >> (32 - B);
765}
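
// A minimal sketch (not part of MathExtras.h; the checks are illustrative) of
// the shift trick: the value is shifted left so that bit B-1 lands in the sign
// bit, then shifted back with an arithmetic (signed) right shift, which
// replicates that bit through the upper positions. The template is constexpr,
// so the checks run at compile time.
#include "llvm/Support/MathExtras.h"

static_assert(llvm::SignExtend32<4>(0xF) == -1, "bit 3 set, extends to -1");
static_assert(llvm::SignExtend32<4>(0x7) == 7, "bit 3 clear, value unchanged");
static_assert(llvm::SignExtend32<8>(0x80) == -128, "low byte 0x80 is -128");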
766
767/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
768/// Requires 0 < B <= 32.
769inline int32_t SignExtend32(uint32_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.");
771 assert(B <= 32 && "Bit width out of range.");
772 return int32_t(X << (32 - B)) >> (32 - B);
773}
774
775/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
776/// Requires 0 < B <= 64.
777template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
778 static_assert(B > 0, "Bit width can't be 0.");
779 static_assert(B <= 64, "Bit width out of range.");
780 return int64_t(x << (64 - B)) >> (64 - B);
781}
782
783/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
784/// Requires 0 < B <= 64.
785inline int64_t SignExtend64(uint64_t X, unsigned B) {
786 assert(B > 0 && "Bit width can't be 0.");
787 assert(B <= 64 && "Bit width out of range.");
788 return int64_t(X << (64 - B)) >> (64 - B);
789}
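
// A minimal sketch (not part of MathExtras.h; the example function and values
// are illustrative) of the run-time variants, for the case where the field
// width is only known at run time, e.g. from decoded instruction metadata.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

void signExtendRuntimeExample() {
  unsigned Width = 12;  // width known only at run time
  assert(llvm::SignExtend32(0xFFFu, Width) == -1);             // all 12 bits set
  assert(llvm::SignExtend64(UINT64_C(0x800), Width) == -2048); // only bit 11 set
}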
790
791/// Subtract two unsigned integers, X and Y, of type T and return the absolute
792/// value of the result.
793template <typename T>
794std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
795 return X > Y ? (X - Y) : (Y - X);
796}
797
798/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
799/// maximum representable value of T on overflow. ResultOverflowed indicates if
800/// the result is larger than the maximum representable value of type T.
801template <typename T>
802std::enable_if_t<std::is_unsigned<T>::value, T>
803SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
804 bool Dummy;
805 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
806 // Hacker's Delight, p. 29
807 T Z = X + Y;
808 Overflowed = (Z < X || Z < Y);
809 if (Overflowed)
810 return std::numeric_limits<T>::max();
811 else
812 return Z;
813}
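
// A minimal sketch (not part of MathExtras.h; the example function, type and
// values are illustrative): unsigned overflow wraps, so a wrapped sum is
// smaller than either operand, which is exactly the check used above.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

void saturatingAddExample() {
  bool Overflowed = false;
  // 200 + 100 wraps to 44 in uint8_t, so the result is clamped to 255.
  assert(llvm::SaturatingAdd<uint8_t>(200, 100, &Overflowed) == 255);
  assert(Overflowed);
  // 200 + 55 == 255 fits exactly and is not treated as an overflow.
  assert(llvm::SaturatingAdd<uint8_t>(200, 55, &Overflowed) == 255);
  assert(!Overflowed);
}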
814
815/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
816/// maximum representable value of T on overflow. ResultOverflowed indicates if
817/// the result is larger than the maximum representable value of type T.
818template <typename T>
819std::enable_if_t<std::is_unsigned<T>::value, T>
820SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
821 bool Dummy;
822 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
823
824 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
825 // because it fails for uint16_t (where multiplication can have undefined
826 // behavior due to promotion to int), and requires a division in addition
827 // to the multiplication.
828
829 Overflowed = false;
830
831 // Log2(Z) would be either Log2Z or Log2Z + 1.
832 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
833 // will necessarily be less than Log2Max as desired.
834 int Log2Z = Log2_64(X) + Log2_64(Y);
835 const T Max = std::numeric_limits<T>::max();
836 int Log2Max = Log2_64(Max);
837 if (Log2Z < Log2Max) {
838 return X * Y;
839 }
840 if (Log2Z > Log2Max) {
841 Overflowed = true;
842 return Max;
843 }
844
845 // We're going to use the top bit, and maybe overflow one
846 // bit past it. Multiply all but the bottom bit then add
847 // that on at the end.
848 T Z = (X >> 1) * Y;
849 if (Z & ~(Max >> 1)) {
850 Overflowed = true;
851 return Max;
852 }
853 Z <<= 1;
854 if (X & 1)
855 return SaturatingAdd(Z, Y, ResultOverflowed);
856
857 return Z;
858}
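
// A minimal sketch (not part of MathExtras.h; the example function, type and
// values are illustrative) of the three paths above: the product clearly fits
// (Log2Z < Log2Max), clearly overflows (Log2Z > Log2Max), or sits on the
// boundary, where the halved multiply plus SaturatingAdd decides.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

void saturatingMultiplyExample() {
  bool Overflowed = false;
  // 10 * 10: Log2Z = 3 + 3 < Log2Max = 7, so the plain multiply is used.
  assert(llvm::SaturatingMultiply<uint8_t>(10, 10, &Overflowed) == 100);
  assert(!Overflowed);
  // 16 * 16 = 256: Log2Z = 4 + 4 > 7, so it clamps to 255 without multiplying.
  assert(llvm::SaturatingMultiply<uint8_t>(16, 16, &Overflowed) == 255);
  assert(Overflowed);
  // 15 * 17 = 255: Log2Z == Log2Max == 7; the boundary path returns exactly 255.
  assert(llvm::SaturatingMultiply<uint8_t>(15, 17, &Overflowed) == 255);
  assert(!Overflowed);
}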
859
860/// Multiply two unsigned integers, X and Y, and add the unsigned integer A to
861/// the product. Clamp the result to the maximum representable value of T on
862/// overflow. ResultOverflowed indicates if the result is larger than the
863/// maximum representable value of type T.
864template <typename T>
865std::enable_if_t<std::is_unsigned<T>::value, T>
866SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
867 bool Dummy;
868 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
869
870 T Product = SaturatingMultiply(X, Y, &Overflowed);
871 if (Overflowed)
872 return Product;
873
874 return SaturatingAdd(A, Product, &Overflowed);
875}
876
877/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
878extern const float huge_valf;
879
880
881/// Add two signed integers, computing the two's complement truncated result,
882/// returning true if overflow occurred.
883template <typename T>
884std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
885#if __has_builtin(__builtin_add_overflow)
886 return __builtin_add_overflow(X, Y, &Result);
887#else
888 // Perform the unsigned addition.
889 using U = std::make_unsigned_t<T>;
890 const U UX = static_cast<U>(X);
891 const U UY = static_cast<U>(Y);
892 const U UResult = UX + UY;
893
894 // Convert to signed.
895 Result = static_cast<T>(UResult);
896
897 // Adding two positive numbers should result in a positive number.
898 if (X > 0 && Y > 0)
899 return Result <= 0;
900 // Adding two negatives should result in a negative number.
901 if (X < 0 && Y < 0)
902 return Result >= 0;
903 return false;
904#endif
905}
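
// A minimal sketch (not part of MathExtras.h; the example function and values
// are illustrative): with the builtin unavailable, the fallback adds in the
// unsigned domain (well defined), converts back, and reports overflow when the
// sign of the result contradicts the signs of the operands. Note the declared
// return type is T, but the value returned is the overflow flag converted to T.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <limits>

void addOverflowExample() {
  int Result = 0;
  // INT_MAX + 1 wraps to INT_MIN and is reported as an overflow.
  assert(llvm::AddOverflow(std::numeric_limits<int>::max(), 1, Result));
  assert(Result == std::numeric_limits<int>::min());
  // A sum that fits is stored as-is and reported as no overflow.
  assert(!llvm::AddOverflow(40, 2, Result));
  assert(Result == 42);
}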
906
907/// Subtract two signed integers, computing the two's complement truncated
908/// result, returning true if an overflow occurred.
909template <typename T>
910std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
911#if __has_builtin(__builtin_sub_overflow)
912 return __builtin_sub_overflow(X, Y, &Result);
913#else
914 // Perform the unsigned subtraction.
915 using U = std::make_unsigned_t<T>;
916 const U UX = static_cast<U>(X);
917 const U UY = static_cast<U>(Y);
918 const U UResult = UX - UY;
919
920 // Convert to signed.
921 Result = static_cast<T>(UResult);
922
923 // Subtracting a positive number from a negative results in a negative number.
924 if (X <= 0 && Y > 0)
925 return Result >= 0;
926 // Subtracting a negative number from a positive results in a positive number.
927 if (X >= 0 && Y < 0)
928 return Result <= 0;
929 return false;
930#endif
931}
932
933/// Multiply two signed integers, computing the two's complement truncated
934/// result, returning true if an overflow occurred.
935template <typename T>
936std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
937 // Perform the unsigned multiplication on absolute values.
938 using U = std::make_unsigned_t<T>;
939 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
940 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
941 const U UResult = UX * UY;
942
943 // Convert to signed.
944 const bool IsNegative = (X < 0) ^ (Y < 0);
945 Result = IsNegative ? (0 - UResult) : UResult;
946
947 // If any of the args was 0, result is 0 and no overflow occurs.
948 if (UX == 0 || UY == 0)
949 return false;
950
951 // UX and UY are in [1, 2^n], where n is the number of digits.
952 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
953 // positive) divided by an argument compares to the other.
954 if (IsNegative)
955 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
956 else
957 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
958}
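
// A minimal sketch (not part of MathExtras.h; the example function and values
// are illustrative): MulOverflow multiplies the absolute values in the
// unsigned domain, then detects overflow with a division against the largest
// allowed magnitude, which is one larger for a negative result than for a
// positive one.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
#include <limits>

void mulOverflowExample() {
  int32_t Result = 0;
  // 100000 * 100000 = 10^10 does not fit in 32 bits.
  assert(llvm::MulOverflow<int32_t>(100000, 100000, Result));
  // -65536 * 32768 = INT32_MIN fits: the negative range is one larger.
  assert(!llvm::MulOverflow<int32_t>(-65536, 32768, Result));
  assert(Result == std::numeric_limits<int32_t>::min());
  // 65536 * 32768 = 2^31 does not fit as a positive value.
  assert(llvm::MulOverflow<int32_t>(65536, 32768, Result));
}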
959
960} // End llvm namespace
961
962#endif