doxygen/GCNNSAReassign_8cpp_source.html

//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

/// \file

/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential

/// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA

/// with sequential versions where possible.

///

//===----------------------------------------------------------------------===//


#include "AMDGPU.h"

#include "GCNSubtarget.h"

#include "SIMachineFunctionInfo.h"

#include "SIRegisterInfo.h"

#include "llvm/ADT/Statistic.h"

#include "llvm/CodeGen/LiveIntervals.h"

#include "llvm/CodeGen/LiveRegMatrix.h"

#include "llvm/CodeGen/MachineFunctionPass.h"

#include "llvm/CodeGen/VirtRegMap.h"

#include "llvm/InitializePasses.h"


using namespace llvm;


#define DEBUG_TYPE "amdgpu-nsa-reassign"


STATISTIC(NumNSAInstructions,

          "Number of NSA instructions with non-sequential address found");

STATISTIC(NumNSAConverted,

          "Number of NSA instructions changed to sequential");


namespace {


class GCNNSAReassign : public MachineFunctionPass {

public:

  static char ID;


  GCNNSAReassign() : MachineFunctionPass(ID) {

    initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());

  }


  bool runOnMachineFunction(MachineFunction &MF) override;


  StringRef getPassName() const override { return "GCN NSA Reassign"; }


  void getAnalysisUsage(AnalysisUsage &AU) const override {

    AU.addRequired<LiveIntervalsWrapperPass>();

    AU.addRequired<VirtRegMapWrapperLegacy>();

    AU.addRequired<LiveRegMatrixWrapperLegacy>();

    AU.setPreservesAll();

    MachineFunctionPass::getAnalysisUsage(AU);

  }


private:

  using NSA_Status = enum {

    NOT_NSA,        // Not an NSA instruction

    FIXED,          // NSA which we cannot modify

    NON_CONTIGUOUS, // NSA with non-sequential address which we can try

                    // to optimize.

    CONTIGUOUS      // NSA with all sequential address registers

  };


  const GCNSubtarget *ST;


  const MachineRegisterInfo *MRI;


  const SIRegisterInfo *TRI;


  VirtRegMap *VRM;


  LiveRegMatrix *LRM;


  LiveIntervals *LIS;


  unsigned MaxNumVGPRs;


  const MCPhysReg *CSRegs;


  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;


  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,

                          unsigned StartReg) const;


  bool canAssign(unsigned StartReg, unsigned NumRegs) const;


  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;

};


} // End anonymous namespace.


INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",

                      false, false)

INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)

INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)

INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)

INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",

                    false, false)


char GCNNSAReassign::ID = 0;


char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;


bool

GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,

                                   unsigned StartReg) const {

  unsigned NumRegs = Intervals.size();


  for (unsigned N = 0; N < NumRegs; ++N)

    if (VRM->hasPhys(Intervals[N]->reg()))

      LRM->unassign(*Intervals[N]);


  for (unsigned N = 0; N < NumRegs; ++N)

    if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))

      return false;


  for (unsigned N = 0; N < NumRegs; ++N)

    LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));


  return true;

}


bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {

  for (unsigned N = 0; N < NumRegs; ++N) {

    unsigned Reg = StartReg + N;

    if (!MRI->isAllocatable(Reg))

      return false;


    for (unsigned I = 0; CSRegs[I]; ++I)

      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&

          !LRM->isPhysRegUsed(CSRegs[I]))

      return false;

  }


  return true;

}


bool

GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {

  unsigned NumRegs = Intervals.size();


  if (NumRegs > MaxNumVGPRs)

    return false;

  unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;


  for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {

    if (!canAssign(Reg, NumRegs))

      continue;


    if (tryAssignRegisters(Intervals, Reg))

      return true;

  }


  return false;

}


GCNNSAReassign::NSA_Status

GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {

  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());

  if (!Info)

    return NSA_Status::NOT_NSA;


  switch (Info->MIMGEncoding) {

  case AMDGPU::MIMGEncGfx10NSA:

  case AMDGPU::MIMGEncGfx11NSA:

    break;

  default:

    return NSA_Status::NOT_NSA;

  }


  int VAddr0Idx =

    AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);


  unsigned VgprBase = 0;

  bool NSA = false;

  for (unsigned I = 0; I < Info->VAddrOperands; ++I) {

    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);

    Register Reg = Op.getReg();

    if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))

      return NSA_Status::FIXED;


    Register PhysReg = VRM->getPhys(Reg);


    if (!Fast) {

      if (!PhysReg)

        return NSA_Status::FIXED;


      // TODO: address the below limitation to handle GFX11 BVH instructions

      // Bail if address is not a VGPR32. That should be possible to extend the

      // optimization to work with subregs of a wider register tuples, but the

      // logic to find free registers will be much more complicated with much

      // less chances for success. That seems reasonable to assume that in most

      // cases a tuple is used because a vector variable contains different

      // parts of an address and it is either already consecutive or cannot

      // be reassigned if not. If needed it is better to rely on register

      // coalescer to process such address tuples.

      if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 || Op.getSubReg())

        return NSA_Status::FIXED;


      // InlineSpiller does not call LRM::assign() after an LI split leaving

      // it in an inconsistent state, so we cannot call LRM::unassign().

      // See llvm bug #48911.

      // Skip reassign if a register has originated from such split.

      // FIXME: Remove the workaround when bug #48911 is fixed.

      if (VRM->getPreSplitReg(Reg))

        return NSA_Status::FIXED;


      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);


      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)

        return NSA_Status::FIXED;


      for (auto U : MRI->use_nodbg_operands(Reg)) {

        if (U.isImplicit())

          return NSA_Status::FIXED;

        const MachineInstr *UseInst = U.getParent();

        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)

          return NSA_Status::FIXED;

      }


      if (!LIS->hasInterval(Reg))

        return NSA_Status::FIXED;

    }


    if (I == 0)

      VgprBase = PhysReg;

    else if (VgprBase + I != PhysReg)

      NSA = true;

  }


  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;

}


bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {

  ST = &MF.getSubtarget<GCNSubtarget>();

  if (!ST->hasNSAEncoding() || !ST->hasNonNSAEncoding())

    return false;


  MRI = &MF.getRegInfo();

  TRI = ST->getRegisterInfo();

  VRM = &getAnalysis<VirtRegMapWrapperLegacy>().getVRM();

  LRM = &getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();

  LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();


  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);

  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);

  CSRegs = MRI->getCalleeSavedRegs();


  using Candidate = std::pair<const MachineInstr*, bool>;

  SmallVector<Candidate, 32> Candidates;

  for (const MachineBasicBlock &MBB : MF) {

    for (const MachineInstr &MI : MBB) {

      switch (CheckNSA(MI)) {

      default:

        continue;

      case NSA_Status::CONTIGUOUS:

        Candidates.push_back(std::pair(&MI, true));

        break;

      case NSA_Status::NON_CONTIGUOUS:

        Candidates.push_back(std::pair(&MI, false));

        ++NumNSAInstructions;

        break;

      }

    }

  }


  bool Changed = false;

  for (auto &C : Candidates) {

    if (C.second)

      continue;


    const MachineInstr *MI = C.first;

    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {

      // Already happen to be fixed.

      C.second = true;

      ++NumNSAConverted;

      continue;

    }


    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());

    int VAddr0Idx =

      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);


    SmallVector<LiveInterval *, 16> Intervals;

    SmallVector<MCRegister, 16> OrigRegs;

    SlotIndex MinInd, MaxInd;

    for (unsigned I = 0; I < Info->VAddrOperands; ++I) {

      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);

      Register Reg = Op.getReg();

      LiveInterval *LI = &LIS->getInterval(Reg);

      if (llvm::is_contained(Intervals, LI)) {

        // Same register used, unable to make sequential

        Intervals.clear();

        break;

      }

      Intervals.push_back(LI);

      OrigRegs.push_back(VRM->getPhys(Reg));

      if (LI->empty()) {

        // The address input is undef, so it doesn't contribute to the relevant

        // range. Seed a reasonable index range if required.

        if (I == 0)

          MinInd = MaxInd = LIS->getInstructionIndex(*MI);

        continue;

      }

      MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();

      MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();

    }


    if (Intervals.empty())

      continue;


    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI

                      << "\tOriginal allocation:\t";

               for (auto *LI

                    : Intervals) dbgs()

               << " " << llvm::printReg((VRM->getPhys(LI->reg())), TRI);

               dbgs() << '\n');


    bool Success = scavengeRegs(Intervals);

    if (!Success) {

      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");

      if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.

        continue;

    } else {

      // Check we did not make it worse for other instructions.

      auto *I =

          std::lower_bound(Candidates.begin(), &C, MinInd,

                           [this](const Candidate &C, SlotIndex I) {

                             return LIS->getInstructionIndex(*C.first) < I;

                           });

      for (auto *E = Candidates.end();

           Success && I != E && LIS->getInstructionIndex(*I->first) < MaxInd;

           ++I) {

        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {

          Success = false;

          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);

        }

      }

    }


    if (!Success) {

      for (unsigned I = 0; I < Info->VAddrOperands; ++I)

        if (VRM->hasPhys(Intervals[I]->reg()))

          LRM->unassign(*Intervals[I]);


      for (unsigned I = 0; I < Info->VAddrOperands; ++I)

        LRM->assign(*Intervals[I], OrigRegs[I]);


      continue;

    }


    C.second = true;

    ++NumNSAConverted;

    LLVM_DEBUG(

        dbgs() << "\tNew allocation:\t\t ["

               << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)

               << " : "

               << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)

               << "]\n");

    Changed = true;

  }


  return Changed;

}

MRI
unsigned const MachineRegisterInfo * MRI
Definition: AArch64AdvSIMDScalarPass.cpp:105

Success
#define Success
Definition: AArch64Disassembler.cpp:220

AMDGPU.h

MBB
MachineBasicBlock & MBB
Definition: ARMSLSHardening.cpp:71

Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27

LLVM_DEBUG
#define LLVM_DEBUG(...)
Definition: Debug.h:106

Reassign
GCN NSA Reassign
Definition: GCNNSAReassign.cpp:100

DEBUG_TYPE
#define DEBUG_TYPE
Definition: GCNNSAReassign.cpp:29

GCNSubtarget.h
AMD GCN specific subclass of TargetSubtarget.

MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:112

InitializePasses.h

LiveIntervals.h

LiveRegMatrix.h

I
#define I(x, y, z)
Definition: MD5.cpp:58

MachineFunctionPass.h

TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1945

INITIALIZE_PASS_DEPENDENCY
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55

INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57

INITIALIZE_PASS_BEGIN
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52

SIMachineFunctionInfo.h

SIRegisterInfo.h
Interface definition for SIRegisterInfo.

Statistic.h
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...

STATISTIC
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166

VirtRegMap.h

llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47

llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75

llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition: PassAnalysisSupport.h:130

llvm::DWARFExpression::Operation
This class represents an Operation in the Expression.
Definition: DWARFExpression.h:32

llvm::GCNSubtarget
Definition: GCNSubtarget.h:34

llvm::LiveInterval
LiveInterval - This class represents the liveness of a register, or stack slot.
Definition: LiveInterval.h:687

llvm::LiveInterval::reg
Register reg() const
Definition: LiveInterval.h:718

llvm::LiveIntervalsWrapperPass
Definition: LiveIntervals.h:524

llvm::LiveIntervals
Definition: LiveIntervals.h:55

llvm::LiveRange::empty
bool empty() const
Definition: LiveInterval.h:382

llvm::LiveRange::beginIndex
SlotIndex beginIndex() const
beginIndex - Return the lowest numbered slot covered.
Definition: LiveInterval.h:385

llvm::LiveRange::endIndex
SlotIndex endIndex() const
endNumber - return the maximum point of the range of the whole, exclusive.
Definition: LiveInterval.h:392

llvm::LiveRegMatrixWrapperLegacy
Definition: LiveRegMatrix.h:173

llvm::LiveRegMatrix
Definition: LiveRegMatrix.h:40

llvm::MCRegister::from
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
Definition: MCRegister.h:78

llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:125

llvm::MachineFunctionPass
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
Definition: MachineFunctionPass.h:30

llvm::MachineFunctionPass::getAnalysisUsage
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Definition: MachineFunctionPass.cpp:169

llvm::MachineFunctionPass::runOnMachineFunction
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...

llvm::MachineFunction
Definition: MachineFunction.h:258

llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:724

llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition: MachineFunction.h:734

llvm::MachineFunction::getInfo
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Definition: MachineFunction.h:822

llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:69

llvm::MachineInstr::isCopy
bool isCopy() const
Definition: MachineInstr.h:1438

llvm::MachineInstr::getOperand
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585

llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition: MachineOperand.h:48

llvm::MachineOperand::getReg
Register getReg() const
getReg - Returns the register number.
Definition: MachineOperand.h:369

llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition: MachineRegisterInfo.h:51

llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:24

llvm::Pass::getPassName
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81

llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19

llvm::SIMachineFunctionInfo
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Definition: SIMachineFunctionInfo.h:390

llvm::SIMachineFunctionInfo::getOccupancy
unsigned getOccupancy() const
Definition: SIMachineFunctionInfo.h:1096

llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:32

llvm::SlotIndex
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:65

llvm::SmallVectorBase::empty
bool empty() const
Definition: SmallVector.h:81

llvm::SmallVectorBase::size
size_t size() const
Definition: SmallVector.h:78

llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573

llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:610

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition: SmallVector.h:413

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196

llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51

llvm::VirtRegMapWrapperLegacy
Definition: VirtRegMap.h:193

llvm::VirtRegMap
Definition: VirtRegMap.h:34

uint16_t

unsigned

false
Definition: StackSlotColoring.cpp:193

llvm::AMDGPU::getMIMGInfo
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)

llvm::AMDGPU::getNamedOperandIdx
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)

llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73

llvm::CallingConv::Fast
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41

llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34

llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24

llvm::M68k::MemAddrModeKind::U
@ U

llvm::X86Disassembler::Reg
Reg
All possible values of the reg field in the ModR/M byte.
Definition: X86DisassemblerDecoder.h:621

llvm::rdf::Def
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18

llvm::initializeGCNNSAReassignPass
void initializeGCNNSAReassignPass(PassRegistry &)

llvm::GCNNSAReassignID
char & GCNNSAReassignID
Definition: GCNNSAReassign.cpp:106

llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163

llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903

llvm::printReg
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
Definition: TargetRegisterInfo.cpp:107

N
#define N

llvm::AMDGPU::MIMGInfo
Definition: AMDGPUBaseInfo.h:509