LLVM 20.0.0git
AMDGPUPreloadKernArgProlog.cpp
//===- AMDGPUPreloadKernArgProlog.cpp - Preload KernArg Prolog ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass creates a backward compatibility layer for kernel argument
/// preloading in situations where code is compiled with kernel argument
/// preloading enabled but executed on hardware without firmware support for it.
///
/// To avoid recompilation, the pass inserts a block at the beginning of the
/// program that loads the kernel arguments into SGPRs using s_load
/// instructions. This sets up the registers exactly as they would be on systems
/// with compatible firmware.
///
/// This effectively creates two entry points for the kernel. Firmware that
/// supports the feature will automatically jump past the first 256 bytes of the
/// program, skipping the compatibility layer and directly starting execution on
/// the optimized code path.
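///
/// For example, a kernel that preloads eight SGPRs of arguments might get a
/// prologue roughly like the following (register numbers and the load width
/// are illustrative only; the real values depend on the kernel's user SGPR
/// layout):
///
///   s_load_dwordx8 s[8:15], s[4:5], 0x0
///   s_waitcnt lgkmcnt(0)
///   s_branch .kernel_entry
///   ; padding up to the 256 byte offset of the alternative entry point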
///
/// This pass should be run as late as possible to prevent any optimizations
/// that might assume the padding is dead code or that the added prologue is a
/// true predecessor of the kernel entry block.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUPreloadKernArgProlog.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-preload-kern-arg-prolog"

namespace {

// Used to build s_loads mapping user SGPRs to kernel arguments
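// Size is the number of 32-bit words a single load covers; LoadReg is filled
// in once a destination register has been chosen.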
struct LoadConfig {
  unsigned Size;
  const TargetRegisterClass *RegClass;
  unsigned Opcode;
  Register LoadReg = Register();
};

class AMDGPUPreloadKernArgProlog {
public:
  AMDGPUPreloadKernArgProlog(MachineFunction &MF);

  bool run();

private:
  MachineFunction &MF;
  const GCNSubtarget &ST;
  const SIMachineFunctionInfo &MFI;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;

  // Create a new block before the entry point to the kernel. Firmware that
  // supports preloading kernel arguments will automatically jump past this
  // block to the alternative kernel entry point.
  void createBackCompatBlock(unsigned NumKernArgPreloadSGPRs);

  // Add instructions to load kernel arguments into SGPRs.
  void addBackCompatLoads(MachineBasicBlock *BackCompatMBB,
                          Register KernArgSegmentPtr,
                          unsigned NumKernArgPreloadSGPRs);
};

class AMDGPUPreloadKernArgPrologLegacy : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPreloadKernArgPrologLegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU Preload Kernel Arguments Prolog";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

char AMDGPUPreloadKernArgPrologLegacy::ID = 0;

INITIALIZE_PASS(AMDGPUPreloadKernArgPrologLegacy, DEBUG_TYPE,
                "AMDGPU Preload Kernel Arguments Prolog", false, false)

char &llvm::AMDGPUPreloadKernArgPrologLegacyID =
    AMDGPUPreloadKernArgPrologLegacy::ID;

FunctionPass *llvm::createAMDGPUPreloadKernArgPrologLegacyPass() {
  return new AMDGPUPreloadKernArgPrologLegacy();
}

bool AMDGPUPreloadKernArgPrologLegacy::runOnMachineFunction(
    MachineFunction &MF) {
  return AMDGPUPreloadKernArgProlog(MF).run();
}

AMDGPUPreloadKernArgProlog::AMDGPUPreloadKernArgProlog(MachineFunction &MF)
    : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
      MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(*ST.getInstrInfo()),
      TRI(*ST.getRegisterInfo()) {}

bool AMDGPUPreloadKernArgProlog::run() {
  if (!ST.hasKernargPreload())
    return false;

  unsigned NumKernArgPreloadSGPRs = MFI.getNumKernargPreloadedSGPRs();
  if (!NumKernArgPreloadSGPRs)
    return false;

  createBackCompatBlock(NumKernArgPreloadSGPRs);
  return true;
}

void AMDGPUPreloadKernArgProlog::createBackCompatBlock(
    unsigned NumKernArgPreloadSGPRs) {
  auto KernelEntryMBB = MF.begin();
  MachineBasicBlock *BackCompatMBB = MF.CreateMachineBasicBlock();
  MF.insert(KernelEntryMBB, BackCompatMBB);

  assert(MFI.getUserSGPRInfo().hasKernargSegmentPtr() &&
         "Kernel argument segment pointer register not set.");
  Register KernArgSegmentPtr = MFI.getArgInfo().KernargSegmentPtr.getRegister();
  BackCompatMBB->addLiveIn(KernArgSegmentPtr);

  // Load kernel arguments to SGPRs
  addBackCompatLoads(BackCompatMBB, KernArgSegmentPtr, NumKernArgPreloadSGPRs);

  // Wait for loads to complete
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  unsigned Waitcnt =
      AMDGPU::encodeWaitcnt(IV, getVmcntBitMask(IV), getExpcntBitMask(IV), 0);
  BuildMI(BackCompatMBB, DebugLoc(), TII.get(AMDGPU::S_WAITCNT))
      .addImm(Waitcnt);
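  // Only LGKMcnt needs to drain here: the prologue issues nothing but scalar
  // (SMEM) loads, so VMcnt and EXPcnt are encoded at their maximum (no-wait)
  // values.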

  // Branch to kernel start
  BuildMI(BackCompatMBB, DebugLoc(), TII.get(AMDGPU::S_BRANCH))
      .addMBB(&*KernelEntryMBB);
  BackCompatMBB->addSuccessor(&*KernelEntryMBB);

  // Create a new basic block for padding to 256 bytes
  MachineBasicBlock *PadMBB = MF.CreateMachineBasicBlock();
  MF.insert(++BackCompatMBB->getIterator(), PadMBB);
  PadMBB->setAlignment(Align(256));
  PadMBB->addSuccessor(&*KernelEntryMBB);
}

/// Find the largest possible load size that fits with SGPR alignment
static LoadConfig getLoadParameters(const TargetRegisterInfo &TRI,
                                    Register KernArgPreloadSGPR,
                                    unsigned NumKernArgPreloadSGPRs) {
  static constexpr LoadConfig Configs[] = {
      {8, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM},
      {4, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM},
      {2, &AMDGPU::SReg_64RegClass, AMDGPU::S_LOAD_DWORDX2_IMM}};

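  // Try the widest load first. getMatchingSuperReg only returns a register
  // when a tuple of the requested class actually starts at KernArgPreloadSGPR,
  // so an insufficiently aligned start falls through to a narrower load.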
  for (const auto &Config : Configs) {
    if (NumKernArgPreloadSGPRs >= Config.Size) {
      Register LoadReg = TRI.getMatchingSuperReg(KernArgPreloadSGPR,
                                                 AMDGPU::sub0, Config.RegClass);
      if (LoadReg) {
        LoadConfig C(Config);
        C.LoadReg = LoadReg;
        return C;
      }
    }
  }

  // Fallback to a single register
  return LoadConfig{1, &AMDGPU::SReg_32RegClass, AMDGPU::S_LOAD_DWORD_IMM,
                    KernArgPreloadSGPR};
}

void AMDGPUPreloadKernArgProlog::addBackCompatLoads(
    MachineBasicBlock *BackCompatMBB, Register KernArgSegmentPtr,
    unsigned NumKernArgPreloadSGPRs) {
  Register KernArgPreloadSGPR = MFI.getArgInfo().FirstKernArgPreloadReg;
  unsigned Offset = 0;
  // Fill all user SGPRs used for kernarg preloading with sequential data from
  // the kernarg segment
  while (NumKernArgPreloadSGPRs > 0) {
    LoadConfig Config =
        getLoadParameters(TRI, KernArgPreloadSGPR, NumKernArgPreloadSGPRs);

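    // S_LOAD_*_IMM operands: destination (Config.LoadReg), 64-bit kernarg
    // segment base, byte offset into the segment, and zero cache-policy bits.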
    BuildMI(BackCompatMBB, DebugLoc(), TII.get(Config.Opcode), Config.LoadReg)
        .addReg(KernArgSegmentPtr)
        .addImm(Offset)
        .addImm(0);

    Offset += 4 * Config.Size;
    KernArgPreloadSGPR = KernArgPreloadSGPR.asMCReg() + Config.Size;
    NumKernArgPreloadSGPRs -= Config.Size;
  }
}

PreservedAnalyses
AMDGPUPreloadKernArgPrologPass::run(MachineFunction &MF,
                                    MachineFunctionAnalysisManager &MFAM) {
  if (!AMDGPUPreloadKernArgProlog(MF).run())
    return PreservedAnalyses::all();

  return PreservedAnalyses::none();
}