26#include "llvm/IR/IntrinsicsAMDGPU.h"
27#include "llvm/IR/IntrinsicsR600.h"
33#define DEBUG_TYPE "amdgpu-subtarget"
43 const unsigned WavesPerWorkgroup =
44 std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);
46 const unsigned WorkGroupsPerCU =
47 std::max(1u, (NWaves *
getEUsPerCU()) / WavesPerWorkgroup);
53 uint32_t LDSBytes, std::pair<unsigned, unsigned> FlatWorkGroupSizes)
const {
58 LDSBytes =
alignTo(LDSBytes, Granularity);
70 auto PropsFromWGSize = [=](
unsigned WGSize)
71 -> std::tuple<const unsigned, const unsigned, unsigned> {
72 unsigned WavesPerWG =
divideCeil(WGSize, WaveSize);
74 return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
81 const auto [MinWGSize, MaxWGSize] = FlatWorkGroupSizes;
82 auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
83 auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);
88 if (MinWavesPerCU >= MaxWavesPerCU) {
91 const unsigned WaveSlotsPerCU = WavesPerEU *
getEUsPerCU();
96 unsigned MinWavesPerCUForWGSize =
97 divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU;
98 if (MinWavesPerCU > MinWavesPerCUForWGSize) {
99 unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
100 if (
unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
107 MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG,
108 MaxWavesPerWG - MinWavesPerWG);
115 unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
116 if (
unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
123 MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG,
124 ((MaxWGSize - 1) / WaveSize) + 1 -
131 return {std::clamp(MinWavesPerCU /
getEUsPerCU(), 1U, WavesPerEU),
141std::pair<unsigned, unsigned>
159 std::pair<unsigned, unsigned>
Default =
164 F,
"amdgpu-flat-work-group-size",
Default);
167 if (Requested.first > Requested.second)
184 std::pair<unsigned, unsigned> RequestedWavesPerEU,
185 std::pair<unsigned, unsigned> FlatWorkGroupSizes,
unsigned LDSBytes)
const {
190 std::pair<unsigned, unsigned>
Default = {
197 if (RequestedWavesPerEU.first <
Default.first ||
198 RequestedWavesPerEU.first >
Default.second ||
199 RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
204 RequestedWavesPerEU.second =
205 std::min(RequestedWavesPerEU.second,
Default.second);
206 return RequestedWavesPerEU;
209std::pair<unsigned, unsigned>
221std::pair<unsigned, unsigned>
223 unsigned LDSBytes,
const Function &
F)
const {
228 std::pair<unsigned, unsigned> Requested =
233std::optional<unsigned>
235 unsigned Dim)
const {
237 if (
Node &&
Node->getNumOperands() == 3)
243 const Function &
F,
bool RequiresUniformYZ)
const {
244 auto *
Node =
F.getMetadata(
"reqd_work_group_size");
245 if (!
Node ||
Node->getNumOperands() != 3)
254 bool Is1D = YLen <= 1 && ZLen <= 1;
255 bool IsXLargeEnough =
257 return Is1D || IsXLargeEnough;
265 unsigned Dimension)
const {
268 return *ReqdSize - 1;
273 for (
int I = 0;
I < 3; ++
I) {
280 if (!Func.hasFnAttribute(
"amdgpu-no-wwm"))
288 unsigned MinSize = 0;
290 bool IdQuery =
false;
294 const Function *
F = CI->getCalledFunction();
296 unsigned Dim = UINT_MAX;
297 switch (
F->getIntrinsicID()) {
298 case Intrinsic::amdgcn_workitem_id_x:
299 case Intrinsic::r600_read_tidig_x:
302 case Intrinsic::r600_read_local_size_x:
305 case Intrinsic::amdgcn_workitem_id_y:
306 case Intrinsic::r600_read_tidig_y:
309 case Intrinsic::r600_read_local_size_y:
312 case Intrinsic::amdgcn_workitem_id_z:
313 case Intrinsic::r600_read_tidig_z:
316 case Intrinsic::r600_read_local_size_z:
326 MinSize = MaxSize = *ReqdSize;
345 CI->addRangeRetAttr(
Range);
349 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
358 if (
F.hasFnAttribute(
"amdgpu-no-implicitarg-ptr"))
365 const Module *M =
F.getParent();
368 return F.getFnAttributeAsParsedInteger(
"amdgpu-implicitarg-num-bytes",
373 Align &MaxAlign)
const {
382 if (Arg.hasAttribute(
"amdgpu-hidden-argument"))
385 const bool IsByRef = Arg.hasByRefAttr();
386 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
387 Align Alignment =
DL.getValueOrABITypeAlignment(
388 IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
389 uint64_t AllocSize =
DL.getTypeAllocSize(ArgTy);
390 ExplicitArgBytes =
alignTo(ExplicitArgBytes, Alignment) + AllocSize;
391 MaxAlign = std::max(MaxAlign, Alignment);
394 return ExplicitArgBytes;
398 Align &MaxAlign)
const {
407 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
409 if (ImplicitBytes != 0) {
411 TotalSize =
alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
412 MaxAlign = std::max(MaxAlign, Alignment);
441 std::numeric_limits<uint32_t>::max());
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file describes how to lower LLVM calls to machine code calls.
This file declares the targeting of the InstructionSelector class for AMDGPU.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
Base class for AMDGPU specific classes of TargetSubtarget.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file describes how to lower LLVM inline asm to machine code INLINEASM.
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
AMDGPU R600 specific subclass of TargetSubtarget.
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
Align getAlignmentForImplicitArgPtr() const
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
bool isMesaKernel(const Function &F) const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on a workitemid.* intrinsic call or load.
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getLocalMemorySize() const
Return the maximum number of bytes of LDS available for all workgroups running on the same WGP or CU.
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
AMDGPUSubtarget(const Triple &TT)
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
virtual unsigned getMaxFlatWorkGroupSize() const =0
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getMaxWavesPerEU() const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
unsigned getWavefrontSize() const
virtual unsigned getMinFlatWorkGroupSize() const =0
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > RequestedWavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes, unsigned LDSBytes) const
Returns the target minimum/maximum number of waves per EU.
bool isSingleWavefrontWorkgroup(const Function &F) const
unsigned LDSAllocationGranularity
Class for arbitrary precision integers.
This class represents an incoming formal argument to a Function.
This class represents a range of values.
A parsed version of the target data layout string, and methods for querying it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this GlobalObject.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
A Module instance is used to store all the information related to an LLVM module.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
bool isAMDGCN() const
Tests whether the target is AMDGCN.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
SmallVector< unsigned > getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size, unsigned DefaultVal)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
This is an optimization pass for GlobalISel generic memory operations.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ Default
The result value is uniform if and only if all operands are uniform.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.