LLVM 23.0.0git
AMDGPUSubtarget.h
Go to the documentation of this file.
1//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Base class for AMDGPU specific classes of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
18#include "llvm/IR/CallingConv.h"
21
22namespace llvm {
23
24enum AMDGPUDwarfFlavour : unsigned;
25class Function;
26class Instruction;
27class MachineFunction;
28class TargetMachine;
29
31public:
47
48private:
49 const Triple &TargetTriple;
50
51protected:
52 bool HasMulI24 = true;
53 bool HasMulU24 = true;
54 bool HasSMulHi = false;
55 bool HasFminFmaxLegacy = true;
56
57 unsigned EUsPerCU = 4;
58 unsigned MaxWavesPerEU = 10;
59 unsigned LocalMemorySize = 0;
63 unsigned FlatOffsetBitWidth = 0;
64
65public:
66 AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}
67
68 static const AMDGPUSubtarget &get(const MachineFunction &MF);
69 static const AMDGPUSubtarget &get(const TargetMachine &TM,
70 const Function &F);
71
72 /// \returns Default flat work group size range for calling convention \p CC.
73 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
74
75 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
76 /// for function \p F, or minimum/maximum flat work group sizes explicitly
77 /// requested using "amdgpu-flat-work-group-size" attribute attached to
78 /// function \p F.
79 ///
80 /// \returns Subtarget's default values if explicitly requested values cannot
81 /// be converted to integer, or violate subtarget's specifications.
82 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
83
84 /// \returns true if the maximum flat work-group size for \p F is at most the
85 /// wavefront size, so a work-group may fit in a single wavefront.
86 bool isSingleWavefrontWorkgroup(const Function &F) const;
87
88 /// \returns The required size of workgroups that will be used to execute \p F
89 /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
90 /// metadata). Otherwise, returns std::nullopt.
91 std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
92 unsigned Dim) const;
93
94 /// \returns true if \p F will execute in a manner that leaves the X
95 /// dimensions of the workitem ID evenly tiling wavefronts - that is, if X /
96 /// wavefrontsize is uniform. This is true if either the Y and Z block
97 /// dimensions are known to always be 1 or if the X dimension will always be a
98 /// power of 2. If \p RequiresUniformYZ is true, it also ensures that the Y
99 /// and Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
100 /// wavesize64 would ordinarily pass this test, it won't with
101 /// \p RequiresUniformYZ).
102 ///
103 /// This information is currently only gathered from the !reqd_work_group_size
104 /// metadata on \p F, but this may be improved in the future.
106 bool REquiresUniformYZ = false) const;
107
108 /// \returns Subtarget's default pair of minimum/maximum number of waves per
109 /// execution unit for function \p F, or minimum/maximum number of waves per
110 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
111 /// attached to function \p F.
112 ///
113 /// \returns Subtarget's default values if explicitly requested values cannot
114 /// be converted to integer, violate subtarget's specifications, or are not
115 /// compatible with minimum/maximum number of waves limited by flat work group
116 /// size, register usage, and/or lds usage.
117 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
118
119 /// Overload which uses the specified values for the flat workgroup sizes and
120 /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
121 /// should correspond to the function's value for getFlatWorkGroupSizes and \p
122 /// LDSBytes to the per-workgroup LDS allocation.
123 std::pair<unsigned, unsigned>
124 getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
125 unsigned LDSBytes, const Function &F) const;
126
127 /// Returns the target minimum/maximum number of waves per EU. This is based
128 /// on the minimum/maximum number of \p RequestedWavesPerEU and further
129 /// limited by the maximum achievable occupancy derived from the range of \p
130 /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
131 std::pair<unsigned, unsigned>
132 getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
133 std::pair<unsigned, unsigned> FlatWorkGroupSizes,
134 unsigned LDSBytes) const;
135
136 /// Return the amount of LDS that can be used that will not restrict the
137 /// occupancy lower than WaveCount.
138 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
139 const Function &) const;
140
141 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
142 /// be achieved when the only function running on a CU is \p F and each
143 /// workgroup running the function requires \p LDSBytes bytes of LDS space.
144 /// This notably depends on the range of allowed flat group sizes for the
145 /// function and hardware characteristics.
146 std::pair<unsigned, unsigned>
150
151 /// Overload which uses the specified values for the flat work group sizes,
152 /// rather than querying the function itself. \p FlatWorkGroupSizes should
153 /// correspond to the function's value for getFlatWorkGroupSizes.
154 std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
155 uint32_t LDSBytes,
156 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
157
158 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
159 /// be achieved when the only function running on a CU is \p MF. This notably
160 /// depends on the range of allowed flat group sizes for the function, the
161 /// amount of per-workgroup LDS space required by the function, and hardware
162 /// characteristics.
163 std::pair<unsigned, unsigned>
165
166 bool isAmdHsaOS() const {
167 return TargetTriple.getOS() == Triple::AMDHSA;
168 }
169
170 bool isAmdPalOS() const {
171 return TargetTriple.getOS() == Triple::AMDPAL;
172 }
173
174 bool isMesa3DOS() const {
175 return TargetTriple.getOS() == Triple::Mesa3D;
176 }
177
178 bool isMesaKernel(const Function &F) const;
179
180 bool isAmdHsaOrMesa(const Function &F) const {
181 return isAmdHsaOS() || isMesaKernel(F);
182 }
183
184 bool isGCN() const { return TargetTriple.isAMDGCN(); }
185
186 //==---------------------------------------------------------------------===//
187 // TableGen-generated feature getters.
188 //==---------------------------------------------------------------------===//
// Every expansion of GET_SUBTARGETINFO_MACRO below declares a virtual getter
// named GETTER that returns false; the ATTRIBUTE and DEFAULT arguments are not
// used by this definition of the macro.
// NOTE(review): presumably the included .inc expands the macro once per
// subtarget feature and derived subtargets override the getters - confirm
// against the generated file (including whether it #undefs the macro).
189#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
190 virtual bool GETTER() const { return false; }
191#include "AMDGPUGenSubtargetInfo.inc"
192 //==---------------------------------------------------------------------===//
193
194 /// Return true if real (non-fake) variants of True16 instructions using
195 /// 16-bit registers should be code-generated. Fake True16 instructions are
196 /// identical to non-fake ones except that they take 32-bit registers as
197 /// operands and always use their low halves.
198 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
199 // supported and the support for fake True16 instructions is removed.
200 bool useRealTrue16Insts() const {
201 return hasTrue16BitInsts() && enableRealTrue16Insts();
202 }
203
204 bool hasMulI24() const {
205 return HasMulI24;
206 }
207
208 bool hasMulU24() const {
209 return HasMulU24;
210 }
211
212 bool hasSMulHi() const {
213 return HasSMulHi;
214 }
215
216 bool hasFminFmaxLegacy() const {
217 return HasFminFmaxLegacy;
218 }
219
220 unsigned getWavefrontSize() const {
221 return 1 << WavefrontSizeLog2;
222 }
223
224 unsigned getWavefrontSizeLog2() const {
225 return WavefrontSizeLog2;
226 }
227
228 /// Return the maximum number of bytes of LDS available for all workgroups
229 /// running on the same WGP or CU.
230 /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
231 /// limited to 64k.
232 unsigned getLocalMemorySize() const {
233 return LocalMemorySize;
234 }
235
236 /// Return the maximum number of bytes of LDS that can be allocated to a
237 /// single workgroup.
238 /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
239 /// 128k in total.
242 }
243
244 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
245 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
246 /// CU mode into account.
247 unsigned getEUsPerCU() const { return EUsPerCU; }
248
250 return isAmdHsaOS() ? Align(8) : Align(4);
251 }
252
253 /// Returns the offset in bytes from the start of the input buffer
254 /// of the first explicit kernel argument.
255 unsigned getExplicitKernelArgOffset() const {
256 switch (TargetTriple.getOS()) {
257 case Triple::AMDHSA:
258 case Triple::AMDPAL:
259 case Triple::Mesa3D:
260 return 0;
262 default:
263 // For legacy reasons unknown/other is treated as a different version of
264 // mesa.
265 return 36;
266 }
267
268 llvm_unreachable("invalid triple OS");
269 }
270
271 /// \returns Maximum number of work groups per compute unit supported by the
272 /// subtarget and limited by given \p FlatWorkGroupSize.
273 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
274
275 /// \returns Minimum flat work group size supported by the subtarget.
276 virtual unsigned getMinFlatWorkGroupSize() const = 0;
277
278 /// \returns Maximum flat work group size supported by the subtarget.
279 virtual unsigned getMaxFlatWorkGroupSize() const = 0;
280
281 /// \returns Number of waves per execution unit required to support the given
282 /// \p FlatWorkGroupSize.
283 virtual unsigned
284 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
285
286 /// \returns Minimum number of waves per execution unit supported by the
287 /// subtarget.
288 virtual unsigned getMinWavesPerEU() const = 0;
289
290 /// \returns Maximum number of waves per execution unit supported by the
291 /// subtarget without any kind of limitation.
292 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
293
294 /// Return the maximum workitem ID value in the function, for the given (0, 1,
295 /// 2) dimension.
296 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
297
298 /// Return true if only a single workitem can be active in a wave.
299 bool isSingleLaneExecution(const Function &Kernel) const;
300
301 /// Creates value range metadata on a workitemid.* intrinsic call or load.
303
304 /// \returns Number of bytes of arguments that are passed to a shader or
305 /// kernel in addition to the explicit ones declared for the function.
306 unsigned getImplicitArgNumBytes(const Function &F) const;
307 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
308 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
309
310 /// \returns Corresponding DWARF register number mapping flavour for the
311 /// \p WavefrontSize.
313
/// Virtual destructor, so that an object of a derived subtarget class can be
/// destroyed through a pointer or reference to this base class.
314 virtual ~AMDGPUSubtarget() = default;
315};
316
317} // end namespace llvm
318
319#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file defines the SmallVector class.
bool hasFminFmaxLegacy() const
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
Align getAlignmentForImplicitArgPtr() const
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
bool isMesaKernel(const Function &F) const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
virtual unsigned getMinWavesPerEU() const =0
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on an workitemid.* intrinsic call or load.
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getLocalMemorySize() const
Return the maximum number of bytes of LDS available for all workgroups running on the same WGP or CU.
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0
unsigned getWavefrontSizeLog2() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
virtual ~AMDGPUSubtarget()=default
bool isAmdHsaOrMesa(const Function &F) const
AMDGPUSubtarget(const Triple &TT)
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
virtual unsigned getMaxFlatWorkGroupSize() const =0
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getMaxWavesPerEU() const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
unsigned AddressableLocalMemorySize
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
unsigned getWavefrontSize() const
virtual unsigned getMinFlatWorkGroupSize() const =0
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > RequestedWavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes, unsigned LDSBytes) const
Returns the target minimum/maximum number of waves per EU.
bool isSingleWavefrontWorkgroup(const Function &F) const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
Primary interface to the complete machine description for the target machine.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39