LLVM 20.0.0git
AMDGPUSubtarget.h
Go to the documentation of this file.
1//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Base class for AMDGPU specific classes of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
17#include "llvm/IR/CallingConv.h"
20
21namespace llvm {
22
24class Function;
25class Instruction;
26class MachineFunction;
27class TargetMachine;
28
30public:
33 R600 = 1,
34 R700 = 2,
40 GFX9 = 8,
41 GFX10 = 9,
42 GFX11 = 10,
43 GFX12 = 11,
44 };
45
46private:
47 Triple TargetTriple;
48
49protected:
50 bool GCN3Encoding = false;
51 bool Has16BitInsts = false;
52 bool HasTrue16BitInsts = false;
58 bool HasCvtPkF16F32Inst = false;
62 bool HasMadMixInsts = false;
63 bool HasMadMacF32Insts = false;
64 bool HasDsSrc2Insts = false;
65 bool HasSDWA = false;
66 bool HasVOP3PInsts = false;
67 bool HasMulI24 = true;
68 bool HasMulU24 = true;
69 bool HasSMulHi = false;
70 bool HasInv2PiInlineImm = false;
71 bool HasFminFmaxLegacy = true;
72 bool EnablePromoteAlloca = false;
73 bool HasTrigReducedRange = false;
74 bool FastFMAF32 = false;
75 unsigned EUsPerCU = 4;
76 unsigned MaxWavesPerEU = 10;
77 unsigned LocalMemorySize = 0;
80
81public:
83
84 static const AMDGPUSubtarget &get(const MachineFunction &MF);
85 static const AMDGPUSubtarget &get(const TargetMachine &TM,
86 const Function &F);
87
88 /// \returns Default range of flat work group sizes for a calling convention.
89 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
90
91 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
92 /// for function \p F, or minimum/maximum flat work group sizes explicitly
93 /// requested using "amdgpu-flat-work-group-size" attribute attached to
94 /// function \p F.
95 ///
96 /// \returns Subtarget's default values if explicitly requested values cannot
97 /// be converted to integer, or violate subtarget's specifications.
98 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
99
100 /// \returns Subtarget's default pair of minimum/maximum number of waves per
101 /// execution unit for function \p F, or minimum/maximum number of waves per
102 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
103 /// attached to function \p F.
104 ///
105 /// \returns Subtarget's default values if explicitly requested values cannot
106 /// be converted to integer, violate subtarget's specifications, or are not
107 /// compatible with minimum/maximum number of waves limited by flat work group
108 /// size, register usage, and/or lds usage.
109 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const {
110 // Query the default/requested minimum/maximum flat work group sizes for F,
// then defer to the overload that takes those sizes as an explicit argument.
111 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
112 return getWavesPerEU(F, FlatWorkGroupSizes);
113 }
114
115 /// Overload which uses the specified values for the flat work group sizes,
116 /// rather than querying the function itself. \p FlatWorkGroupSizes should
117 /// correspond to the function's value for getFlatWorkGroupSizes.
118 std::pair<unsigned, unsigned>
119 getWavesPerEU(const Function &F,
120 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
121 std::pair<unsigned, unsigned> getEffectiveWavesPerEU(
122 std::pair<unsigned, unsigned> WavesPerEU,
123 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
124
125 /// Return the amount of LDS that can be used that will not restrict the
126 /// occupancy lower than WaveCount.
127 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
128 const Function &) const;
129
130 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
131 /// be achieved when the only function running on a CU is \p F and each
132 /// workgroup running the function requires \p LDSBytes bytes of LDS space.
133 /// This notably depends on the range of allowed flat group sizes for the
134 /// function and hardware characteristics.
135 std::pair<unsigned, unsigned>
136 getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const;
137
138 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
139 /// be achieved when the only function running on a CU is \p MF. This notably
140 /// depends on the range of allowed flat group sizes for the function, the
141 /// amount of per-workgroup LDS space required by the function, and hardware
142 /// characteristics.
143 std::pair<unsigned, unsigned>
145
/// \returns true if the OS component of the target triple is Triple::AMDHSA.
146 bool isAmdHsaOS() const {
147 return TargetTriple.getOS() == Triple::AMDHSA;
148 }
149
/// \returns true if the OS component of the target triple is Triple::AMDPAL.
150 bool isAmdPalOS() const {
151 return TargetTriple.getOS() == Triple::AMDPAL;
152 }
153
/// \returns true if the OS component of the target triple is Triple::Mesa3D.
154 bool isMesa3DOS() const {
155 return TargetTriple.getOS() == Triple::Mesa3D;
156 }
157
158 bool isMesaKernel(const Function &F) const;
159
/// \returns true if the target OS is AMDHSA (see isAmdHsaOS()) or if
/// isMesaKernel() reports true for \p F. isMesaKernel() is only evaluated
/// when the OS check fails, because of short-circuit evaluation.
160 bool isAmdHsaOrMesa(const Function &F) const {
161 return isAmdHsaOS() || isMesaKernel(F);
162 }
163
/// \returns true if the architecture component of the target triple is
/// Triple::amdgcn.
164 bool isGCN() const {
165 return TargetTriple.getArch() == Triple::amdgcn;
166 }
167
/// \returns the GCN3Encoding member (its in-class initializer is false).
168 bool isGCN3Encoding() const {
169 return GCN3Encoding;
170 }
171
/// \returns the Has16BitInsts member (its in-class initializer is false).
172 bool has16BitInsts() const {
173 return Has16BitInsts;
174 }
175
176 /// \returns true if the subtarget supports True16 instructions, i.e. the
/// HasTrue16BitInsts member is set (its in-class initializer is false).
177 bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
178
179 /// Return true if real (non-fake) variants of True16 instructions using
180 /// 16-bit registers should be code-generated. Fake True16 instructions are
181 /// identical to non-fake ones except that they take 32-bit registers as
182 /// operands and always use their low halves.
183 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
184 // supported and the support for fake True16 instructions is removed.
185 bool useRealTrue16Insts() const;
186
189 }
190
/// \returns the HasMadMixInsts member (its in-class initializer is false).
191 bool hasMadMixInsts() const {
192 return HasMadMixInsts;
193 }
194
196
198
200
202
204
/// \returns the HasCvtPkF16F32Inst member (its in-class initializer is false).
205 bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; }
206
209 }
210
/// \returns true if the HasMadMacF32Insts member is set, or unconditionally
/// when the target is not GCN (isGCN() is false), regardless of the member.
211 bool hasMadMacF32Insts() const {
212 return HasMadMacF32Insts || !isGCN();
213 }
214
/// \returns the HasDsSrc2Insts member (its in-class initializer is false).
215 bool hasDsSrc2Insts() const {
216 return HasDsSrc2Insts;
217 }
218
/// \returns the HasSDWA member (its in-class initializer is false).
219 bool hasSDWA() const {
220 return HasSDWA;
221 }
222
/// \returns the HasVOP3PInsts member (its in-class initializer is false).
223 bool hasVOP3PInsts() const {
224 return HasVOP3PInsts;
225 }
226
/// \returns the HasMulI24 member (its in-class initializer is true).
227 bool hasMulI24() const {
228 return HasMulI24;
229 }
230
/// \returns the HasMulU24 member (its in-class initializer is true).
231 bool hasMulU24() const {
232 return HasMulU24;
233 }
234
/// \returns the HasSMulHi member (its in-class initializer is false).
235 bool hasSMulHi() const {
236 return HasSMulHi;
237 }
238
/// \returns the HasInv2PiInlineImm member (its in-class initializer is false).
239 bool hasInv2PiInlineImm() const {
240 return HasInv2PiInlineImm;
241 }
242
/// \returns the HasFminFmaxLegacy member (its in-class initializer is true).
243 bool hasFminFmaxLegacy() const {
244 return HasFminFmaxLegacy;
245 }
246
/// \returns the HasTrigReducedRange member (its in-class initializer is
/// false).
247 bool hasTrigReducedRange() const {
248 return HasTrigReducedRange;
249 }
250
/// \returns the FastFMAF32 member (its in-class initializer is false).
251 bool hasFastFMAF32() const {
252 return FastFMAF32;
253 }
254
256 return EnablePromoteAlloca;
257 }
258
/// \returns the wavefront size, computed as 2 raised to the power
/// WavefrontSizeLog2.
// NOTE(review): the shift is evaluated on the int literal 1 before the
// conversion to unsigned; this assumes WavefrontSizeLog2 stays well below the
// width of int -- its declaration is not visible here, confirm.
259 unsigned getWavefrontSize() const {
260 return 1 << WavefrontSizeLog2;
261 }
262
/// \returns the WavefrontSizeLog2 member, i.e. the exponent that
/// getWavefrontSize() applies to 2.
263 unsigned getWavefrontSizeLog2() const {
264 return WavefrontSizeLog2;
265 }
266
267 /// Return the maximum number of bytes of LDS available for all workgroups
268 /// running on the same WGP or CU.
269 /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
270 /// limited to 64k.
/// \returns the LocalMemorySize member (its in-class initializer is 0).
271 unsigned getLocalMemorySize() const {
272 return LocalMemorySize;
273 }
274
275 /// Return the maximum number of bytes of LDS that can be allocated to a
276 /// single workgroup.
277 /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
278 /// 128k in total.
281 }
282
283 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
284 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
285 /// CU mode into account.
/// \returns the EUsPerCU member (its in-class initializer is 4).
286 unsigned getEUsPerCU() const { return EUsPerCU; }
287
289 return isAmdHsaOS() ? Align(8) : Align(4);
290 }
291
292 /// Returns the offset in bytes from the start of the input buffer
293 /// of the first explicit kernel argument.
294 unsigned getExplicitKernelArgOffset() const {
// The offset depends only on the OS component of the target triple.
295 switch (TargetTriple.getOS()) {
296 case Triple::AMDHSA:
297 case Triple::AMDPAL:
298 case Triple::Mesa3D:
299 return 0;
301 default:
302 // For legacy reasons unknown/other is treated as a different version of
303 // mesa.
304 return 36;
305 }
306
// Every path through the switch above returns, so control never gets here.
307 llvm_unreachable("invalid triple OS");
308 }
309
310 /// \returns Maximum number of work groups per compute unit supported by the
311 /// subtarget and limited by given \p FlatWorkGroupSize.
312 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
313
314 /// \returns Minimum flat work group size supported by the subtarget.
315 virtual unsigned getMinFlatWorkGroupSize() const = 0;
316
317 /// \returns Maximum flat work group size supported by the subtarget.
318 virtual unsigned getMaxFlatWorkGroupSize() const = 0;
319
320 /// \returns Number of waves per execution unit required to support the given
321 /// \p FlatWorkGroupSize.
322 virtual unsigned
323 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
324
325 /// \returns Minimum number of waves per execution unit supported by the
326 /// subtarget.
327 virtual unsigned getMinWavesPerEU() const = 0;
328
329 /// \returns Maximum number of waves per execution unit supported by the
330 /// subtarget without any kind of limitation.
/// Reports the MaxWavesPerEU member (its in-class initializer is 10).
331 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
332
333 /// Return the maximum workitem ID value in the function, for the given (0, 1,
334 /// 2) dimension.
335 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
336
337 /// Return the number of work groups for the function.
339
340 /// Return true if only a single workitem can be active in a wave.
341 bool isSingleLaneExecution(const Function &Kernel) const;
342
343 /// Creates value range metadata on a workitemid.* intrinsic call or load.
345
346 /// \returns Number of bytes of arguments that are passed to a shader or
347 /// kernel in addition to the explicit ones declared for the function.
348 unsigned getImplicitArgNumBytes(const Function &F) const;
349 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
350 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
351
352 /// \returns Corresponding DWARF register number mapping flavour for the
353 /// \p WavefrontSize.
355
356 virtual ~AMDGPUSubtarget() = default;
357};
358
359} // end namespace llvm
360
361#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
bool hasFP8ConversionScaleInsts() const
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
bool hasFminFmaxLegacy() const
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
bool hasFP4ConversionScaleInsts() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
bool isMesaKernel(const Function &F) const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
virtual unsigned getMinWavesPerEU() const =0
bool hasBF16ConversionInsts() const
bool hasFP6BF6ConversionScaleInsts() const
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on a workitemid.* intrinsic call or load.
bool hasBF8ConversionScaleInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getLocalMemorySize() const
Return the maximum number of bytes of LDS available for all workgroups running on the same WGP or CU.
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
bool isGCN3Encoding() const
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
bool HasF16BF16ToFP6BF6ConversionScaleInsts
bool has16BitInsts() const
virtual ~AMDGPUSubtarget()=default
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool isPromoteAllocaEnabled() const
bool hasTrigReducedRange() const
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
virtual unsigned getMaxFlatWorkGroupSize() const =0
bool hasDsSrc2Insts() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
bool hasF16BF16ToFP6BF6ConversionScaleInsts() const
unsigned getMaxWavesPerEU() const
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
unsigned AddressableLocalMemorySize
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > WavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes) const
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
unsigned getWavefrontSize() const
virtual unsigned getMinFlatWorkGroupSize() const =0
bool hasInv2PiInlineImm() const
bool hasF32ToF16BF16ConversionSRInsts() const
bool hasVOP3PInsts() const
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:404
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:395
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39