LLVM 20.0.0git
AMDGPUSubtarget.h
Go to the documentation of this file.
1//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Base class for AMDGPU specific classes of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
17#include "llvm/IR/CallingConv.h"
20
21namespace llvm {
22
24class Function;
25class Instruction;
26class MachineFunction;
27class TargetMachine;
28
30public:
33 R600 = 1,
34 R700 = 2,
40 GFX9 = 8,
41 GFX10 = 9,
42 GFX11 = 10,
43 GFX12 = 11,
44 };
45
46private:
47 Triple TargetTriple;
48
49protected:
// Subtarget feature flags, shown with their in-class default values. Most of
// them are returned unchanged by the like-named accessor methods of this class.
// NOTE(review): this listing skips some source lines, so further members may
// be declared between the ones shown here.
50 bool GCN3Encoding = false;
51 bool Has16BitInsts = false;
52 bool HasTrue16BitInsts = false;
58 bool HasCvtPkF16F32Inst = false;
62 bool HasMadMixInsts = false;
63 bool HasMadMacF32Insts = false;
64 bool HasDsSrc2Insts = false;
65 bool HasSDWA = false;
66 bool HasVOP3PInsts = false;
67 bool HasMulI24 = true;
68 bool HasMulU24 = true;
69 bool HasSMulHi = false;
70 bool HasInv2PiInlineImm = false;
71 bool HasFminFmaxLegacy = true;
72 bool EnablePromoteAlloca = false;
73 bool HasTrigReducedRange = false;
74 bool FastFMAF32 = false;
// Numeric limits returned by getEUsPerCU(), getMaxWavesPerEU() and
// getLocalMemorySize() respectively.
75 unsigned EUsPerCU = 4;
76 unsigned MaxWavesPerEU = 10;
77 unsigned LocalMemorySize = 0;
80
81public:
83
84 static const AMDGPUSubtarget &get(const MachineFunction &MF);
85 static const AMDGPUSubtarget &get(const TargetMachine &TM,
86 const Function &F);
87
88 /// \returns Default range of flat work group sizes for a calling convention.
89 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
90
91 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
92 /// for function \p F, or minimum/maximum flat work group sizes explicitly
93 /// requested using "amdgpu-flat-work-group-size" attribute attached to
94 /// function \p F.
95 ///
96 /// \returns Subtarget's default values if explicitly requested values cannot
97 /// be converted to integer, or violate subtarget's specifications.
98 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
99
100 /// \returns Subtarget's default pair of minimum/maximum number of waves per
101 /// execution unit for function \p F, or minimum/maximum number of waves per
102 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
103 /// attached to function \p F.
104 ///
105 /// \returns Subtarget's default values if explicitly requested values cannot
106 /// be converted to integer, violate subtarget's specifications, or are not
107 /// compatible with minimum/maximum number of waves limited by flat work group
108 /// size, register usage, and/or LDS usage.
109 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const {
110 // Default/requested minimum/maximum flat work group sizes.
111 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
// Delegate to the overload that takes the flat work group sizes explicitly,
// passing the sizes just queried for F.
112 return getWavesPerEU(F, FlatWorkGroupSizes);
113 }
114
115 /// Overload which uses the specified values for the flat work group sizes,
116 /// rather than querying the function itself. \p FlatWorkGroupSizes should
117 /// correspond to the function's value for getFlatWorkGroupSizes.
118 std::pair<unsigned, unsigned>
119 getWavesPerEU(const Function &F,
120 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
121 std::pair<unsigned, unsigned> getEffectiveWavesPerEU(
122 std::pair<unsigned, unsigned> WavesPerEU,
123 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
124
125 /// Return the amount of LDS that can be used that will not restrict the
126 /// occupancy lower than WaveCount.
127 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
128 const Function &) const;
129
130 /// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wavecount if
131 /// the given LDS memory size is the only constraint.
132 unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
133
134 unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
135
/// \returns true if the OS component of the target triple is AMDHSA.
136 bool isAmdHsaOS() const {
137 return TargetTriple.getOS() == Triple::AMDHSA;
138 }
139
/// \returns true if the OS component of the target triple is AMDPAL.
140 bool isAmdPalOS() const {
141 return TargetTriple.getOS() == Triple::AMDPAL;
142 }
143
/// \returns true if the OS component of the target triple is Mesa3D.
144 bool isMesa3DOS() const {
145 return TargetTriple.getOS() == Triple::Mesa3D;
146 }
147
148 bool isMesaKernel(const Function &F) const;
149
/// \returns true if the target OS is AMDHSA, or if isMesaKernel() reports
/// true for \p F. isMesaKernel() is only consulted when the OS is not AMDHSA.
150 bool isAmdHsaOrMesa(const Function &F) const {
151 return isAmdHsaOS() || isMesaKernel(F);
152 }
153
/// \returns true if the architecture component of the target triple is amdgcn.
154 bool isGCN() const {
155 return TargetTriple.getArch() == Triple::amdgcn;
156 }
157
/// \returns the value of the GCN3Encoding subtarget flag.
158 bool isGCN3Encoding() const {
159 return GCN3Encoding;
160 }
161
/// \returns the value of the Has16BitInsts subtarget flag.
162 bool has16BitInsts() const {
163 return Has16BitInsts;
164 }
165
166 /// Return true if the subtarget supports True16 instructions.
167 bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
168
169 /// Return true if real (non-fake) variants of True16 instructions using
170 /// 16-bit registers should be code-generated. Fake True16 instructions are
171 /// identical to non-fake ones except that they take 32-bit registers as
172 /// operands and always use their low halves.
173 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
174 // supported and the support for fake True16 instructions is removed.
175 bool useRealTrue16Insts() const;
176
179 }
180
/// \returns the value of the HasMadMixInsts subtarget flag.
181 bool hasMadMixInsts() const {
182 return HasMadMixInsts;
183 }
184
186
188
190
192
194
/// \returns the value of the HasCvtPkF16F32Inst subtarget flag.
195 bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; }
196
199 }
200
/// \returns true if the HasMadMacF32Insts flag is set. Targets for which
/// isGCN() is false always report true, regardless of the flag.
201 bool hasMadMacF32Insts() const {
202 return HasMadMacF32Insts || !isGCN();
203 }
204
/// \returns the value of the HasDsSrc2Insts subtarget flag.
205 bool hasDsSrc2Insts() const {
206 return HasDsSrc2Insts;
207 }
208
/// \returns the value of the HasSDWA subtarget flag.
209 bool hasSDWA() const {
210 return HasSDWA;
211 }
212
/// \returns the value of the HasVOP3PInsts subtarget flag.
213 bool hasVOP3PInsts() const {
214 return HasVOP3PInsts;
215 }
216
/// \returns the value of the HasMulI24 subtarget flag. Unlike most feature
/// flags in this class, its in-class initializer is true.
217 bool hasMulI24() const {
218 return HasMulI24;
219 }
220
/// \returns the value of the HasMulU24 subtarget flag. Unlike most feature
/// flags in this class, its in-class initializer is true.
221 bool hasMulU24() const {
222 return HasMulU24;
223 }
224
/// \returns the value of the HasSMulHi subtarget flag.
225 bool hasSMulHi() const {
226 return HasSMulHi;
227 }
228
/// \returns the value of the HasInv2PiInlineImm subtarget flag.
229 bool hasInv2PiInlineImm() const {
230 return HasInv2PiInlineImm;
231 }
232
/// \returns the value of the HasFminFmaxLegacy subtarget flag. Unlike most
/// feature flags in this class, its in-class initializer is true.
233 bool hasFminFmaxLegacy() const {
234 return HasFminFmaxLegacy;
235 }
236
/// \returns the value of the HasTrigReducedRange subtarget flag.
237 bool hasTrigReducedRange() const {
238 return HasTrigReducedRange;
239 }
240
/// \returns the value of the FastFMAF32 subtarget flag.
241 bool hasFastFMAF32() const {
242 return FastFMAF32;
243 }
244
246 return EnablePromoteAlloca;
247 }
248
/// \returns the wavefront size, computed as 2 raised to the power of
/// WavefrontSizeLog2.
249 unsigned getWavefrontSize() const {
250 return 1 << WavefrontSizeLog2;
251 }
252
/// \returns the stored WavefrontSizeLog2 value, i.e. the exponent that
/// getWavefrontSize() applies to 2.
253 unsigned getWavefrontSizeLog2() const {
254 return WavefrontSizeLog2;
255 }
256
257 /// Return the maximum number of bytes of LDS available for all workgroups
258 /// running on the same WGP or CU.
259 /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
260 /// limited to 64k.
261 unsigned getLocalMemorySize() const {
262 return LocalMemorySize;
263 }
264
265 /// Return the maximum number of bytes of LDS that can be allocated to a
266 /// single workgroup.
267 /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
268 /// 128k in total.
271 }
272
273 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
274 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
275 /// CU mode into account.
276 unsigned getEUsPerCU() const { return EUsPerCU; }
277
279 return isAmdHsaOS() ? Align(8) : Align(4);
280 }
281
282 /// Returns the offset in bytes from the start of the input buffer
283 /// of the first explicit kernel argument.
284 unsigned getExplicitKernelArgOffset() const {
// The offset depends only on the OS component of the target triple.
285 switch (TargetTriple.getOS()) {
// For these OSes the first explicit argument sits at the very start of the
// input buffer.
286 case Triple::AMDHSA:
287 case Triple::AMDPAL:
288 case Triple::Mesa3D:
289 return 0;
291 default:
292 // For legacy reasons unknown/other is treated as a different version of
293 // mesa.
294 return 36;
295 }
296
// NOTE(review): every switch path visible here returns, so this looks like a
// purely defensive marker. Listing line 290 is absent from this view; confirm
// against the full source.
297 llvm_unreachable("invalid triple OS");
298 }
299
300 /// \returns Maximum number of work groups per compute unit supported by the
301 /// subtarget and limited by given \p FlatWorkGroupSize.
302 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
303
304 /// \returns Minimum flat work group size supported by the subtarget.
305 virtual unsigned getMinFlatWorkGroupSize() const = 0;
306
307 /// \returns Maximum flat work group size supported by the subtarget.
308 virtual unsigned getMaxFlatWorkGroupSize() const = 0;
309
310 /// \returns Number of waves per execution unit required to support the given
311 /// \p FlatWorkGroupSize.
312 virtual unsigned
313 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
314
315 /// \returns Minimum number of waves per execution unit supported by the
316 /// subtarget.
317 virtual unsigned getMinWavesPerEU() const = 0;
318
319 /// \returns Maximum number of waves per execution unit supported by the
320 /// subtarget without any kind of limitation.
321 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
322
323 /// Return the maximum workitem ID value in the function, for the given (0, 1,
324 /// 2) dimension.
325 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
326
327 /// Return the number of work groups for the function.
329
330 /// Return true if only a single workitem can be active in a wave.
331 bool isSingleLaneExecution(const Function &Kernel) const;
332
333 /// Creates value range metadata on a workitemid.* intrinsic call or load.
335
336 /// \returns Number of bytes of arguments that are passed to a shader or
337 /// kernel in addition to the explicit ones declared for the function.
338 unsigned getImplicitArgNumBytes(const Function &F) const;
339 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
340 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
341
342 /// \returns Corresponding DWARF register number mapping flavour for the
343 /// \p WavefrontSize.
345
346 virtual ~AMDGPUSubtarget() = default;
347};
348
349} // end namespace llvm
350
351#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
bool hasFP8ConversionScaleInsts() const
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const
Inverse of getMaxLocalMemSizeWithWaveCount.
bool hasFminFmaxLegacy() const
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
bool hasFP4ConversionScaleInsts() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
bool isMesaKernel(const Function &F) const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
virtual unsigned getMinWavesPerEU() const =0
bool hasBF16ConversionInsts() const
bool hasFP6BF6ConversionScaleInsts() const
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on a workitemid.* intrinsic call or load.
bool hasBF8ConversionScaleInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getLocalMemorySize() const
Return the maximum number of bytes of LDS available for all workgroups running on the same WGP or CU.
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
bool isGCN3Encoding() const
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
bool HasF16BF16ToFP6BF6ConversionScaleInsts
bool has16BitInsts() const
virtual ~AMDGPUSubtarget()=default
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool isPromoteAllocaEnabled() const
bool hasTrigReducedRange() const
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
virtual unsigned getMaxFlatWorkGroupSize() const =0
bool hasDsSrc2Insts() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
bool hasF16BF16ToFP6BF6ConversionScaleInsts() const
unsigned getMaxWavesPerEU() const
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
unsigned AddressableLocalMemorySize
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > WavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes) const
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
unsigned getWavefrontSize() const
virtual unsigned getMinFlatWorkGroupSize() const =0
bool hasInv2PiInlineImm() const
bool hasF32ToF16BF16ConversionSRInsts() const
bool hasVOP3PInsts() const
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:392
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:383
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39