LLVM 20.0.0git
AMDGPUSubtarget.h
Go to the documentation of this file.
1//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Base class for AMDGPU specific classes of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
17#include "llvm/IR/CallingConv.h"
20
21namespace llvm {
22
24class Function;
25class Instruction;
26class MachineFunction;
27class TargetMachine;
28
30public:
33 R600 = 1,
34 R700 = 2,
40 GFX9 = 8,
41 GFX10 = 9,
42 GFX11 = 10,
43 GFX12 = 11,
44 };
45
46private:
47 Triple TargetTriple;
48
49protected:
 // Feature flags and hardware limits, shown with their in-class default
 // values. Each field listed here is read by a const member function further
 // down in this listing (for example GCN3Encoding by isGCN3Encoding() and
 // MaxWavesPerEU by getMaxWavesPerEU()).
50 bool GCN3Encoding = false;
51 bool Has16BitInsts = false;
52 bool HasTrue16BitInsts = false;
54 bool HasMadMixInsts = false;
55 bool HasMadMacF32Insts = false;
56 bool HasDsSrc2Insts = false;
57 bool HasSDWA = false;
58 bool HasVOP3PInsts = false;
59 bool HasMulI24 = true;
60 bool HasMulU24 = true;
61 bool HasSMulHi = false;
62 bool HasInv2PiInlineImm = false;
63 bool HasFminFmaxLegacy = true;
64 bool EnablePromoteAlloca = false;
65 bool HasTrigReducedRange = false;
66 bool FastFMAF32 = false;
67 unsigned EUsPerCU = 4;
68 unsigned MaxWavesPerEU = 10;
69 unsigned LocalMemorySize = 0;
72
73public:
75
76 static const AMDGPUSubtarget &get(const MachineFunction &MF);
77 static const AMDGPUSubtarget &get(const TargetMachine &TM,
78 const Function &F);
79
80 /// \returns Default range flat work group size for a calling convention.
81 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
82
83 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
84 /// for function \p F, or minimum/maximum flat work group sizes explicitly
85 /// requested using "amdgpu-flat-work-group-size" attribute attached to
86 /// function \p F.
87 ///
88 /// \returns Subtarget's default values if explicitly requested values cannot
89 /// be converted to integer, or violate subtarget's specifications.
90 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
91
92 /// \returns Subtarget's default pair of minimum/maximum number of waves per
93 /// execution unit for function \p F, or minimum/maximum number of waves per
94 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
95 /// attached to function \p F.
96 ///
97 /// \returns Subtarget's default values if explicitly requested values cannot
98 /// be converted to integer, violate subtarget's specifications, or are not
99 /// compatible with minimum/maximum number of waves limited by flat work group
100 /// size, register usage, and/or lds usage.
101 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const {
102 // Default/requested minimum/maximum flat work group sizes.
103 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
104 return getWavesPerEU(F, FlatWorkGroupSizes);
105 }
106
107 /// Overload which uses the specified values for the flat work group sizes,
108 /// rather than querying the function itself. \p FlatWorkGroupSizes should
109 /// correspond to the function's value for getFlatWorkGroupSizes.
110 std::pair<unsigned, unsigned>
111 getWavesPerEU(const Function &F,
112 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
113 std::pair<unsigned, unsigned> getEffectiveWavesPerEU(
114 std::pair<unsigned, unsigned> WavesPerEU,
115 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
116
117 /// Return the amount of LDS that can be used that will not restrict the
118 /// occupancy lower than WaveCount.
119 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
120 const Function &) const;
121
122 /// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wavecount if
123 /// the given LDS memory size is the only constraint.
124 unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
125
126 unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
127
128 bool isAmdHsaOS() const {
129 return TargetTriple.getOS() == Triple::AMDHSA;
130 }
131
132 bool isAmdPalOS() const {
133 return TargetTriple.getOS() == Triple::AMDPAL;
134 }
135
136 bool isMesa3DOS() const {
137 return TargetTriple.getOS() == Triple::Mesa3D;
138 }
139
140 bool isMesaKernel(const Function &F) const;
141
142 bool isAmdHsaOrMesa(const Function &F) const {
143 return isAmdHsaOS() || isMesaKernel(F);
144 }
145
146 bool isGCN() const {
147 return TargetTriple.getArch() == Triple::amdgcn;
148 }
149
150 bool isGCN3Encoding() const {
151 return GCN3Encoding;
152 }
153
154 bool has16BitInsts() const {
155 return Has16BitInsts;
156 }
157
158 /// Return true if the subtarget supports True16 instructions.
159 bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
160
161 /// Return true if real (non-fake) variants of True16 instructions using
162 /// 16-bit registers should be code-generated. Fake True16 instructions are
163 /// identical to non-fake ones except that they take 32-bit registers as
164 /// operands and always use their low halves.
165 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
166 // supported and the support for fake True16 instructions is removed.
167 bool useRealTrue16Insts() const;
168
169 bool hasMadMixInsts() const {
170 return HasMadMixInsts;
171 }
172
173 bool hasMadMacF32Insts() const {
174 return HasMadMacF32Insts || !isGCN();
175 }
176
177 bool hasDsSrc2Insts() const {
178 return HasDsSrc2Insts;
179 }
180
181 bool hasSDWA() const {
182 return HasSDWA;
183 }
184
185 bool hasVOP3PInsts() const {
186 return HasVOP3PInsts;
187 }
188
189 bool hasMulI24() const {
190 return HasMulI24;
191 }
192
193 bool hasMulU24() const {
194 return HasMulU24;
195 }
196
197 bool hasSMulHi() const {
198 return HasSMulHi;
199 }
200
201 bool hasInv2PiInlineImm() const {
202 return HasInv2PiInlineImm;
203 }
204
205 bool hasFminFmaxLegacy() const {
206 return HasFminFmaxLegacy;
207 }
208
209 bool hasTrigReducedRange() const {
210 return HasTrigReducedRange;
211 }
212
213 bool hasFastFMAF32() const {
214 return FastFMAF32;
215 }
216
218 return EnablePromoteAlloca;
219 }
220
221 unsigned getWavefrontSize() const {
222 return 1 << WavefrontSizeLog2;
223 }
224
225 unsigned getWavefrontSizeLog2() const {
226 return WavefrontSizeLog2;
227 }
228
229 unsigned getLocalMemorySize() const {
230 return LocalMemorySize;
231 }
232
235 }
236
237 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
238 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
239 /// CU mode into account.
240 unsigned getEUsPerCU() const { return EUsPerCU; }
241
243 return isAmdHsaOS() ? Align(8) : Align(4);
244 }
245
246 /// Returns the offset in bytes from the start of the input buffer
247 /// of the first explicit kernel argument.
248 unsigned getExplicitKernelArgOffset() const {
 // The offset depends only on the OS component of the target triple.
249 switch (TargetTriple.getOS()) {
250 case Triple::AMDHSA:
251 case Triple::AMDPAL:
252 case Triple::Mesa3D:
 // On these three OS types the explicit arguments start at offset 0.
253 return 0;
255 default:
256 // For legacy reasons unknown/other is treated as a different version of
257 // mesa.
258 return 36;
259 }
260
 // Every path through the switch above returns, so this is never reached.
261 llvm_unreachable("invalid triple OS");
262 }
263
264 /// \returns Maximum number of work groups per compute unit supported by the
265 /// subtarget and limited by given \p FlatWorkGroupSize.
266 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
267
268 /// \returns Minimum flat work group size supported by the subtarget.
269 virtual unsigned getMinFlatWorkGroupSize() const = 0;
270
271 /// \returns Maximum flat work group size supported by the subtarget.
272 virtual unsigned getMaxFlatWorkGroupSize() const = 0;
273
274 /// \returns Number of waves per execution unit required to support the given
275 /// \p FlatWorkGroupSize.
276 virtual unsigned
277 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
278
279 /// \returns Minimum number of waves per execution unit supported by the
280 /// subtarget.
281 virtual unsigned getMinWavesPerEU() const = 0;
282
283 /// \returns Maximum number of waves per execution unit supported by the
284 /// subtarget without any kind of limitation.
285 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
286
287 /// Return the maximum workitem ID value in the function, for the given (0, 1,
288 /// 2) dimension.
289 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
290
291 /// Return the number of work groups for the function.
293
294 /// Return true if only a single workitem can be active in a wave.
295 bool isSingleLaneExecution(const Function &Kernel) const;
296
297 /// Creates value range metadata on a workitemid.* intrinsic call or load.
299
300 /// \returns Number of bytes of arguments that are passed to a shader or
301 /// kernel in addition to the explicit ones declared for the function.
302 unsigned getImplicitArgNumBytes(const Function &F) const;
303 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
304 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
305
306 /// \returns Corresponding DWARF register number mapping flavour for the
307 /// \p WavefrontSize.
309
310 virtual ~AMDGPUSubtarget() = default;
311};
312
313} // end namespace llvm
314
315#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
const char LLVMTargetMachineRef TM
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const
Inverse of getMaxLocalMemSizeWithWaveCount.
bool hasFminFmaxLegacy() const
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
bool isMesaKernel(const Function &F) const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
virtual unsigned getMinWavesPerEU() const =0
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on a workitemid.* intrinsic call or load.
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getLocalMemorySize() const
unsigned getAddressableLocalMemorySize() const
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
bool isGCN3Encoding() const
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
bool has16BitInsts() const
virtual ~AMDGPUSubtarget()=default
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool isPromoteAllocaEnabled() const
bool hasTrigReducedRange() const
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
virtual unsigned getMaxFlatWorkGroupSize() const =0
bool hasDsSrc2Insts() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getMaxWavesPerEU() const
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
unsigned AddressableLocalMemorySize
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > WavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes) const
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
unsigned getWavefrontSize() const
virtual unsigned getMinFlatWorkGroupSize() const =0
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:382
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:373
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39