LLVM 22.0.0git
NVPTXSubtarget.h
Go to the documentation of this file.
1//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file declares the NVPTX specific subclass of TargetSubtarget.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
14#define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
15
16#include "NVPTX.h"
17#include "NVPTXFrameLowering.h"
18#include "NVPTXISelLowering.h"
19#include "NVPTXInstrInfo.h"
20#include "NVPTXRegisterInfo.h"
22#include "llvm/IR/DataLayout.h"
25#include <string>
26
27#define GET_SUBTARGETINFO_HEADER
28#include "NVPTXGenSubtargetInfo.inc"
29
30namespace llvm {
31
33 virtual void anchor();
34 std::string TargetName;
35
36 // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
37 unsigned PTXVersion;
38
39 // FullSmVersion encoding: SM * 10 + ArchSuffixOffset
40 // ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a')
41 // e.g. sm_30 -> 300, sm_90a -> 903, sm_100f -> 1002
42 unsigned int FullSmVersion;
43
44 // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
45 // FullSmVersion.
46 unsigned int SmVersion;
47
48 NVPTXInstrInfo InstrInfo;
50 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
51
52 // NVPTX does not have any call stack frame, but need a NVPTX specific
53 // FrameLowering class because TargetFrameLowering is abstract.
54 NVPTXFrameLowering FrameLowering;
55
56public:
57 /// This constructor initializes the data members to match that
58 /// of the specified module.
59 ///
60 NVPTXSubtarget(const Triple &TT, const std::string &CPU,
61 const std::string &FS, const NVPTXTargetMachine &TM);
62
63 ~NVPTXSubtarget() override;
64
65 const TargetFrameLowering *getFrameLowering() const override {
66 return &FrameLowering;
67 }
68 const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
69 const NVPTXRegisterInfo *getRegisterInfo() const override {
70 return &InstrInfo.getRegisterInfo();
71 }
72 const NVPTXTargetLowering *getTargetLowering() const override {
73 return &TLInfo;
74 }
75
76 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
77
78 // Checks PTX version and family-specific and architecture-specific SM
79 // versions. For example, sm_100{f/a} and any future variants in the same
80 // family will match for any PTX version greater than or equal to
81 // `PTXVersion`.
82 bool hasPTXWithFamilySMs(unsigned PTXVersion,
83 ArrayRef<unsigned> SMVersions) const;
84 // Checks PTX version and architecture-specific SM versions.
85 // For example, sm_100{a} will match for any PTX version greater than or equal
86 // to `PTXVersion`.
87 bool hasPTXWithAccelSMs(unsigned PTXVersion,
88 ArrayRef<unsigned> SMVersions) const;
89
90 bool has256BitVectorLoadStore(unsigned AS) const {
91 return SmVersion >= 100 && PTXVersion >= 88 &&
93 }
95 return SmVersion >= 50 && PTXVersion >= 83;
96 }
97 bool hasAtomAddF64() const { return SmVersion >= 60; }
98 bool hasAtomScope() const { return SmVersion >= 60; }
99 bool hasAtomBitwise64() const { return SmVersion >= 32; }
100 bool hasAtomMinMax64() const { return SmVersion >= 32; }
101 bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
102 bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; }
103 bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
104 bool hasLDG() const { return SmVersion >= 32; }
105 bool hasHWROT32() const { return SmVersion >= 32; }
106 bool hasBrx() const { return SmVersion >= 30 && PTXVersion >= 60; }
107 bool hasFP16Math() const { return SmVersion >= 53; }
108 bool hasBF16Math() const { return SmVersion >= 80; }
109 bool allowFP16Math() const;
110 bool hasMaskOperator() const { return PTXVersion >= 71; }
111 bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
112 // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
113 // release, acq_rel, sc) ?
114 bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
115 // Does SM & PTX support .acquire and .release qualifiers for fence?
117 return SmVersion >= 90 && PTXVersion >= 86;
118 }
119 // Does SM & PTX support atomic relaxed MMIO operations ?
120 bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
121 bool hasDotInstructions() const {
122 return SmVersion >= 61 && PTXVersion >= 50;
123 }
124
125 // Checks following instructions support:
126 // - tcgen05.ld/st
127 // - tcgen05.alloc/dealloc/relinquish
128 // - tcgen05.cp
129 // - tcgen05.fence/wait
130 // - tcgen05.commit
131 // - tcgen05.mma
133 // sm_101 renamed to sm_110 in PTX 9.0
134 return hasPTXWithFamilySMs(90, {100, 110}) ||
135 hasPTXWithFamilySMs(88, {100, 101}) ||
136 hasPTXWithAccelSMs(86, {100, 101});
137 }
138
139 // Checks tcgen05.shift instruction support.
141 // sm_101 renamed to sm_110 in PTX 9.0
142 return hasPTXWithAccelSMs(90, {100, 110, 103}) ||
143 hasPTXWithAccelSMs(88, {100, 101, 103}) ||
144 hasPTXWithAccelSMs(86, {100, 101});
145 }
146
148 return hasPTXWithFamilySMs(88, {100}) || hasPTXWithAccelSMs(86, {100});
149 }
150
151 bool hasTcgen05MMAI8Kind() const {
152 return hasPTXWithAccelSMs(90, {100, 110}) ||
153 hasPTXWithAccelSMs(86, {100, 101});
154 }
155
157 return hasPTXWithAccelSMs(90, {100, 110, 103}) ||
158 hasPTXWithAccelSMs(87, {100, 101, 103});
159 }
160
162 return hasPTXWithAccelSMs(90, {100, 110, 103}) ||
163 hasPTXWithAccelSMs(86, {100, 101, 103});
164 }
165
166 bool hasReduxSyncF32() const {
167 return hasPTXWithFamilySMs(88, {100}) || hasPTXWithAccelSMs(86, {100});
168 }
169
170 bool hasMMABlockScale() const {
171 return hasPTXWithFamilySMs(88, {120}) || hasPTXWithAccelSMs(87, {120});
172 }
173
175 return hasPTXWithAccelSMs(87, {120, 121});
176 }
177
178 // f32x2 instructions in Blackwell family
179 bool hasF32x2Instructions() const;
180
181 // Checks support for following in TMA:
182 // - cta_group::1/2 support
183 // - im2col_w/w_128 mode support
184 // - tile_gather4 mode support
185 // - tile_scatter4 mode support
187 return hasPTXWithFamilySMs(90, {100, 110}) ||
188 hasPTXWithFamilySMs(88, {100, 101}) ||
189 hasPTXWithAccelSMs(86, {100, 101});
190 }
191
192 // Checks support for conversions involving e4m3x2 and e5m2x2.
194 if (PTXVersion >= 81)
195 return SmVersion >= 89;
196
197 if (PTXVersion >= 78)
198 return SmVersion >= 90;
199
200 return false;
201 }
202
203 // Checks support for conversions involving the following types:
204 // - e2m3x2/e3m2x2
205 // - e2m1x2
206 // - ue8m0x2
208 return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
209 hasPTXWithFamilySMs(88, {100, 101, 120}) ||
210 hasPTXWithAccelSMs(86, {100, 101, 120});
211 }
212
214 return hasPTXWithFamilySMs(90, {90, 100, 110, 120}) ||
215 hasPTXWithFamilySMs(88, {90, 100, 101, 120}) ||
216 hasPTXWithAccelSMs(83, {90, 100, 101, 120});
217 }
218
219 bool hasTensormapReplaceElemtypeSupport(unsigned value) const {
220 if (value >= static_cast<unsigned>(nvvm::TensormapElemType::B4x16))
221 return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
222 hasPTXWithFamilySMs(88, {100, 101, 120}) ||
223 hasPTXWithAccelSMs(87, {100, 101, 120});
224
226 }
227
229 return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
230 hasPTXWithFamilySMs(88, {100, 101, 120}) ||
231 hasPTXWithAccelSMs(87, {100, 101, 120});
232 }
233
234 bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const {
235 if (value == static_cast<unsigned>(nvvm::TensormapSwizzleMode::SWIZZLE_96B))
236 return hasPTXWithAccelSMs(88, {103});
237
239 }
240
242 return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
243 hasPTXWithFamilySMs(88, {100, 101, 120}) ||
244 hasPTXWithAccelSMs(86, {100, 101, 120});
245 }
246
247 bool hasSetMaxNRegSupport() const {
248 return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
249 hasPTXWithFamilySMs(88, {100, 101, 120}) ||
250 hasPTXWithAccelSMs(86, {100, 101, 120}) ||
251 hasPTXWithAccelSMs(80, {90});
252 }
253
255 return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
256 hasPTXWithFamilySMs(88, {100, 101, 120}) ||
257 hasPTXWithAccelSMs(86, {100, 101, 120});
258 }
259
260 // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
261 // terminates a basic block. Instead, it would assume that control flow
262 // continued to the next instruction. The next instruction could be in the
263 // block that's lexically below it. This would lead to a phantom CFG edges
264 // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when
265 // PTX ISA versions 8.3+ we can confidently say that the bug will not be
266 // present.
267 bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
268 bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
270 return hasPTXWithAccelSMs(87, {100, 103});
271 }
272 unsigned int getFullSmVersion() const { return FullSmVersion; }
273 unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
274 unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; }
275 // GPUs with "a" suffix have architecture-accelerated features that are
276 // supported on the specified architecture only, hence such targets do not
277 // follow the onion layer model. hasArchAccelFeatures() allows distinguishing
278 // such GPU variants from the base GPU architecture.
279 // - false represents non-accelerated architecture.
280 // - true represents architecture-accelerated variant.
281 bool hasArchAccelFeatures() const {
282 return (getFullSmVersion() & 1) && PTXVersion >= 80;
283 }
284 // GPUs with 'f' suffix have architecture-accelerated features which are
285 // portable across all future architectures under same SM major. For example,
286 // sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures.
287 // - false represents non-family-specific architecture.
288 // - true represents family-specific variant.
290 return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88
292 }
293 // If the user did not provide a target we default to the `sm_30` target.
294 std::string getTargetName() const {
295 return TargetName.empty() ? "sm_30" : TargetName;
296 }
297 bool hasTargetName() const { return !TargetName.empty(); }
298
299 bool hasNativeBF16Support(int Opcode) const;
300
301 // Get maximum value of required alignments among the supported data types.
302 // From the PTX ISA doc, section 8.2.3:
303 // The memory consistency model relates operations executed on memory
304 // locations with scalar data-types, which have a maximum size and alignment
305 // of 64 bits. Memory operations with a vector data-type are modelled as a
306 // set of equivalent memory operations with a scalar data-type, executed in
307 // an unspecified order on the elements in the vector.
308 unsigned getMaxRequiredAlignment() const { return 8; }
309 // Get the smallest cmpxchg word size that the hardware supports.
310 unsigned getMinCmpXchgSizeInBits() const { return 32; }
311
312 unsigned getPTXVersion() const { return PTXVersion; }
313
316
317 void failIfClustersUnsupported(std::string const &FailureMessage) const;
318};
319
320} // End llvm namespace
321
322#endif
NVPTX address space definition.
This file contains the definitions of the enumerations and flags associated with NVVM Intrinsics,...
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
bool hasMMASparseBlockScaleF4() const
bool hasTcgen05MMASparseMxf4() const
const NVPTXInstrInfo * getInstrInfo() const override
void failIfClustersUnsupported(std::string const &FailureMessage) const
bool hasTcgen05MMAI8Kind() const
bool hasTMABlackwellSupport() const
bool hasPTXWithAccelSMs(unsigned PTXVersion, ArrayRef< unsigned > SMVersions) const
bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const
std::string getTargetName() const
bool hasTcgen05MMASparseMxf4nvf4() const
unsigned getMaxRequiredAlignment() const
bool hasClusterLaunchControlTryCancelMulticastSupport() const
bool hasAtomMinMax64() const
bool hasTcgen05InstSupport() const
bool hasAtomAddF64() const
bool hasSplitAcquireAndReleaseFences() const
bool hasConvertWithStochasticRounding() const
bool hasMaskOperator() const
bool hasFP8ConversionSupport() const
const NVPTXTargetLowering * getTargetLowering() const override
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS)
unsigned getMinCmpXchgSizeInBits() const
unsigned getPTXVersion() const
bool hasCvtaParam() const
~NVPTXSubtarget() override
bool hasNativeBF16Support(int Opcode) const
bool hasUsedBytesMaskPragma() const
bool hasTensormapReplaceElemtypeSupport(unsigned value) const
const NVPTXRegisterInfo * getRegisterInfo() const override
unsigned int getFullSmVersion() const
bool hasTensormapReplaceSupport() const
unsigned int getSmVersion() const
bool hasDotInstructions() const
bool hasTcgen05MMAScaleInputDImm() const
bool hasFamilySpecificFeatures() const
bool hasAtomBitwise64() const
bool hasPTXWithFamilySMs(unsigned PTXVersion, ArrayRef< unsigned > SMVersions) const
bool hasTcgen05ShiftSupport() const
bool hasRelaxedMMIO() const
bool hasTargetName() const
bool hasSetMaxNRegSupport() const
bool hasAtomSwap128() const
bool hasF32x2Instructions() const
bool hasReduxSyncF32() const
unsigned int getSmFamilyVersion() const
const TargetFrameLowering * getFrameLowering() const override
bool hasAtomScope() const
bool hasMMABlockScale() const
bool hasLdStmatrixBlackwellSupport() const
bool hasAtomCas16() const
NVPTXSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const NVPTXTargetMachine &TM)
This constructor initializes the data members to match that of the specified module.
bool hasNarrowFPConversionSupport() const
bool hasMemoryOrdering() const
bool hasArchAccelFeatures() const
NVPTXSubtarget & initializeSubtargetDependencies(StringRef CPU, StringRef FS)
const SelectionDAGTargetInfo * getSelectionDAGInfo() const override
bool has256BitVectorLoadStore(unsigned AS) const
bool hasTensormapReplaceSwizzleAtomicitySupport() const
bool hasPTXASUnreachableBug() const
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Information about stack frame layout on the target.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26