LLVM  4.0.0
AMDGPUSubtarget.cpp
Go to the documentation of this file.
1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "llvm/ADT/SmallString.h"
19 #include <algorithm>
20 
21 using namespace llvm;
22 
23 #define DEBUG_TYPE "amdgpu-subtarget"
24 
25 #define GET_SUBTARGETINFO_ENUM
26 #define GET_SUBTARGETINFO_TARGET_DESC
27 #define GET_SUBTARGETINFO_CTOR
28 #include "AMDGPUGenSubtargetInfo.inc"
29 
31 
34  StringRef GPU, StringRef FS) {
35  // Determine default and user-specified characteristics
36  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
37  // enabled, but some instructions do not respect them and they run at the
38  // double precision rate, so don't enable by default.
39  //
40  // We want to be able to turn these off, but making this a subtarget feature
41  // for SI has the unhelpful behavior that it unsets everything else if you
42  // disable it.
43 
44  SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
45  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
46  FullFS += "+flat-for-global,+unaligned-buffer-access,";
47  FullFS += FS;
48 
49  ParseSubtargetFeatures(GPU, FullFS);
50 
51  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
52  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
53  // variants of MUBUF instructions.
54  if (!hasAddr64() && !FS.contains("flat-for-global")) {
55  FlatForGlobal = true;
56  }
57 
58  // FIXME: I don't think Evergreen has any useful support for
59  // denormals, but should be checked. Should we issue a warning somewhere
60  // if someone tries to enable these?
62  FP16Denormals = false;
63  FP32Denormals = false;
64  FP64Denormals = false;
65  }
66 
67  // Set defaults if needed.
68  if (MaxPrivateElementSize == 0)
70 
71  return *this;
72 }
73 
75  const TargetMachine &TM)
76  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
77  TargetTriple(TT),
78  Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
79  IsaVersion(ISAVersion0_0_0),
80  WavefrontSize(64),
81  LocalMemorySize(0),
82  LDSBankCount(0),
83  MaxPrivateElementSize(0),
84 
85  FastFMAF32(false),
86  HalfRate64Ops(false),
87 
88  FP16Denormals(false),
89  FP32Denormals(false),
90  FP64Denormals(false),
91  FPExceptions(false),
92  FlatForGlobal(false),
93  UnalignedScratchAccess(false),
94  UnalignedBufferAccess(false),
95 
96  EnableXNACK(false),
97  DebuggerInsertNops(false),
98  DebuggerReserveRegs(false),
99  DebuggerEmitPrologue(false),
100 
101  EnableVGPRSpilling(false),
102  EnablePromoteAlloca(false),
104  EnableUnsafeDSOffsetFolding(false),
105  EnableSIScheduler(false),
106  DumpCode(false),
107 
108  FP64(false),
109  IsGCN(false),
110  GCN1Encoding(false),
111  GCN3Encoding(false),
112  CIInsts(false),
113  SGPRInitBug(false),
114  HasSMemRealTime(false),
115  Has16BitInsts(false),
116  HasMovrel(false),
117  HasVGPRIndexMode(false),
118  HasScalarStores(false),
119  HasInv2PiInlineImm(false),
120  FlatAddressSpace(false),
121 
122  R600ALUInst(false),
123  CaymanISA(false),
124  CFALUBug(false),
125  HasVertexCache(false),
126  TexVTXClauseSize(0),
128 
129  FeatureDisable(false),
130  InstrItins(getInstrItineraryForCPU(GPU)) {
131  initializeSubtargetDependencies(TT, GPU, FS);
132 }
133 
134 // FIXME: These limits are for SI. Did they change with the larger maximum LDS
135 // size?
136 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
137  switch (NWaves) {
138  case 10:
139  return 1638;
140  case 9:
141  return 1820;
142  case 8:
143  return 2048;
144  case 7:
145  return 2340;
146  case 6:
147  return 2730;
148  case 5:
149  return 3276;
150  case 4:
151  return 4096;
152  case 3:
153  return 5461;
154  case 2:
155  return 8192;
156  default:
157  return getLocalMemorySize();
158  }
159 }
160 
162  if (Bytes <= 1638)
163  return 10;
164 
165  if (Bytes <= 1820)
166  return 9;
167 
168  if (Bytes <= 2048)
169  return 8;
170 
171  if (Bytes <= 2340)
172  return 7;
173 
174  if (Bytes <= 2730)
175  return 6;
176 
177  if (Bytes <= 3276)
178  return 5;
179 
180  if (Bytes <= 4096)
181  return 4;
182 
183  if (Bytes <= 5461)
184  return 3;
185 
186  if (Bytes <= 8192)
187  return 2;
188 
189  return 1;
190 }
191 
192 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
193  const Function &F) const {
194  // Default minimum/maximum flat work group sizes.
195  std::pair<unsigned, unsigned> Default =
197  std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
198  getWavefrontSize() * 4) :
199  std::pair<unsigned, unsigned>(1, getWavefrontSize());
200 
201  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
202  // starts using "amdgpu-flat-work-group-size" attribute.
203  Default.second = AMDGPU::getIntegerAttribute(
204  F, "amdgpu-max-work-group-size", Default.second);
205  Default.first = std::min(Default.first, Default.second);
206 
207  // Requested minimum/maximum flat work group sizes.
208  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
209  F, "amdgpu-flat-work-group-size", Default);
210 
211  // Make sure requested minimum does not exceed requested maximum.
212  if (Requested.first > Requested.second)
213  return Default;
214 
215  // Make sure requested values do not violate subtarget's specifications.
216  if (Requested.first < getMinFlatWorkGroupSize())
217  return Default;
218  if (Requested.second > getMaxFlatWorkGroupSize())
219  return Default;
220 
221  return Requested;
222 }
223 
224 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
225  const Function &F) const {
226  // Default minimum/maximum number of waves per execution unit.
227  std::pair<unsigned, unsigned> Default(1, 0);
228 
229  // Default/requested minimum/maximum flat work group sizes.
230  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
231 
232  // If minimum/maximum flat work group sizes were explicitly requested using
233  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
234  // number of waves per execution unit to values implied by requested
235  // minimum/maximum flat work group sizes.
236  unsigned MinImpliedByFlatWorkGroupSize =
237  getMaxWavesPerEU(FlatWorkGroupSizes.second);
238  bool RequestedFlatWorkGroupSize = false;
239 
240  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
241  // starts using "amdgpu-flat-work-group-size" attribute.
242  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
243  F.hasFnAttribute("amdgpu-flat-work-group-size")) {
244  Default.first = MinImpliedByFlatWorkGroupSize;
245  RequestedFlatWorkGroupSize = true;
246  }
247 
248  // Requested minimum/maximum number of waves per execution unit.
249  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
250  F, "amdgpu-waves-per-eu", Default, true);
251 
252  // Make sure requested minimum is less than requested maximum.
253  if (Requested.second && Requested.first > Requested.second)
254  return Default;
255 
256  // Make sure requested values do not violate subtarget's specifications.
257  if (Requested.first < getMinWavesPerEU() ||
258  Requested.first > getMaxWavesPerEU())
259  return Default;
260  if (Requested.second > getMaxWavesPerEU())
261  return Default;
262 
263  // Make sure requested values are compatible with values implied by requested
264  // minimum/maximum flat work group sizes.
265  if (RequestedFlatWorkGroupSize &&
266  Requested.first > MinImpliedByFlatWorkGroupSize)
267  return Default;
268 
269  return Requested;
270 }
271 
273  const TargetMachine &TM) :
274  AMDGPUSubtarget(TT, GPU, FS, TM),
275  InstrInfo(*this),
276  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
277  TLInfo(TM, *this) {}
278 
280  const TargetMachine &TM) :
281  AMDGPUSubtarget(TT, GPU, FS, TM),
282  InstrInfo(*this),
283  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
284  TLInfo(TM, *this) {}
285 
287  unsigned NumRegionInstrs) const {
288  // Track register pressure so the scheduler can try to decrease
289  // pressure once register usage is above the threshold defined by
290  // SIRegisterInfo::getRegPressureSetLimit()
291  Policy.ShouldTrackPressure = true;
292 
293  // Enabling both top down and bottom up scheduling seems to give us less
294  // register spills than just using one of these approaches on its own.
295  Policy.OnlyTopDown = false;
296  Policy.OnlyBottomUp = false;
297 
298  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
299  if (!enableSIScheduler())
300  Policy.ShouldTrackLaneMasks = true;
301 }
302 
305 }
306 
308  unsigned ExplicitArgBytes) const {
309  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
310  if (ImplicitBytes == 0)
311  return ExplicitArgBytes;
312 
313  unsigned Alignment = getAlignmentForImplicitArgPtr();
314  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
315 }
316 
317 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
319  if (SGPRs <= 80)
320  return 10;
321  if (SGPRs <= 88)
322  return 9;
323  if (SGPRs <= 100)
324  return 8;
325  return 7;
326  }
327  if (SGPRs <= 48)
328  return 10;
329  if (SGPRs <= 56)
330  return 9;
331  if (SGPRs <= 64)
332  return 8;
333  if (SGPRs <= 72)
334  return 7;
335  if (SGPRs <= 80)
336  return 6;
337  return 5;
338 }
339 
340 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
341  if (VGPRs <= 24)
342  return 10;
343  if (VGPRs <= 28)
344  return 9;
345  if (VGPRs <= 32)
346  return 8;
347  if (VGPRs <= 36)
348  return 7;
349  if (VGPRs <= 40)
350  return 6;
351  if (VGPRs <= 48)
352  return 5;
353  if (VGPRs <= 64)
354  return 4;
355  if (VGPRs <= 84)
356  return 3;
357  if (VGPRs <= 128)
358  return 2;
359  return 1;
360 }
361 
362 unsigned SISubtarget::getMaxNumSGPRs() const {
363  if (hasSGPRInitBug())
365 
367  return 102;
368 
369  return 104;
370 }
AMDGPU specific subclass of TargetSubtarget.
bool isVGPRSpillingEnabled(const Function &F) const
AMDGPUSubtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount...
uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the next integer (mod 2**64) that is greater than or equal to Value and is a multiple of Alig...
Definition: MathExtras.h:664
int getLocalMemorySize() const
unsigned getImplicitArgNumBytes(const MachineFunction &MF) const
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:165
std::pair< int, int > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< int, int > Default, bool OnlyFirstRequired)
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const
Return the maximum number of waves per SIMD for kernels using SGPRs SGPRs.
bool enableSIScheduler() const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
#define F(x, y, z)
Definition: MD5.cpp:51
SISubtarget(const Triple &TT, StringRef CPU, StringRef FS, const TargetMachine &TM)
Function Alias Analysis false
Generation getGeneration() const
Maximum length of the test input libFuzzer tries to guess a good value based on the corpus and reports it always prefer smaller inputs during the corpus shuffle When libFuzzer itself reports a bug this exit code will be used If indicates the maximal total time in seconds to run the fuzzer minimizes the provided crash input Use with etc Experimental Use value profile to guide fuzzing Number of simultaneous worker processes to run the jobs If min(jobs, NumberOfCpuCores()/2)\" is used.") FUZZER_FLAG_INT(reload
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE bool contains(StringRef Other) const
Return true if the given string is a substring of *this, and false otherwise.
Definition: StringRef.h:445
void ParseSubtargetFeatures(StringRef CPU, StringRef FS)
bool hasSGPRInitBug() const
unsigned getMaxWavesPerEU() const
bool isShader(CallingConv::ID cc)
unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const
R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const TargetMachine &TM)
unsigned getMinFlatWorkGroupSize() const
bool isCompute(CallingConv::ID cc)
static cl::opt< bool > ScalarizeGlobal("amdgpu-scalarize-global-loads", cl::desc("Enable global load scalarization"), cl::init(false), cl::Hidden)
bool ShouldTrackLaneMasks
Track LaneMasks to allow reordering of independent subregister writes of the same vreg...
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const
Return the maximum number of waves per SIMD for kernels using VGPRs VGPRs.
unsigned getMaxFlatWorkGroupSize() const
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const
Inverse of getMaxLocalMemWithWaveCount.
unsigned getMinWavesPerEU() const
Information about stack frame layout on the target.
bool isAmdHsaOS() const
Define a generic scheduling policy for targets that don't provide their own MachineSchedStrategy.
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.h:226
int getIntegerAttribute(const Function &F, StringRef Name, int Default)
void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override
~AMDGPUSubtarget() override
unsigned getWavefrontSize() const
AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const TargetMachine &TM)
static cl::opt< bool > EnableLoadStoreOpt("aarch64-enable-ldst-opt", cl::desc("Enable the load/store pair"" optimization pass"), cl::init(true), cl::Hidden)
Primary interface to the complete machine description for the target machine.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:47
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
unsigned getMaxNumSGPRs() const
unsigned getAlignmentForImplicitArgPtr() const