LLVM 20.0.0git
AMDGPUSubtarget.cpp
Go to the documentation of this file.
1//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Implements the AMDGPU specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUSubtarget.h"
15#include "AMDGPUCallLowering.h"
17#include "AMDGPULegalizerInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
21#include "R600Subtarget.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include "llvm/IR/IntrinsicsR600.h"
31#include "llvm/IR/MDBuilder.h"
33#include <algorithm>
34
35using namespace llvm;
36
37#define DEBUG_TYPE "amdgpu-subtarget"
38
39#define GET_SUBTARGETINFO_TARGET_DESC
40#define GET_SUBTARGETINFO_CTOR
41#define AMDGPUSubtarget GCNSubtarget
42#include "AMDGPUGenSubtargetInfo.inc"
43#undef AMDGPUSubtarget
44
46 "amdgpu-enable-power-sched",
47 cl::desc("Enable scheduling to minimize mAI power bursts"),
48 cl::init(false));
49
51 "amdgpu-vgpr-index-mode",
52 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
53 cl::init(false));
54
// Command-line flag "amdgpu-use-aa-in-codegen" (on by default). Its value is
// what GCNSubtarget::useAA() returns, i.e. it decides whether alias analysis
// (AA) is used during code generation for this target.
55static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
56 cl::desc("Enable the use of AA during codegen."),
57 cl::init(true));
58
59static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
60 cl::desc("Number of addresses from which to enable MIMG NSA."),
62
64
67 StringRef GPU, StringRef FS) {
68 // Determine default and user-specified characteristics
69 //
70 // We want to be able to turn these off, but making this a subtarget feature
71 // for SI has the unhelpful behavior that it unsets everything else if you
72 // disable it.
73 //
74 // Similarly we want enable-prt-strict-null to be on by default and not to
75 // unset everything else if it is disabled
76
77 SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
78
79 // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
80 if (isAmdHsaOS())
81 FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
82
83 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
84
85 // Disable mutually exclusive bits.
86 if (FS.contains_insensitive("+wavefrontsize")) {
87 if (!FS.contains_insensitive("wavefrontsize16"))
88 FullFS += "-wavefrontsize16,";
89 if (!FS.contains_insensitive("wavefrontsize32"))
90 FullFS += "-wavefrontsize32,";
91 if (!FS.contains_insensitive("wavefrontsize64"))
92 FullFS += "-wavefrontsize64,";
93 }
94
95 FullFS += FS;
96
97 ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
98
99 // Implement the "generic" processors, which act as the default when no
100 // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
101 // the first amdgcn target that supports flat addressing. Other OSes default
102 // to the first amdgcn target.
106 }
107
108 if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
109 !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
110 // If there is no default wave size it must be a generation before gfx10,
111 // these have FeatureWavefrontSize64 in their definition already. For gfx10+
112 // set wave32 as a default.
113 ToggleFeature(AMDGPU::FeatureWavefrontSize32);
114 }
115
116 // We don't support FP64 for EG/NI atm.
118
119 // Targets must either support 64-bit offsets for MUBUF instructions, and/or
120 // support flat operations, otherwise they cannot access a 64-bit global
121 // address space
122 assert(hasAddr64() || hasFlat());
123 // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
124 // that do not support ADDR64 variants of MUBUF instructions. Such targets
125 // cannot use a 64 bit offset with a MUBUF instruction to access the global
126 // address space
127 if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
128 ToggleFeature(AMDGPU::FeatureFlatForGlobal);
129 FlatForGlobal = true;
130 }
131 // Unless +-flat-for-global is specified, use MUBUF instructions for global
132 // address space access if flat operations are not available.
133 if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
134 ToggleFeature(AMDGPU::FeatureFlatForGlobal);
135 FlatForGlobal = false;
136 }
137
138 // Set defaults if needed.
139 if (MaxPrivateElementSize == 0)
141
142 if (LDSBankCount == 0)
143 LDSBankCount = 32;
144
145 if (TT.getArch() == Triple::amdgcn) {
146 if (LocalMemorySize == 0)
147 LocalMemorySize = 32768;
148
149 // Do something sensible for unspecified target.
151 HasMovrel = true;
152 }
153
155
156 if (AMDGPU::isGFX10Plus(*this) &&
157 !getFeatureBits().test(AMDGPU::FeatureCuMode))
158 LocalMemorySize *= 2;
159
160 // Don't crash on invalid devices.
161 if (WavefrontSizeLog2 == 0)
163
166
168
169 LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
170 << TargetID.getXnackSetting() << '\n');
171 LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
172 << TargetID.getSramEccSetting() << '\n');
173
174 return *this;
175}
176
178 LLVMContext &Ctx = F.getContext();
179 if (hasFeature(AMDGPU::FeatureWavefrontSize32) ==
180 hasFeature(AMDGPU::FeatureWavefrontSize64)) {
182 F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
183 }
184}
185
187
190}
191
193 const GCNTargetMachine &TM)
194 : // clang-format off
195 AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
196 AMDGPUSubtarget(TT),
197 TargetTriple(TT),
198 TargetID(*this),
199 InstrItins(getInstrItineraryForCPU(GPU)),
200 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
201 TLInfo(TM, *this),
202 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
203 // clang-format on
206 CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
207 InlineAsmLoweringInfo =
208 std::make_unique<InlineAsmLowering>(getTargetLowering());
209 Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
210 RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
211 InstSelector =
212 std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
213}
214
215unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
216 if (getGeneration() < GFX10)
217 return 1;
218
219 switch (Opcode) {
220 case AMDGPU::V_LSHLREV_B64_e64:
221 case AMDGPU::V_LSHLREV_B64_gfx10:
222 case AMDGPU::V_LSHLREV_B64_e64_gfx11:
223 case AMDGPU::V_LSHLREV_B64_e32_gfx12:
224 case AMDGPU::V_LSHLREV_B64_e64_gfx12:
225 case AMDGPU::V_LSHL_B64_e64:
226 case AMDGPU::V_LSHRREV_B64_e64:
227 case AMDGPU::V_LSHRREV_B64_gfx10:
228 case AMDGPU::V_LSHRREV_B64_e64_gfx11:
229 case AMDGPU::V_LSHRREV_B64_e64_gfx12:
230 case AMDGPU::V_LSHR_B64_e64:
231 case AMDGPU::V_ASHRREV_I64_e64:
232 case AMDGPU::V_ASHRREV_I64_gfx10:
233 case AMDGPU::V_ASHRREV_I64_e64_gfx11:
234 case AMDGPU::V_ASHRREV_I64_e64_gfx12:
235 case AMDGPU::V_ASHR_I64_e64:
236 return 1;
237 }
238
239 return 2;
240}
241
242/// This list was mostly derived from experimentation.
243bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
244 switch (Opcode) {
245 case AMDGPU::V_CVT_F16_F32_e32:
246 case AMDGPU::V_CVT_F16_F32_e64:
247 case AMDGPU::V_CVT_F16_U16_e32:
248 case AMDGPU::V_CVT_F16_U16_e64:
249 case AMDGPU::V_CVT_F16_I16_e32:
250 case AMDGPU::V_CVT_F16_I16_e64:
251 case AMDGPU::V_RCP_F16_e64:
252 case AMDGPU::V_RCP_F16_e32:
253 case AMDGPU::V_RSQ_F16_e64:
254 case AMDGPU::V_RSQ_F16_e32:
255 case AMDGPU::V_SQRT_F16_e64:
256 case AMDGPU::V_SQRT_F16_e32:
257 case AMDGPU::V_LOG_F16_e64:
258 case AMDGPU::V_LOG_F16_e32:
259 case AMDGPU::V_EXP_F16_e64:
260 case AMDGPU::V_EXP_F16_e32:
261 case AMDGPU::V_SIN_F16_e64:
262 case AMDGPU::V_SIN_F16_e32:
263 case AMDGPU::V_COS_F16_e64:
264 case AMDGPU::V_COS_F16_e32:
265 case AMDGPU::V_FLOOR_F16_e64:
266 case AMDGPU::V_FLOOR_F16_e32:
267 case AMDGPU::V_CEIL_F16_e64:
268 case AMDGPU::V_CEIL_F16_e32:
269 case AMDGPU::V_TRUNC_F16_e64:
270 case AMDGPU::V_TRUNC_F16_e32:
271 case AMDGPU::V_RNDNE_F16_e64:
272 case AMDGPU::V_RNDNE_F16_e32:
273 case AMDGPU::V_FRACT_F16_e64:
274 case AMDGPU::V_FRACT_F16_e32:
275 case AMDGPU::V_FREXP_MANT_F16_e64:
276 case AMDGPU::V_FREXP_MANT_F16_e32:
277 case AMDGPU::V_FREXP_EXP_I16_F16_e64:
278 case AMDGPU::V_FREXP_EXP_I16_F16_e32:
279 case AMDGPU::V_LDEXP_F16_e64:
280 case AMDGPU::V_LDEXP_F16_e32:
281 case AMDGPU::V_LSHLREV_B16_e64:
282 case AMDGPU::V_LSHLREV_B16_e32:
283 case AMDGPU::V_LSHRREV_B16_e64:
284 case AMDGPU::V_LSHRREV_B16_e32:
285 case AMDGPU::V_ASHRREV_I16_e64:
286 case AMDGPU::V_ASHRREV_I16_e32:
287 case AMDGPU::V_ADD_U16_e64:
288 case AMDGPU::V_ADD_U16_e32:
289 case AMDGPU::V_SUB_U16_e64:
290 case AMDGPU::V_SUB_U16_e32:
291 case AMDGPU::V_SUBREV_U16_e64:
292 case AMDGPU::V_SUBREV_U16_e32:
293 case AMDGPU::V_MUL_LO_U16_e64:
294 case AMDGPU::V_MUL_LO_U16_e32:
295 case AMDGPU::V_ADD_F16_e64:
296 case AMDGPU::V_ADD_F16_e32:
297 case AMDGPU::V_SUB_F16_e64:
298 case AMDGPU::V_SUB_F16_e32:
299 case AMDGPU::V_SUBREV_F16_e64:
300 case AMDGPU::V_SUBREV_F16_e32:
301 case AMDGPU::V_MUL_F16_e64:
302 case AMDGPU::V_MUL_F16_e32:
303 case AMDGPU::V_MAX_F16_e64:
304 case AMDGPU::V_MAX_F16_e32:
305 case AMDGPU::V_MIN_F16_e64:
306 case AMDGPU::V_MIN_F16_e32:
307 case AMDGPU::V_MAX_U16_e64:
308 case AMDGPU::V_MAX_U16_e32:
309 case AMDGPU::V_MIN_U16_e64:
310 case AMDGPU::V_MIN_U16_e32:
311 case AMDGPU::V_MAX_I16_e64:
312 case AMDGPU::V_MAX_I16_e32:
313 case AMDGPU::V_MIN_I16_e64:
314 case AMDGPU::V_MIN_I16_e32:
315 case AMDGPU::V_MAD_F16_e64:
316 case AMDGPU::V_MAD_U16_e64:
317 case AMDGPU::V_MAD_I16_e64:
318 case AMDGPU::V_FMA_F16_e64:
319 case AMDGPU::V_DIV_FIXUP_F16_e64:
320 // On gfx10, all 16-bit instructions preserve the high bits.
322 case AMDGPU::V_MADAK_F16:
323 case AMDGPU::V_MADMK_F16:
324 case AMDGPU::V_MAC_F16_e64:
325 case AMDGPU::V_MAC_F16_e32:
326 case AMDGPU::V_FMAMK_F16:
327 case AMDGPU::V_FMAAK_F16:
328 case AMDGPU::V_FMAC_F16_e64:
329 case AMDGPU::V_FMAC_F16_e32:
330 // In gfx9, the preferred handling of the unused high 16-bits changed. Most
331 // instructions maintain the legacy behavior of 0ing. Some instructions
332 // changed to preserving the high bits.
334 case AMDGPU::V_MAD_MIXLO_F16:
335 case AMDGPU::V_MAD_MIXHI_F16:
336 default:
337 return false;
338 }
339}
340
341// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
342// allows the given function to achieve an occupancy of NWaves waves per
343// SIMD / EU, taking into account only the function's *maximum* workgroup size.
344unsigned
346 const Function &F) const {
347 const unsigned WaveSize = getWavefrontSize();
348 const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
349 const unsigned WavesPerWorkgroup =
350 std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);
351
352 const unsigned WorkGroupsPerCU =
353 std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);
354
355 return getLocalMemorySize() / WorkGroupsPerCU;
356}
357
358// FIXME: Should return min,max range.
359//
360// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
361// be achieved when only the given function is running on the machine; and
362// taking into account the overall number of wave slots, the (maximum) workgroup
363// size, and the per-workgroup LDS allocation size.
365 const Function &F) const {
366 const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
367 const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
368 if (!MaxWorkGroupsPerCu)
369 return 0;
370
371 const unsigned WaveSize = getWavefrontSize();
372
373 // FIXME: Do we need to account for alignment requirement of LDS rounding the
374 // size up?
375 // Compute restriction based on LDS usage
376 unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
377
378 // This can be queried with more LDS than is possible, so just assume the
379 // worst.
380 if (NumGroups == 0)
381 return 1;
382
383 NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
384
385 // Round to the number of waves per CU.
386 const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
387 unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
388
389 // Number of waves per EU (SIMD).
390 MaxWaves = divideCeil(MaxWaves, getEUsPerCU());
391
392 // Clamp to the maximum possible number of waves.
393 MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
394
395 // FIXME: Needs to be a multiple of the group size?
396 //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
397
398 assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
399 "computed invalid occupancy");
400 return MaxWaves;
401}
402
403unsigned
405 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
406 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
407}
408
409std::pair<unsigned, unsigned>
411 switch (CC) {
418 return std::pair(1, getWavefrontSize());
419 default:
420 return std::pair(1u, getMaxFlatWorkGroupSize());
421 }
422}
423
424std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
425 const Function &F) const {
426 // Default minimum/maximum flat work group sizes.
427 std::pair<unsigned, unsigned> Default =
428 getDefaultFlatWorkGroupSize(F.getCallingConv());
429
430 // Requested minimum/maximum flat work group sizes.
431 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
432 F, "amdgpu-flat-work-group-size", Default);
433
434 // Make sure requested minimum is less than requested maximum.
435 if (Requested.first > Requested.second)
436 return Default;
437
438 // Make sure requested values do not violate subtarget's specifications.
439 if (Requested.first < getMinFlatWorkGroupSize())
440 return Default;
441 if (Requested.second > getMaxFlatWorkGroupSize())
442 return Default;
443
444 return Requested;
445}
446
447std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
448 std::pair<unsigned, unsigned> Requested,
449 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
450 // Default minimum/maximum number of waves per execution unit.
451 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
452
453 // If minimum/maximum flat work group sizes were explicitly requested using
454 // "amdgpu-flat-workgroup-size" attribute, then set default minimum/maximum
455 // number of waves per execution unit to values implied by requested
456 // minimum/maximum flat work group sizes.
457 unsigned MinImpliedByFlatWorkGroupSize =
458 getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
459 Default.first = MinImpliedByFlatWorkGroupSize;
460
461 // Make sure requested minimum is less than requested maximum.
462 if (Requested.second && Requested.first > Requested.second)
463 return Default;
464
465 // Make sure requested values do not violate subtarget's specifications.
466 if (Requested.first < getMinWavesPerEU() ||
467 Requested.second > getMaxWavesPerEU())
468 return Default;
469
470 // Make sure requested values are compatible with values implied by requested
471 // minimum/maximum flat work group sizes.
472 if (Requested.first < MinImpliedByFlatWorkGroupSize)
473 return Default;
474
475 return Requested;
476}
477
478std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
479 const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
480 // Default minimum/maximum number of waves per execution unit.
481 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
482
483 // Requested minimum/maximum number of waves per execution unit.
484 std::pair<unsigned, unsigned> Requested =
485 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
486 return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
487}
488
489static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
490 auto Node = Kernel.getMetadata("reqd_work_group_size");
491 if (Node && Node->getNumOperands() == 3)
492 return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
493 return std::numeric_limits<unsigned>::max();
494}
495
497 return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
498}
499
501 unsigned Dimension) const {
502 unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
503 if (ReqdSize != std::numeric_limits<unsigned>::max())
504 return ReqdSize - 1;
505 return getFlatWorkGroupSizes(Kernel).second - 1;
506}
507
509 for (int I = 0; I < 3; ++I) {
510 if (getMaxWorkitemID(Func, I) > 0)
511 return false;
512 }
513
514 return true;
515}
516
518 Function *Kernel = I->getParent()->getParent();
519 unsigned MinSize = 0;
520 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
521 bool IdQuery = false;
522
523 // If reqd_work_group_size is present it narrows value down.
524 if (auto *CI = dyn_cast<CallInst>(I)) {
525 const Function *F = CI->getCalledFunction();
526 if (F) {
527 unsigned Dim = UINT_MAX;
528 switch (F->getIntrinsicID()) {
529 case Intrinsic::amdgcn_workitem_id_x:
530 case Intrinsic::r600_read_tidig_x:
531 IdQuery = true;
532 [[fallthrough]];
533 case Intrinsic::r600_read_local_size_x:
534 Dim = 0;
535 break;
536 case Intrinsic::amdgcn_workitem_id_y:
537 case Intrinsic::r600_read_tidig_y:
538 IdQuery = true;
539 [[fallthrough]];
540 case Intrinsic::r600_read_local_size_y:
541 Dim = 1;
542 break;
543 case Intrinsic::amdgcn_workitem_id_z:
544 case Intrinsic::r600_read_tidig_z:
545 IdQuery = true;
546 [[fallthrough]];
547 case Intrinsic::r600_read_local_size_z:
548 Dim = 2;
549 break;
550 default:
551 break;
552 }
553
554 if (Dim <= 3) {
555 unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
556 if (ReqdSize != std::numeric_limits<unsigned>::max())
557 MinSize = MaxSize = ReqdSize;
558 }
559 }
560 }
561
562 if (!MaxSize)
563 return false;
564
565 // Range metadata is [Lo, Hi). For ID query we need to pass max size
566 // as Hi. For size query we need to pass Hi + 1.
567 if (IdQuery)
568 MinSize = 0;
569 else
570 ++MaxSize;
571
572 APInt Lower{32, MinSize};
573 APInt Upper{32, MaxSize};
574 if (auto *CI = dyn_cast<CallBase>(I)) {
576 CI->addRangeRetAttr(Range);
577 } else {
578 MDBuilder MDB(I->getContext());
579 MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
580 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
581 }
582 return true;
583}
584
586 assert(AMDGPU::isKernel(F.getCallingConv()));
587
588 // We don't allocate the segment if we know the implicit arguments weren't
589 // used, even if the ABI implies we need them.
590 if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
591 return 0;
592
593 if (isMesaKernel(F))
594 return 16;
595
596 // Assume all implicit inputs are used by default
597 const Module *M = F.getParent();
598 unsigned NBytes =
600 return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
601 NBytes);
602}
603
605 Align &MaxAlign) const {
606 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
607 F.getCallingConv() == CallingConv::SPIR_KERNEL);
608
609 const DataLayout &DL = F.getDataLayout();
610 uint64_t ExplicitArgBytes = 0;
611 MaxAlign = Align(1);
612
613 for (const Argument &Arg : F.args()) {
614 const bool IsByRef = Arg.hasByRefAttr();
615 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
616 Align Alignment = DL.getValueOrABITypeAlignment(
617 IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
618 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
619 ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
620 MaxAlign = std::max(MaxAlign, Alignment);
621 }
622
623 return ExplicitArgBytes;
624}
625
627 Align &MaxAlign) const {
628 if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
629 F.getCallingConv() != CallingConv::SPIR_KERNEL)
630 return 0;
631
632 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
633
634 unsigned ExplicitOffset = getExplicitKernelArgOffset();
635
636 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
637 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
638 if (ImplicitBytes != 0) {
639 const Align Alignment = getAlignmentForImplicitArgPtr();
640 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
641 MaxAlign = std::max(MaxAlign, Alignment);
642 }
643
644 // Being able to dereference past the end is useful for emitting scalar loads.
645 return alignTo(TotalSize, 4);
646}
647
651}
652
654 unsigned NumRegionInstrs) const {
655 // Track register pressure so the scheduler can try to decrease
656 // pressure once register usage is above the threshold defined by
657 // SIRegisterInfo::getRegPressureSetLimit()
658 Policy.ShouldTrackPressure = true;
659
660 // Enabling both top down and bottom up scheduling seems to give us less
661 // register spills than just using one of these approaches on its own.
662 Policy.OnlyTopDown = false;
663 Policy.OnlyBottomUp = false;
664
665 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
666 if (!enableSIScheduler())
667 Policy.ShouldTrackLaneMasks = true;
668}
669
671 if (isWave32()) {
672 // Fix implicit $vcc operands after MIParser has verified that they match
673 // the instruction definitions.
674 for (auto &MBB : MF) {
675 for (auto &MI : MBB)
676 InstrInfo.fixImplicitOperands(MI);
677 }
678 }
679}
680
682 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
683}
684
687}
688
689bool GCNSubtarget::useAA() const { return UseAA; }
690
691unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
693 getGeneration());
694}
695
696unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
698}
699
700unsigned
701GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
703 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
704
705 if (HasFlatScratch || HasArchitectedFlatScratch) {
707 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
709 return 4; // FLAT_SCRATCH, VCC (in that order).
710 }
711
712 if (isXNACKEnabled())
713 return 4; // XNACK, VCC (in that order).
714 return 2; // VCC.
715}
716
720}
721
723 // In principle we do not need to reserve SGPR pair used for flat_scratch if
724 // we know flat instructions do not access the stack anywhere in the
725 // program. For now assume it's needed if we have flat instructions.
726 const bool KernelUsesFlatScratch = hasFlatAddressSpace();
727 return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
728}
729
730unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
731 unsigned NumSGPRs,
732 unsigned NumVGPRs) const {
733 unsigned Occupancy =
734 std::min(getMaxWavesPerEU(),
736 if (NumSGPRs)
737 Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
738 if (NumVGPRs)
739 Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
740 return Occupancy;
741}
742
744 const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
745 unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
746 // Compute maximum number of SGPRs function can use using default/requested
747 // minimum number of waves per execution unit.
748 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
749 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
750
751 // Check if maximum number of SGPRs was explicitly requested using
752 // "amdgpu-num-sgpr" attribute.
753 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
754 unsigned Requested =
755 F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);
756
757 // Make sure requested value does not violate subtarget's specifications.
758 if (Requested && (Requested <= ReservedNumSGPRs))
759 Requested = 0;
760
761 // If more SGPRs are required to support the input user/system SGPRs,
762 // increase to accommodate them.
763 //
764 // FIXME: This really ends up using the requested number of SGPRs + number
765 // of reserved special registers in total. Theoretically you could re-use
766 // the last input registers for these special registers, but this would
767 // require a lot of complexity to deal with the weird aliasing.
768 unsigned InputNumSGPRs = PreloadedSGPRs;
769 if (Requested && Requested < InputNumSGPRs)
770 Requested = InputNumSGPRs;
771
772 // Make sure requested value is compatible with values implied by
773 // default/requested minimum/maximum number of waves per execution unit.
774 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
775 Requested = 0;
776 if (WavesPerEU.second &&
777 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
778 Requested = 0;
779
780 if (Requested)
781 MaxNumSGPRs = Requested;
782 }
783
784 if (hasSGPRInitBug())
786
787 return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
788}
789
791 const Function &F = MF.getFunction();
795}
796
797static unsigned getMaxNumPreloadedSGPRs() {
798 using USI = GCNUserSGPRUsageInfo;
799 // Max number of user SGPRs
800 const unsigned MaxUserSGPRs =
801 USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
802 USI::getNumUserSGPRForField(USI::DispatchPtrID) +
803 USI::getNumUserSGPRForField(USI::QueuePtrID) +
804 USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
805 USI::getNumUserSGPRForField(USI::DispatchIdID) +
806 USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
807 USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);
808
809 // Max number of system SGPRs
810 const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
811 1 + // WorkGroupIDY
812 1 + // WorkGroupIDZ
813 1 + // WorkGroupInfo
814 1; // private segment wave byte offset
815
816 // Max number of synthetic SGPRs
817 const unsigned SyntheticSGPRs = 1; // LDSKernelId
818
819 return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
820}
821
825}
826
828 const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
829 // Compute maximum number of VGPRs function can use using default/requested
830 // minimum number of waves per execution unit.
831 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
832
833 // Check if maximum number of VGPRs was explicitly requested using
834 // "amdgpu-num-vgpr" attribute.
835 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
836 unsigned Requested =
837 F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);
838
839 if (hasGFX90AInsts())
840 Requested *= 2;
841
842 // Make sure requested value is compatible with values implied by
843 // default/requested minimum/maximum number of waves per execution unit.
844 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
845 Requested = 0;
846 if (WavesPerEU.second &&
847 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
848 Requested = 0;
849
850 if (Requested)
851 MaxNumVGPRs = Requested;
852 }
853
854 return MaxNumVGPRs;
855}
856
859}
860
862 const Function &F = MF.getFunction();
864 return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
865}
866
868 SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
869 const TargetSchedModel *SchedModel) const {
870 if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
871 !Def->isInstr() || !Use->isInstr())
872 return;
873
874 MachineInstr *DefI = Def->getInstr();
875 MachineInstr *UseI = Use->getInstr();
876
877 if (DefI->isBundle()) {
879 auto Reg = Dep.getReg();
882 unsigned Lat = 0;
883 for (++I; I != E && I->isBundledWithPred(); ++I) {
884 if (I->modifiesRegister(Reg, TRI))
885 Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
886 else if (Lat)
887 --Lat;
888 }
889 Dep.setLatency(Lat);
890 } else if (UseI->isBundle()) {
892 auto Reg = Dep.getReg();
895 unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
896 for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
897 if (I->readsRegister(Reg, TRI))
898 break;
899 --Lat;
900 }
901 Dep.setLatency(Lat);
902 } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
903 // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
904 // implicit operands which come from the MCInstrDesc, which can fool
905 // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
906 // pseudo operands.
908 DefI, DefOpIdx, UseI, UseOpIdx));
909 }
910}
911
912namespace {
913struct FillMFMAShadowMutation : ScheduleDAGMutation {
914 const SIInstrInfo *TII;
915
916 ScheduleDAGMI *DAG;
917
918 FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
919
920 bool isSALU(const SUnit *SU) const {
921 const MachineInstr *MI = SU->getInstr();
922 return MI && TII->isSALU(*MI) && !MI->isTerminator();
923 }
924
925 bool isVALU(const SUnit *SU) const {
926 const MachineInstr *MI = SU->getInstr();
927 return MI && TII->isVALU(*MI);
928 }
929
930 // Link as many SALU instructions in chain as possible. Return the size
931 // of the chain. Links up to MaxChain instructions.
//
// From     - the unit the chain is attached to (the MFMA found by apply()).
// To       - the first SALU candidate; the walk starts here.
// MaxChain - budget for the walk, see the note on the loop condition.
// Visited  - units already processed; shared with the caller so a unit is
//            handled at most once.
//
// The return value counts only the artificial edges successfully added
// between From and a chain member; the extra VALU edges are not counted.
932 unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
933 SmallPtrSetImpl<SUnit *> &Visited) const {
934 SmallVector<SUnit *, 8> Worklist({To});
935 unsigned Linked = 0;
936
// MaxChain is decremented on every iteration, including ones that pop an
// already-visited unit and `continue`, so it bounds the number of units
// examined rather than the number of edges added.
937 while (!Worklist.empty() && MaxChain-- > 0) {
938 SUnit *SU = Worklist.pop_back_val();
939 if (!Visited.insert(SU).second)
940 continue;
941
942 LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
943 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
944
// Add an artificial dependency of SU on From, provided the DAG reports the
// edge can be added; only a successful addEdge counts toward the result.
945 if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
946 if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
947 ++Linked;
948
// For each VALU successor of From, add an artificial dependency of that
// successor on SU where the DAG allows it.
949 for (SDep &SI : From->Succs) {
950 SUnit *SUv = SI.getSUnit();
951 if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
952 DAG->canAddEdge(SUv, SU))
953 DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
954 }
955
// Extend the chain: queue SU's SALU successors for later iterations.
956 for (SDep &SI : SU->Succs) {
957 SUnit *Succ = SI.getSUnit();
958 if (Succ != SU && isSALU(Succ))
959 Worklist.push_back(Succ);
960 }
961 }
962
963 return Linked;
964 }
965
966 void apply(ScheduleDAGInstrs *DAGInstrs) override {
967 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
968 if (!ST.hasMAIInsts())
969 return;
970 DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
971 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
972 if (!TSchedModel || DAG->SUnits.empty())
973 return;
974
975 // Scan for MFMA long latency instructions and try to add a dependency
976 // of available SALU instructions to give them a chance to fill MFMA
977 // shadow. That is desirable to fill MFMA shadow with SALU instructions
978 // rather than VALU to prevent power consumption bursts and throttle.
979 auto LastSALU = DAG->SUnits.begin();
980 auto E = DAG->SUnits.end();
982 for (SUnit &SU : DAG->SUnits) {
983 MachineInstr &MAI = *SU.getInstr();
984 if (!TII->isMAI(MAI) ||
985 MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
986 MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
987 continue;
988
989 unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
990
991 LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
992 dbgs() << "Need " << Lat
993 << " instructions to cover latency.\n");
994
995 // Find up to Lat independent scalar instructions as early as
996 // possible such that they can be scheduled after this MFMA.
997 for ( ; Lat && LastSALU != E; ++LastSALU) {
998 if (Visited.count(&*LastSALU))
999 continue;
1000
1001 if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
1002 !DAG->canAddEdge(&*LastSALU, &SU))
1003 continue;
1004
1005 Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
1006 }
1007 }
1008 }
1009};
1010} // namespace
1011
1013 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
1014 Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
1015}
1016
1017std::unique_ptr<ScheduleDAGMutation>
1019 return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
1020 : nullptr;
1021}
1022
1025 return 0; // Not MIMG encoding.
1026
1027 if (NSAThreshold.getNumOccurrences() > 0)
1028 return std::max(NSAThreshold.getValue(), 2u);
1029
1031 "amdgpu-nsa-threshold", -1);
1032 if (Value > 0)
1033 return std::max(Value, 2);
1034
1035 return 3;
1036}
1037
1040 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
1041 return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
1042}
1043
1045 if (TM.getTargetTriple().getArch() == Triple::amdgcn)
1046 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
1047 return static_cast<const AMDGPUSubtarget &>(
1048 TM.getSubtarget<R600Subtarget>(F));
1049}
1050
1052 const GCNSubtarget &ST)
1053 : ST(ST) {
1054 const CallingConv::ID CC = F.getCallingConv();
1055 const bool IsKernel =
1057 // FIXME: Should have analysis or something rather than attribute to detect
1058 // calls.
1059 const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
1060 // FIXME: This attribute is a hack, we just need an analysis on the function
1061 // to look for allocas.
1062 const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
1063
1064 if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
1065 KernargSegmentPtr = true;
1066
1067 bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
1068 if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
1069 PrivateSegmentBuffer = true;
1070 else if (ST.isMesaGfxShader(F))
1071 ImplicitBufferPtr = true;
1072
1073 if (!AMDGPU::isGraphics(CC)) {
1074 if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
1075 DispatchPtr = true;
1076
1077 // FIXME: Can this always be disabled with < COv5?
1078 if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
1079 QueuePtr = true;
1080
1081 if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
1082 DispatchID = true;
1083 }
1084
1085 // TODO: This could be refined a lot. The attribute is a poor way of
1086 // detecting calls or stack objects that may require it before argument
1087 // lowering.
1088 if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
1089 (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
1090 (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
1091 !ST.flatScratchIsArchitected()) {
1092 FlatScratchInit = true;
1093 }
1094
1096 NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);
1097
1100
1101 if (hasDispatchPtr())
1102 NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);
1103
1104 if (hasQueuePtr())
1105 NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);
1106
1108 NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);
1109
1110 if (hasDispatchID())
1111 NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);
1112
1113 if (hasFlatScratchInit())
1114 NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
1115
1117 NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
1118}
1119
1121 assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
1122 NumKernargPreloadSGPRs += NumSGPRs;
1123 NumUsedUserSGPRs += NumSGPRs;
1124}
1125
1127 return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
1128}
1129
1132 return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
1133}
@ HasCalls
static cl::opt< bool > UseAA("aarch64-use-aa", cl::init(true), cl::desc("Enable the use of AA during codegen."))
This file describes how to lower LLVM calls to machine code calls.
This file declares the targeting of the InstructionSelector class for AMDGPU.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
static cl::opt< bool > EnableVGPRIndexMode("amdgpu-vgpr-index-mode", cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false))
static cl::opt< unsigned > NSAThreshold("amdgpu-nsa-threshold", cl::desc("Number of addresses from which to enable MIMG NSA."), cl::init(3), cl::Hidden)
static cl::opt< bool > EnablePowerSched("amdgpu-enable-power-sched", cl::desc("Enable scheduling to minimize mAI power bursts"), cl::init(false))
static unsigned getMaxNumPreloadedSGPRs()
static cl::opt< bool > UseAA("amdgpu-use-aa-in-codegen", cl::desc("Enable the use of AA during codegen."), cl::init(true))
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim)
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
BlockVerifier::State From
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DEBUG(X)
Definition: Debug.h:101
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
This file describes how to lower LLVM inline asm to machine code INLINEASM.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
modulo schedule test
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
if(PassOpts->AAPipeline)
AMDGPU R600 specific subclass of TargetSubtarget.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallString class.
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const
Inverse of getMaxLocalMemWithWaveCount.
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
Align getAlignmentForImplicitArgPtr() const
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
bool isMesaKernel(const Function &F) const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
virtual unsigned getMinWavesPerEU() const =0
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on a workitemid.* intrinsic call or load.
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getLocalMemorySize() const
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
virtual unsigned getMaxFlatWorkGroupSize() const =0
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getMaxWavesPerEU() const
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
unsigned AddressableLocalMemorySize
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > WavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes) const
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
unsigned getWavefrontSize() const
virtual unsigned getMinFlatWorkGroupSize() const =0
void setTargetIDFromFeaturesString(StringRef FS)
TargetIDSetting getXnackSetting() const
TargetIDSetting getSramEccSetting() const
Class for arbitrary precision integers.
Definition: APInt.h:78
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
This class represents a range of values.
Definition: ConstantRange.h:47
A parsed version of the target data layout string, and methods for querying it.
Definition: DataLayout.h:63
Diagnostic information for unsupported feature in backend.
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:781
bool hasFlat() const
Definition: GCNSubtarget.h:385
bool useVGPRIndexMode() const
void mirFileLoaded(MachineFunction &MF) const override
unsigned MaxPrivateElementSize
Definition: GCNSubtarget.h:66
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS)
bool hasGFX90AInsts() const
unsigned computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Return occupancy for the given function.
unsigned getBaseMaxNumVGPRs(const Function &F, std::pair< unsigned, unsigned > WavesPerEU) const
unsigned getConstantBusLimit(unsigned Opcode) const
const InstrItineraryData * getInstrItineraryData() const override
Definition: GCNSubtarget.h:311
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const override
bool hasSGPRInitBug() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:278
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
unsigned getBaseMaxNumSGPRs(const Function &F, std::pair< unsigned, unsigned > WavesPerEU, unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const
GCNSubtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:274
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasFlatAddressSpace() const
Definition: GCNSubtarget.h:619
unsigned getReservedNumSGPRs(const MachineFunction &MF) const
bool hasMovrel() const
Definition: GCNSubtarget.h:989
bool useAA() const override
bool isWave32() const
bool hasVGPRIndexMode() const
Definition: GCNSubtarget.h:993
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const
Return the maximum number of waves per SIMD for kernels using VGPRs VGPRs.
bool HasArchitectedFlatScratch
Definition: GCNSubtarget.h:204
std::unique_ptr< ScheduleDAGMutation > createFillMFMAShadowMutation(const TargetInstrInfo *TII) const
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const
Return the maximum number of waves per SIMD for kernels using SGPRs SGPRs.
unsigned getMaxWavesPerEU() const
Generation getGeneration() const
Definition: GCNSubtarget.h:317
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM)
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const
bool isXNACKEnabled() const
Definition: GCNSubtarget.h:605
unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const
bool enableSIScheduler() const
bool hasAddr64() const
Definition: GCNSubtarget.h:381
bool hasFP64() const
Definition: GCNSubtarget.h:361
void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override
void getPostRAMutations(std::vector< std::unique_ptr< ScheduleDAGMutation > > &Mutations) const override
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
~GCNSubtarget() override
AMDGPU::IsaInfo::AMDGPUTargetID TargetID
Definition: GCNSubtarget.h:62
static unsigned getNumUserSGPRForField(UserSGPRID ID)
bool hasKernargSegmentPtr() const
void allocKernargPreloadSGPRs(unsigned NumSGPRs)
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST)
MDNode * getMetadata(unsigned KindID) const
Get the current metadata attachments for the given kind, if any.
Definition: Value.h:565
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:97
Metadata node.
Definition: Metadata.h:1069
instr_iterator instr_end()
Instructions::const_iterator const_instr_iterator
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:569
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:346
bool isBundle() const
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Scheduling dependency.
Definition: ScheduleDAG.h:49
Kind getKind() const
Returns an enum value representing the kind of the dependence.
Definition: ScheduleDAG.h:504
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition: ScheduleDAG.h:147
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition: ScheduleDAG.h:142
unsigned getReg() const
Returns the register associated with this edge.
Definition: ScheduleDAG.h:218
const TargetSchedModel & getSchedModel() const
Definition: SIInstrInfo.h:1441
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
std::pair< unsigned, unsigned > getWavesPerEU() const
GCNUserSGPRUsageInfo & getUserSGPRInfo()
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:263
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:390
A ScheduleDAG for scheduling lists of MachineInstr.
const TargetSchedModel * getSchedModel() const
Gets the machine model for instruction scheduling.
bool addEdge(SUnit *SuccSU, const SDep &PredDep)
Add a DAG edge to the given SU with the given predecessor dependence data.
void dumpNode(const SUnit &SU) const override
bool canAddEdge(SUnit *SuccSU, SUnit *PredSU)
True if an edge can be added from PredSU to SuccSU without creating a cycle.
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
Mutate the DAG as a postpass after normal DAG building.
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:579
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
SUnit ExitSU
Special node for the region exit.
Definition: ScheduleDAG.h:581
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:346
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:435
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:367
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:502
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Information about stack frame layout on the target.
TargetInstrInfo - Interface to description of machine instruction set.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
const Triple & getTargetTriple() const
Provide an instruction scheduling machine model to CodeGen passes.
unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const
Compute operand latency based on the available machine model.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:373
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
self_iterator getIterator()
Definition: ilist_node.h:132
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
unsigned getEUsPerCU(const MCSubtargetInfo *STI)
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI, unsigned NumVGPRs)
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves, AMDGPUSubtarget::Generation Gen)
SmallVector< unsigned > getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size)
unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
void apply(Opt *O, const Mod &M, const Mods &... Ms)
Definition: CommandLine.h:1309
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1856
@ Default
The result values are uniform if and only if all operands are uniform.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Define a generic scheduling policy for targets that don't provide their own MachineSchedStrategy.
bool ShouldTrackLaneMasks
Track LaneMasks to allow reordering of independent subregister writes of the same vreg.