//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

46 "amdgpu-enable-power-sched",
47 cl::desc("Enable scheduling to minimize mAI power bursts"),
48 cl::init(false));
49
51 "amdgpu-vgpr-index-mode",
52 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
53 cl::init(false));
54
55static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
56 cl::desc("Enable the use of AA during codegen."),
57 cl::init(true));
58
59static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
60 cl::desc("Number of addresses from which to enable MIMG NSA."),
62
64
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). The HSA OS defaults to
  // the first amdgcn target that supports flat addressing; other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  AddressableLocalMemorySize = LocalMemorySize;

  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) ==
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of 0ing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}
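
// Worked example (illustrative numbers only, not taken from any specific
// target): with a 64-wide wavefront, a maximum flat workgroup size of 256
// (4 waves per workgroup), 4 EUs per CU, and 64 KiB of LDS, asking for
// NWaves = 8 gives WorkGroupsPerCU = (8 * 4) / 4 = 8, so each workgroup may
// use at most 65536 / 8 = 8192 bytes of LDS.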

// FIXME: Should return min,max range.
//
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
// be achieved when only the given function is running on the machine, taking
// into account the overall number of wave slots, the (maximum) workgroup
// size, and the per-workgroup LDS allocation size.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves per CU.
  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Number of waves per EU (SIMD).
  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}
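
// Worked example (illustrative numbers only): with 16 KiB of LDS used out of
// 64 KiB, NumGroups = 4; with a maximum workgroup size of 256 and a 64-wide
// wavefront, MaxGroupNumWaves = 4, so MaxWaves = 16 waves per CU, or
// divideCeil(16, 4) = 4 waves per EU, assuming getMaxWorkGroupsPerCU and
// getMaxWavesPerEU do not clamp the result further.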

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
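
// The requested range comes from the "amdgpu-flat-work-group-size" function
// attribute, written as "min,max". For example, in IR (hypothetical kernel
// shown purely for illustration):
//   define amdgpu_kernel void @k() #0 { ... }
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256" }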

std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set default
  // minimum/maximum number of waves per execution unit to values implied by
  // the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
}
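
// The requested range comes from the "amdgpu-waves-per-eu" function attribute,
// e.g. "amdgpu-waves-per-eu"="2,4" asks for between 2 and 4 waves per EU. The
// trailing `true` passed to getIntegerPairAttribute above makes only the first
// value mandatory, so a single value such as "2" constrains just the minimum.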

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}
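
// The metadata queried above is the OpenCL-style reqd_work_group_size node,
// e.g. (illustrative IR):
//   define amdgpu_kernel void @k() !reqd_work_group_size !0 { ... }
//   !0 = !{i32 64, i32 1, i32 1}
// Dim selects the X/Y/Z operand; absent metadata yields UINT_MAX.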

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  APInt Lower{32, MinSize};
  APInt Upper{32, MaxSize};
  if (auto *CI = dyn_cast<CallBase>(I)) {
    ConstantRange Range(Lower, Upper);
    CI->addRangeRetAttr(Range);
  } else {
    MDBuilder MDB(I->getContext());
    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  }
  return true;
}
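
// For example (illustrative): for a call to llvm.amdgcn.workitem.id.x in a
// kernel with a reqd_work_group_size of 64x1x1, this records a [0, 64) range,
// either as a `range(i32 0, 64)` return attribute on the call or as
// `!range !{i32 0, i32 64}` metadata when the queried instruction is not a
// call.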

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}
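
// Worked example (hypothetical signature): for amdgpu_kernel void @k(i32, ptr),
// the i32 occupies bytes 0-3, the 8-byte-aligned pointer is placed at offset 8,
// so ExplicitArgBytes ends up as 16 and MaxAlign as 8.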

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
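
// Continuing the hypothetical example above: with ExplicitArgBytes = 16 and a
// code-object-v5 implicit-arg area of 256 bytes (assuming an 8-byte implicit
// argument pointer alignment), TotalSize becomes alignTo(16, 8) + 256 = 272,
// which is already a multiple of 4.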

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
      std::min(getMaxWavesPerEU(),
               getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}
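
// Usage sketch (assumed values, for illustration only):
//   computeOccupancy(F, /*LDSSize=*/16384, /*NumSGPRs=*/80, /*NumVGPRs=*/128)
// returns the smallest of the LDS-, SGPR- and VGPR-derived wave limits,
// clamped to getMaxWavesPerEU().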

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}
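
// Note: the "amdgpu-num-sgpr" function attribute parsed in getBaseMaxNumSGPRs
// above lets a frontend pin the SGPR budget, e.g. (illustrative)
//   attributes #0 = { "amdgpu-num-sgpr"="32" }
// Out-of-range requests are ignored and the wave-limit-derived default is used.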

static unsigned getMaxNumPreloadedSGPRs() {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
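
// Note: the "amdgpu-num-vgpr" attribute plays the same role for VGPRs, e.g.
// "amdgpu-num-vgpr"="64". On gfx90a-class targets the requested count is
// doubled above to account for the combined VGPR/AGPR register file.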

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Link as many SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
        if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
          ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
            DAG->canAddEdge(SUv, SU))
          DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts())
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
            !DAG->canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
                          : nullptr;
}

unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return 3;
}
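
// The threshold therefore comes from three places, in priority order: the
// -amdgpu-nsa-threshold command-line option, the "amdgpu-nsa-threshold"
// function attribute (e.g. "amdgpu-nsa-threshold"="4"), and finally the
// built-in default of 3; explicit values below 2 are clamped to 2.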

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}

GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
  // FIXME: Should have analysis or something rather than attribute to detect
  // calls.
  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  // FIXME: This attribute is a hack, we just need an analysis on the function
  // to look for allocas.
  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // TODO: This could be refined a lot. The attribute is a poor way of
  // detecting calls or stack objects that may require it before argument
  // lowering.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}

SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
}