LLVM 17.0.0git
AMDGPUSubtarget.cpp
1//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Implements the AMDGPU specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUSubtarget.h"
15#include "AMDGPUCallLowering.h"
17#include "AMDGPULegalizerInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "R600Subtarget.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
28#include "llvm/IR/IntrinsicsR600.h"
29#include "llvm/IR/MDBuilder.h"
31#include <algorithm>
32
33using namespace llvm;
34
35#define DEBUG_TYPE "amdgpu-subtarget"
36
37#define GET_SUBTARGETINFO_TARGET_DESC
38#define GET_SUBTARGETINFO_CTOR
39#define AMDGPUSubtarget GCNSubtarget
40#include "AMDGPUGenSubtargetInfo.inc"
41#undef AMDGPUSubtarget
42
43static cl::opt<bool> EnablePowerSched(
44 "amdgpu-enable-power-sched",
45 cl::desc("Enable scheduling to minimize mAI power bursts"),
46 cl::init(false));
47
48static cl::opt<bool> EnableVGPRIndexMode(
49 "amdgpu-vgpr-index-mode",
50 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
51 cl::init(false));
52
53static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
54 cl::desc("Enable the use of AA during codegen."),
55 cl::init(true));
56
57static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
58 cl::desc("Number of addresses from which to enable MIMG NSA."),
59 cl::init(3), cl::Hidden);
60
61GCNSubtarget::~GCNSubtarget() = default;
62
63GCNSubtarget &
64GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
65 StringRef GPU, StringRef FS) {
66 // Determine default and user-specified characteristics
67 //
68 // We want to be able to turn these off, but making this a subtarget feature
69 // for SI has the unhelpful behavior that it unsets everything else if you
70 // disable it.
71 //
72 // Similarly we want enable-prt-strict-null to be on by default and not to
73 // unset everything else if it is disabled
74
75 SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
76
77 // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
78 if (isAmdHsaOS())
79 FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
80
81 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
82
83 // Disable mutually exclusive bits.
84 if (FS.contains_insensitive("+wavefrontsize")) {
85 if (!FS.contains_insensitive("wavefrontsize16"))
86 FullFS += "-wavefrontsize16,";
87 if (!FS.contains_insensitive("wavefrontsize32"))
88 FullFS += "-wavefrontsize32,";
89 if (!FS.contains_insensitive("wavefrontsize64"))
90 FullFS += "-wavefrontsize64,";
91 }
92
93 FullFS += FS;
94
95 ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
96
97 // Implement the "generic" processors, which acts as the default when no
98 // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
99 // the first amdgcn target that supports flat addressing. Other OSes defaults
100 // to the first amdgcn target.
101 if (Gen == AMDGPUSubtarget::INVALID) {
102 Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
103 : AMDGPUSubtarget::SOUTHERN_ISLANDS;
104 }
105
106 // We don't support FP64 for EG/NI atm.
107 assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
108
109 // Targets must either support 64-bit offsets for MUBUF instructions, and/or
110 // support flat operations, otherwise they cannot access a 64-bit global
111 // address space
112 assert(hasAddr64() || hasFlat());
113 // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
114 // that do not support ADDR64 variants of MUBUF instructions. Such targets
115 // cannot use a 64 bit offset with a MUBUF instruction to access the global
116 // address space
117 if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
118 ToggleFeature(AMDGPU::FeatureFlatForGlobal);
119 FlatForGlobal = true;
120 }
121 // Unless +-flat-for-global is specified, use MUBUF instructions for global
122 // address space access if flat operations are not available.
123 if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
124 ToggleFeature(AMDGPU::FeatureFlatForGlobal);
125 FlatForGlobal = false;
126 }
127
128 // Set defaults if needed.
129 if (MaxPrivateElementSize == 0)
130 MaxPrivateElementSize = 4;
131
132 if (LDSBankCount == 0)
133 LDSBankCount = 32;
134
135 if (TT.getArch() == Triple::amdgcn) {
136 if (LocalMemorySize == 0)
137 LocalMemorySize = 32768;
138
139 // Do something sensible for unspecified target.
140 if (!HasMovrel && !HasVGPRIndexMode)
141 HasMovrel = true;
142 }
143
144 AddressableLocalMemorySize = LocalMemorySize;
145
146 if (AMDGPU::isGFX10Plus(*this) &&
147 !getFeatureBits().test(AMDGPU::FeatureCuMode))
148 LocalMemorySize *= 2;
149
150 // Don't crash on invalid devices.
151 if (WavefrontSizeLog2 == 0)
152 WavefrontSizeLog2 = 5;
153
156
157 TargetID.setTargetIDFromFeaturesString(FS);
158
159 LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
160 << TargetID.getXnackSetting() << '\n');
161 LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
162 << TargetID.getSramEccSetting() << '\n');
163
164 return *this;
165}
166
167AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}
168
169GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
170 const GCNTargetMachine &TM)
171 : // clang-format off
172 AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
173 AMDGPUSubtarget(TT),
174 TargetTriple(TT),
175 TargetID(*this),
176 InstrItins(getInstrItineraryForCPU(GPU)),
177 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
178 TLInfo(TM, *this),
179 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
180 // clang-format on
181 MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
182 EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);
183 CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
184 InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
185 Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
186 RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
187 InstSelector.reset(new AMDGPUInstructionSelector(
188 *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
189}
190
191unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
192 if (getGeneration() < GFX10)
193 return 1;
194
195 switch (Opcode) {
196 case AMDGPU::V_LSHLREV_B64_e64:
197 case AMDGPU::V_LSHLREV_B64_gfx10:
198 case AMDGPU::V_LSHLREV_B64_e64_gfx11:
199 case AMDGPU::V_LSHL_B64_e64:
200 case AMDGPU::V_LSHRREV_B64_e64:
201 case AMDGPU::V_LSHRREV_B64_gfx10:
202 case AMDGPU::V_LSHRREV_B64_e64_gfx11:
203 case AMDGPU::V_LSHR_B64_e64:
204 case AMDGPU::V_ASHRREV_I64_e64:
205 case AMDGPU::V_ASHRREV_I64_gfx10:
206 case AMDGPU::V_ASHRREV_I64_e64_gfx11:
207 case AMDGPU::V_ASHR_I64_e64:
208 return 1;
209 }
210
211 return 2;
212}
213
214/// This list was mostly derived from experimentation.
215bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
216 switch (Opcode) {
217 case AMDGPU::V_CVT_F16_F32_e32:
218 case AMDGPU::V_CVT_F16_F32_e64:
219 case AMDGPU::V_CVT_F16_U16_e32:
220 case AMDGPU::V_CVT_F16_U16_e64:
221 case AMDGPU::V_CVT_F16_I16_e32:
222 case AMDGPU::V_CVT_F16_I16_e64:
223 case AMDGPU::V_RCP_F16_e64:
224 case AMDGPU::V_RCP_F16_e32:
225 case AMDGPU::V_RSQ_F16_e64:
226 case AMDGPU::V_RSQ_F16_e32:
227 case AMDGPU::V_SQRT_F16_e64:
228 case AMDGPU::V_SQRT_F16_e32:
229 case AMDGPU::V_LOG_F16_e64:
230 case AMDGPU::V_LOG_F16_e32:
231 case AMDGPU::V_EXP_F16_e64:
232 case AMDGPU::V_EXP_F16_e32:
233 case AMDGPU::V_SIN_F16_e64:
234 case AMDGPU::V_SIN_F16_e32:
235 case AMDGPU::V_COS_F16_e64:
236 case AMDGPU::V_COS_F16_e32:
237 case AMDGPU::V_FLOOR_F16_e64:
238 case AMDGPU::V_FLOOR_F16_e32:
239 case AMDGPU::V_CEIL_F16_e64:
240 case AMDGPU::V_CEIL_F16_e32:
241 case AMDGPU::V_TRUNC_F16_e64:
242 case AMDGPU::V_TRUNC_F16_e32:
243 case AMDGPU::V_RNDNE_F16_e64:
244 case AMDGPU::V_RNDNE_F16_e32:
245 case AMDGPU::V_FRACT_F16_e64:
246 case AMDGPU::V_FRACT_F16_e32:
247 case AMDGPU::V_FREXP_MANT_F16_e64:
248 case AMDGPU::V_FREXP_MANT_F16_e32:
249 case AMDGPU::V_FREXP_EXP_I16_F16_e64:
250 case AMDGPU::V_FREXP_EXP_I16_F16_e32:
251 case AMDGPU::V_LDEXP_F16_e64:
252 case AMDGPU::V_LDEXP_F16_e32:
253 case AMDGPU::V_LSHLREV_B16_e64:
254 case AMDGPU::V_LSHLREV_B16_e32:
255 case AMDGPU::V_LSHRREV_B16_e64:
256 case AMDGPU::V_LSHRREV_B16_e32:
257 case AMDGPU::V_ASHRREV_I16_e64:
258 case AMDGPU::V_ASHRREV_I16_e32:
259 case AMDGPU::V_ADD_U16_e64:
260 case AMDGPU::V_ADD_U16_e32:
261 case AMDGPU::V_SUB_U16_e64:
262 case AMDGPU::V_SUB_U16_e32:
263 case AMDGPU::V_SUBREV_U16_e64:
264 case AMDGPU::V_SUBREV_U16_e32:
265 case AMDGPU::V_MUL_LO_U16_e64:
266 case AMDGPU::V_MUL_LO_U16_e32:
267 case AMDGPU::V_ADD_F16_e64:
268 case AMDGPU::V_ADD_F16_e32:
269 case AMDGPU::V_SUB_F16_e64:
270 case AMDGPU::V_SUB_F16_e32:
271 case AMDGPU::V_SUBREV_F16_e64:
272 case AMDGPU::V_SUBREV_F16_e32:
273 case AMDGPU::V_MUL_F16_e64:
274 case AMDGPU::V_MUL_F16_e32:
275 case AMDGPU::V_MAX_F16_e64:
276 case AMDGPU::V_MAX_F16_e32:
277 case AMDGPU::V_MIN_F16_e64:
278 case AMDGPU::V_MIN_F16_e32:
279 case AMDGPU::V_MAX_U16_e64:
280 case AMDGPU::V_MAX_U16_e32:
281 case AMDGPU::V_MIN_U16_e64:
282 case AMDGPU::V_MIN_U16_e32:
283 case AMDGPU::V_MAX_I16_e64:
284 case AMDGPU::V_MAX_I16_e32:
285 case AMDGPU::V_MIN_I16_e64:
286 case AMDGPU::V_MIN_I16_e32:
287 case AMDGPU::V_MAD_F16_e64:
288 case AMDGPU::V_MAD_U16_e64:
289 case AMDGPU::V_MAD_I16_e64:
290 case AMDGPU::V_FMA_F16_e64:
291 case AMDGPU::V_DIV_FIXUP_F16_e64:
292 // On gfx10, all 16-bit instructions preserve the high bits.
293 return getGeneration() <= AMDGPUSubtarget::GFX9;
294 case AMDGPU::V_MADAK_F16:
295 case AMDGPU::V_MADMK_F16:
296 case AMDGPU::V_MAC_F16_e64:
297 case AMDGPU::V_MAC_F16_e32:
298 case AMDGPU::V_FMAMK_F16:
299 case AMDGPU::V_FMAAK_F16:
300 case AMDGPU::V_FMAC_F16_e64:
301 case AMDGPU::V_FMAC_F16_e32:
302 // In gfx9, the preferred handling of the unused high 16-bits changed. Most
303 // instructions maintain the legacy behavior of 0ing. Some instructions
304 // changed to preserving the high bits.
305 return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
306 case AMDGPU::V_MAD_MIXLO_F16:
307 case AMDGPU::V_MAD_MIXHI_F16:
308 default:
309 return false;
310 }
311}
312
313// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
314// allows the given function to achieve an occupancy of NWaves waves per
315// SIMD / EU, taking into account only the function's *maximum* workgroup size.
316unsigned
317AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
318 const Function &F) const {
319 const unsigned WaveSize = getWavefrontSize();
320 const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
321 const unsigned WavesPerWorkgroup =
322 std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);
323
324 const unsigned WorkGroupsPerCU =
325 std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);
326
327 return getLocalMemorySize() / WorkGroupsPerCU;
328}
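// Worked example, assuming a wave64 subtarget with 4 EUs per CU and 64 KiB of
// LDS (representative values; the real figures come from the subtarget): a
// kernel whose maximum flat workgroup size is 256 needs ceil(256/64) = 4 waves
// per workgroup. Sustaining NWaves = 8 waves per EU then requires
// WorkGroupsPerCU = max(1, 8 * 4 / 4) = 8, so each workgroup may allocate at
// most 65536 / 8 = 8192 bytes of LDS.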
329
330// FIXME: Should return min,max range.
331//
332// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
333// be achieved when only the given function is running on the machine; and
334// taking into account the overall number of wave slots, the (maximum) workgroup
335// size, and the per-workgroup LDS allocation size.
336unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
337 const Function &F) const {
338 const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
339 const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
340 if (!MaxWorkGroupsPerCu)
341 return 0;
342
343 const unsigned WaveSize = getWavefrontSize();
344
345 // FIXME: Do we need to account for alignment requirement of LDS rounding the
346 // size up?
347 // Compute restriction based on LDS usage
348 unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
349
350 // This can be queried with more LDS than is possible, so just assume the
351 // worst.
352 if (NumGroups == 0)
353 return 1;
354
355 NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
356
357 // Round to the number of waves per CU.
358 const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
359 unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
360
361 // Number of waves per EU (SIMD).
362 MaxWaves = divideCeil(MaxWaves, getEUsPerCU());
363
364 // Clamp to the maximum possible number of waves.
365 MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
366
367 // FIXME: Needs to be a multiple of the group size?
368 //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
369
370 assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
371 "computed invalid occupancy");
372 return MaxWaves;
373}
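// Continuing the example above with Bytes = 16384: NumGroups = 65536 / 16384
// = 4 workgroups fit in LDS, each covering ceil(256/64) = 4 waves, giving
// 16 waves per CU, i.e. 16 / 4 = 4 waves per EU before the final clamp to
// getMaxWavesPerEU().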
374
375unsigned
376AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
377 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
378 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
379}
380
381std::pair<unsigned, unsigned>
382AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
383 switch (CC) {
384 case CallingConv::AMDGPU_VS:
385 case CallingConv::AMDGPU_LS:
386 case CallingConv::AMDGPU_HS:
387 case CallingConv::AMDGPU_ES:
388 case CallingConv::AMDGPU_GS:
389 case CallingConv::AMDGPU_PS:
390 return std::pair(1, getWavefrontSize());
391 default:
392 return std::pair(1u, getMaxFlatWorkGroupSize());
393 }
394}
395
396std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
397 const Function &F) const {
398 // Default minimum/maximum flat work group sizes.
399 std::pair<unsigned, unsigned> Default =
400 getDefaultFlatWorkGroupSize(F.getCallingConv());
401
402 // Requested minimum/maximum flat work group sizes.
403 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
404 F, "amdgpu-flat-work-group-size", Default);
405
406 // Make sure requested minimum is less than requested maximum.
407 if (Requested.first > Requested.second)
408 return Default;
409
410 // Make sure requested values do not violate subtarget's specifications.
411 if (Requested.first < getMinFlatWorkGroupSize())
412 return Default;
413 if (Requested.second > getMaxFlatWorkGroupSize())
414 return Default;
415
416 return Requested;
417}
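// The requested range comes from IR such as
//   attributes #0 = { "amdgpu-flat-work-group-size"="64,256" }
// (values here are only illustrative). A pair whose minimum exceeds its
// maximum, or which falls outside the subtarget's supported range, is ignored
// and the calling-convention default is used instead.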
418
419std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
420 const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
421 // Default minimum/maximum number of waves per execution unit.
422 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
423
424 // If minimum/maximum flat work group sizes were explicitly requested using
425 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
426 // number of waves per execution unit to values implied by requested
427 // minimum/maximum flat work group sizes.
428 unsigned MinImpliedByFlatWorkGroupSize =
429 getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
430 Default.first = MinImpliedByFlatWorkGroupSize;
431
432 // Requested minimum/maximum number of waves per execution unit.
433 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
434 F, "amdgpu-waves-per-eu", Default, true);
435
436 // Make sure requested minimum is less than requested maximum.
437 if (Requested.second && Requested.first > Requested.second)
438 return Default;
439
440 // Make sure requested values do not violate subtarget's specifications.
441 if (Requested.first < getMinWavesPerEU() ||
442 Requested.second > getMaxWavesPerEU())
443 return Default;
444
445 // Make sure requested values are compatible with values implied by requested
446 // minimum/maximum flat work group sizes.
447 if (Requested.first < MinImpliedByFlatWorkGroupSize)
448 return Default;
449
450 return Requested;
451}
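// For instance, a kernel carrying "amdgpu-waves-per-eu"="2,4" (illustrative
// values; the second number may be omitted) requests 2 to 4 waves per EU. The
// request falls back to the default if it conflicts with the subtarget limits
// or with the minimum implied by the flat workgroup size handled above.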
452
453static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
454 auto Node = Kernel.getMetadata("reqd_work_group_size");
455 if (Node && Node->getNumOperands() == 3)
456 return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
457 return std::numeric_limits<unsigned>::max();
458}
459
460bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
461 return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
462}
463
464unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
465 unsigned Dimension) const {
466 unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
467 if (ReqdSize != std::numeric_limits<unsigned>::max())
468 return ReqdSize - 1;
469 return getFlatWorkGroupSizes(Kernel).second - 1;
470}
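// E.g. a kernel annotated with a reqd_work_group_size of <64, 1, 1>
// (illustrative) reports a maximum workitem ID of 63 in dimension 0 and 0 in
// dimensions 1 and 2; without that metadata the bound falls back to the
// maximum flat workgroup size minus 1.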
471
472bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
473 Function *Kernel = I->getParent()->getParent();
474 unsigned MinSize = 0;
475 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
476 bool IdQuery = false;
477
478 // If reqd_work_group_size is present it narrows value down.
479 if (auto *CI = dyn_cast<CallInst>(I)) {
480 const Function *F = CI->getCalledFunction();
481 if (F) {
482 unsigned Dim = UINT_MAX;
483 switch (F->getIntrinsicID()) {
484 case Intrinsic::amdgcn_workitem_id_x:
485 case Intrinsic::r600_read_tidig_x:
486 IdQuery = true;
487 [[fallthrough]];
488 case Intrinsic::r600_read_local_size_x:
489 Dim = 0;
490 break;
491 case Intrinsic::amdgcn_workitem_id_y:
492 case Intrinsic::r600_read_tidig_y:
493 IdQuery = true;
494 [[fallthrough]];
495 case Intrinsic::r600_read_local_size_y:
496 Dim = 1;
497 break;
498 case Intrinsic::amdgcn_workitem_id_z:
499 case Intrinsic::r600_read_tidig_z:
500 IdQuery = true;
501 [[fallthrough]];
502 case Intrinsic::r600_read_local_size_z:
503 Dim = 2;
504 break;
505 default:
506 break;
507 }
508
509 if (Dim <= 3) {
510 unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
511 if (ReqdSize != std::numeric_limits<unsigned>::max())
512 MinSize = MaxSize = ReqdSize;
513 }
514 }
515 }
516
517 if (!MaxSize)
518 return false;
519
520 // Range metadata is [Lo, Hi). For ID query we need to pass max size
521 // as Hi. For size query we need to pass Hi + 1.
522 if (IdQuery)
523 MinSize = 0;
524 else
525 ++MaxSize;
526
527 MDBuilder MDB(I->getContext());
528 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
529 APInt(32, MaxSize));
530 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
531 return true;
532}
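// Example (illustrative IR): with !reqd_work_group_size !{i32 256, i32 1, i32 1}
// attached to the kernel, an ID query such as llvm.amdgcn.workitem.id.x gets
// !range !{i32 0, i32 256} (IDs run 0..255), while a size query such as
// llvm.r600.read.local.size.x gets !range !{i32 256, i32 257} (the size is
// exactly 256).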
533
534unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
535 assert(AMDGPU::isKernel(F.getCallingConv()));
536
537 // We don't allocate the segment if we know the implicit arguments weren't
538 // used, even if the ABI implies we need them.
539 if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
540 return 0;
541
542 if (isMesaKernel(F))
543 return 16;
544
545 // Assume all implicit inputs are used by default
546 const Module *M = F.getParent();
547 unsigned NBytes = AMDGPU::getCodeObjectVersion(*M) >= 5 ? 256 : 56;
548 return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
549 NBytes);
550}
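// A front end can shrink this with, e.g.,
//   attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
// (illustrative value), and marking the kernel "amdgpu-no-implicitarg-ptr"
// drops the segment entirely, as handled above.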
551
552uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
553 Align &MaxAlign) const {
554 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
555 F.getCallingConv() == CallingConv::SPIR_KERNEL);
556
557 const DataLayout &DL = F.getParent()->getDataLayout();
558 uint64_t ExplicitArgBytes = 0;
559 MaxAlign = Align(1);
560
561 for (const Argument &Arg : F.args()) {
562 const bool IsByRef = Arg.hasByRefAttr();
563 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
564 Align Alignment = DL.getValueOrABITypeAlignment(
565 IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
566 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
567 ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
568 MaxAlign = std::max(MaxAlign, Alignment);
569 }
570
571 return ExplicitArgBytes;
572}
573
574unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
575 Align &MaxAlign) const {
576 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
577
578 unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
579
580 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
581 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
582 if (ImplicitBytes != 0) {
583 const Align Alignment = getAlignmentForImplicitArgPtr();
584 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
585 MaxAlign = std::max(MaxAlign, Alignment);
586 }
587
588 // Being able to dereference past the end is useful for emitting scalar loads.
589 return alignTo(TotalSize, 4);
590}
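// For example, an HSA kernel taking (ptr, i32) has ExplicitArgBytes = 12 and
// MaxAlign = 8; with code object v5 and an assumed 8-byte implicit-arg
// alignment, the implicit segment adds 256 bytes, giving
// alignTo(12, 8) + 256 = 272 bytes, already a multiple of 4.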
591
592AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
593 return getWavefrontSize() == 64 ? AMDGPUDwarfFlavour::Wave64
594 : AMDGPUDwarfFlavour::Wave32;
595}
596
597void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
598 unsigned NumRegionInstrs) const {
599 // Track register pressure so the scheduler can try to decrease
600 // pressure once register usage is above the threshold defined by
601 // SIRegisterInfo::getRegPressureSetLimit()
602 Policy.ShouldTrackPressure = true;
603
604 // Enabling both top down and bottom up scheduling seems to give us less
605 // register spills than just using one of these approaches on its own.
606 Policy.OnlyTopDown = false;
607 Policy.OnlyBottomUp = false;
608
609 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
610 if (!enableSIScheduler())
611 Policy.ShouldTrackLaneMasks = true;
612}
613
614bool GCNSubtarget::hasMadF16() const {
615 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
616}
617
618bool GCNSubtarget::useVGPRIndexMode() const {
619 return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
620}
621
622bool GCNSubtarget::useAA() const { return UseAA; }
623
624unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
625 if (getGeneration() >= AMDGPUSubtarget::GFX10)
626 return getMaxWavesPerEU();
627
628 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
629 if (SGPRs <= 80)
630 return 10;
631 if (SGPRs <= 88)
632 return 9;
633 if (SGPRs <= 100)
634 return 8;
635 return 7;
636 }
637 if (SGPRs <= 48)
638 return 10;
639 if (SGPRs <= 56)
640 return 9;
641 if (SGPRs <= 64)
642 return 8;
643 if (SGPRs <= 72)
644 return 7;
645 if (SGPRs <= 80)
646 return 6;
647 return 5;
648}
649
650unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
651 return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
652}
653
654unsigned
655GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
656 if (getGeneration() >= AMDGPUSubtarget::GFX10)
657 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
658
659 if (HasFlatScratch || HasArchitectedFlatScratch) {
660 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
661 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
662 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
663 return 4; // FLAT_SCRATCH, VCC (in that order).
664 }
665
666 if (isXNACKEnabled())
667 return 4; // XNACK, VCC (in that order).
668 return 2; // VCC.
669}
670
671unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
672 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
673 return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
674}
675
676unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
677 // In principle we do not need to reserve SGPR pair used for flat_scratch if
678 // we know flat instructions do not access the stack anywhere in the
679 // program. For now assume it's needed if we have flat instructions.
680 const bool KernelUsesFlatScratch = hasFlatAddressSpace();
681 return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
682}
683
684unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
685 unsigned NumSGPRs,
686 unsigned NumVGPRs) const {
687 unsigned Occupancy =
688 std::min(getMaxWavesPerEU(),
689 getOccupancyWithLocalMemSize(LDSSize, F));
690 if (NumSGPRs)
691 Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
692 if (NumVGPRs)
693 Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
694 return Occupancy;
695}
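// E.g. a kernel using 16 KiB of LDS, 96 SGPRs and 64 VGPRs on a gfx9-like
// subtarget (illustrative numbers): LDS limits it to 4 waves per EU (as in the
// example after getOccupancyWithLocalMemSize), 96 SGPRs allow 8 waves, and
// 64 VGPRs allow 4 waves out of a 256-VGPR file, so the result is
// min(10, 4, 8, 4) = 4 waves per EU.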
696
697unsigned GCNSubtarget::getBaseMaxNumSGPRs(
698 const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
699 unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
700 // Compute maximum number of SGPRs function can use using default/requested
701 // minimum number of waves per execution unit.
702 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
703 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
704
705 // Check if maximum number of SGPRs was explicitly requested using
706 // "amdgpu-num-sgpr" attribute.
707 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
708 unsigned Requested =
709 F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);
710
711 // Make sure requested value does not violate subtarget's specifications.
712 if (Requested && (Requested <= ReservedNumSGPRs))
713 Requested = 0;
714
715 // If more SGPRs are required to support the input user/system SGPRs,
716 // increase to accommodate them.
717 //
718 // FIXME: This really ends up using the requested number of SGPRs + number
719 // of reserved special registers in total. Theoretically you could re-use
720 // the last input registers for these special registers, but this would
721 // require a lot of complexity to deal with the weird aliasing.
722 unsigned InputNumSGPRs = PreloadedSGPRs;
723 if (Requested && Requested < InputNumSGPRs)
724 Requested = InputNumSGPRs;
725
726 // Make sure requested value is compatible with values implied by
727 // default/requested minimum/maximum number of waves per execution unit.
728 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
729 Requested = 0;
730 if (WavesPerEU.second &&
731 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
732 Requested = 0;
733
734 if (Requested)
735 MaxNumSGPRs = Requested;
736 }
737
738 if (hasSGPRInitBug())
739 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
740
741 return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
742}
743
744unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
745 const Function &F = MF.getFunction();
746 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
747 return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
748 getReservedNumSGPRs(MF));
749}
750
751static unsigned getMaxNumPreloadedSGPRs() {
752 // Max number of user SGPRs
753 unsigned MaxUserSGPRs = 4 + // private segment buffer
754 2 + // Dispatch ptr
755 2 + // queue ptr
756 2 + // kernel segment ptr
757 2 + // dispatch ID
758 2 + // flat scratch init
759 2; // Implicit buffer ptr
760
761 // Max number of system SGPRs
762 unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
763 1 + // WorkGroupIDY
764 1 + // WorkGroupIDZ
765 1 + // WorkGroupInfo
766 1; // private segment wave byte offset
767
768 // Max number of synthetic SGPRs
769 unsigned SyntheticSGPRs = 1; // LDSKernelId
770
771 return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
772}
773
774unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
775 return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
776 getReservedNumSGPRs(F));
777}
778
779unsigned GCNSubtarget::getBaseMaxNumVGPRs(
780 const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
781 // Compute maximum number of VGPRs function can use using default/requested
782 // minimum number of waves per execution unit.
783 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
784
785 // Check if maximum number of VGPRs was explicitly requested using
786 // "amdgpu-num-vgpr" attribute.
787 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
788 unsigned Requested =
789 F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);
790
791 if (hasGFX90AInsts())
792 Requested *= 2;
793
794 // Make sure requested value is compatible with values implied by
795 // default/requested minimum/maximum number of waves per execution unit.
796 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
797 Requested = 0;
798 if (WavesPerEU.second &&
799 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
800 Requested = 0;
801
802 if (Requested)
803 MaxNumVGPRs = Requested;
804 }
805
806 return MaxNumVGPRs;
807}
808
809unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
810 return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
811}
812
813unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
814 const Function &F = MF.getFunction();
815 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
816 return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
817}
818
819void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
820 int UseOpIdx, SDep &Dep) const {
821 if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
822 !Def->isInstr() || !Use->isInstr())
823 return;
824
825 MachineInstr *DefI = Def->getInstr();
826 MachineInstr *UseI = Use->getInstr();
827
828 if (DefI->isBundle()) {
829 const SIRegisterInfo *TRI = getRegisterInfo();
830 auto Reg = Dep.getReg();
831 MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
832 MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
833 unsigned Lat = 0;
834 for (++I; I != E && I->isBundledWithPred(); ++I) {
835 if (I->modifiesRegister(Reg, TRI))
836 Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
837 else if (Lat)
838 --Lat;
839 }
840 Dep.setLatency(Lat);
841 } else if (UseI->isBundle()) {
842 const SIRegisterInfo *TRI = getRegisterInfo();
843 auto Reg = Dep.getReg();
844 MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
845 MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
846 unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
847 for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
848 if (I->readsRegister(Reg, TRI))
849 break;
850 --Lat;
851 }
852 Dep.setLatency(Lat);
853 } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
854 // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
855 // implicit operands which come from the MCInstrDesc, which can fool
856 // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
857 // pseudo operands.
858 Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
859 DefI, DefOpIdx, UseI, UseOpIdx));
860 }
861}
862
863namespace {
864struct FillMFMAShadowMutation : ScheduleDAGMutation {
865 const SIInstrInfo *TII;
866
867 ScheduleDAGMI *DAG;
868
869 FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
870
871 bool isSALU(const SUnit *SU) const {
872 const MachineInstr *MI = SU->getInstr();
873 return MI && TII->isSALU(*MI) && !MI->isTerminator();
874 }
875
876 bool isVALU(const SUnit *SU) const {
877 const MachineInstr *MI = SU->getInstr();
878 return MI && TII->isVALU(*MI);
879 }
880
881 // Link as many SALU instructions in chain as possible. Return the size
882 // of the chain. Links up to MaxChain instructions.
883 unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
884 SmallPtrSetImpl<SUnit *> &Visited) const {
885 SmallVector<SUnit *, 8> Worklist({To});
886 unsigned Linked = 0;
887
888 while (!Worklist.empty() && MaxChain-- > 0) {
889 SUnit *SU = Worklist.pop_back_val();
890 if (!Visited.insert(SU).second)
891 continue;
892
893 LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
894 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
895
896 if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
897 if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
898 ++Linked;
899
900 for (SDep &SI : From->Succs) {
901 SUnit *SUv = SI.getSUnit();
902 if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
903 DAG->canAddEdge(SUv, SU))
904 DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
905 }
906
907 for (SDep &SI : SU->Succs) {
908 SUnit *Succ = SI.getSUnit();
909 if (Succ != SU && isSALU(Succ))
910 Worklist.push_back(Succ);
911 }
912 }
913
914 return Linked;
915 }
916
917 void apply(ScheduleDAGInstrs *DAGInstrs) override {
918 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
919 if (!ST.hasMAIInsts())
920 return;
921 DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
922 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
923 if (!TSchedModel || DAG->SUnits.empty())
924 return;
925
926 // Scan for MFMA long latency instructions and try to add a dependency
927 // of available SALU instructions to give them a chance to fill MFMA
928 // shadow. That is desirable to fill MFMA shadow with SALU instructions
929 // rather than VALU to prevent power consumption bursts and throttle.
930 auto LastSALU = DAG->SUnits.begin();
931 auto E = DAG->SUnits.end();
932 SmallPtrSet<SUnit *, 32> Visited;
933 for (SUnit &SU : DAG->SUnits) {
934 MachineInstr &MAI = *SU.getInstr();
935 if (!TII->isMAI(MAI) ||
936 MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
937 MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
938 continue;
939
940 unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
941
942 LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
943 dbgs() << "Need " << Lat
944 << " instructions to cover latency.\n");
945
946 // Find up to Lat independent scalar instructions as early as
947 // possible such that they can be scheduled after this MFMA.
948 for ( ; Lat && LastSALU != E; ++LastSALU) {
949 if (Visited.count(&*LastSALU))
950 continue;
951
952 if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
953 !DAG->canAddEdge(&*LastSALU, &SU))
954 continue;
955
956 Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
957 }
958 }
959 }
960};
961} // namespace
962
963void GCNSubtarget::getPostRAMutations(
964 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
965 Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
966}
967
968std::unique_ptr<ScheduleDAGMutation>
969GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
970 return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
971 : nullptr;
972}
973
974unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
975 if (NSAThreshold.getNumOccurrences() > 0)
976 return std::max(NSAThreshold.getValue(), 2u);
977
978 int Value = MF.getFunction().getFnAttributeAsParsedInteger(
979 "amdgpu-nsa-threshold", -1);
980 if (Value > 0)
981 return std::max(Value, 2);
982
983 return 3;
984}
985
986const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
987 if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
988 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
989 else
990 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
991}
992
993const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
994 if (TM.getTargetTriple().getArch() == Triple::amdgcn)
995 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
996 else
997 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
998}