GCNSubtarget.cpp
//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "GCNSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSelectionDAGInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "gcn-subtarget"

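// The tablegen-generated target description and subtarget constructor below
// are emitted for a class named after the AMDGPU target (AMDGPUSubtarget);
// the temporary #define redirects those definitions onto GCNSubtarget.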
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(2), cl::Hidden);

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                            StringRef GPU,
                                                            StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

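  // Because the user feature string FS is appended after these defaults (see
  // "FullFS += FS" below), an explicit "-load-store-opt" or similar in FS
  // still overrides the corresponding default added here.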
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
  // default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
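  // For example, an FS that requests "+wavefrontsize32" gets
  // "-wavefrontsize16,-wavefrontsize64," added ahead of it, so only the
  // requested wavefront size remains enabled.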
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
    // Assume wave64 for the unknown target, if not explicitly set.
    if (getWavefrontSizeLog2() == 0)
      WavefrontSizeLog2 = 6;
  } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size, it must be a generation before gfx10:
    // those targets have FeatureWavefrontSize64 in their definition already.
    // For gfx10+, set wave32 as the default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn && AddressableLocalMemorySize == 0)
    AddressableLocalMemorySize = 32768;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);

  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}

const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
  return TSInfo.get();
}

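// Number of scalar operands (SGPRs / literal constants) a single VALU
// instruction may read: one before GFX10, two from GFX10 onwards, except for
// the 64-bit shift opcodes listed below, which remain limited to one.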
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       const SchedRegion &Region) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

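// The post-RA scheduling direction can be overridden per function with the
// "amdgpu-post-ra-direction" attribute; the accepted values are "topdown",
// "bottomup", and "bidirectional". Any other value is diagnosed below.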
void GCNSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy,
                                             const SchedRegion &Region) const {
  const Function &F = Region.RegionBegin->getMF()->getFunction();
  Attribute PostRADirectionAttr = F.getFnAttribute("amdgpu-post-ra-direction");
  if (!PostRADirectionAttr.isValid())
    return;

  StringRef PostRADirectionStr = PostRADirectionAttr.getValueAsString();
  if (PostRADirectionStr == "topdown") {
    Policy.OnlyTopDown = true;
    Policy.OnlyBottomUp = false;
  } else if (PostRADirectionStr == "bottomup") {
    Policy.OnlyTopDown = false;
    Policy.OnlyBottomUp = true;
  } else if (PostRADirectionStr == "bidirectional") {
    Policy.OnlyTopDown = false;
    Policy.OnlyBottomUp = false;
  } else {
    DiagnosticInfoOptimizationFailure Diag(
        F, F.getSubprogram(), "invalid value for postRA direction attribute");
    F.getContext().diagnose(Diag);
  }

  LLVM_DEBUG({
    const char *DirStr = "default";
    if (Policy.OnlyTopDown && !Policy.OnlyBottomUp)
      DirStr = "topdown";
    else if (!Policy.OnlyTopDown && Policy.OnlyBottomUp)
      DirStr = "bottomup";
    else if (!Policy.OnlyTopDown && !Policy.OnlyBottomUp)
      DirStr = "bidirectional";

    dbgs() << "Post-MI-sched direction (" << F.getName() << "): " << DirStr
           << '\n';
  });
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}

unsigned
GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs,
                                       unsigned DynamicVGPRBlockSize) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, VGPRs,
                                                       DynamicVGPRBlockSize);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now, assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                               unsigned NumSGPRs, unsigned NumVGPRs) const {
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature until the
  // latter is removed.
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
  unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
  unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);

  // Maximum occupancy may be further limited by high SGPR/VGPR usage.
  MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
  return {std::min(MinOcc, MaxOcc), MaxOcc};
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  unsigned Requested =
      F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

  if (Requested != MaxNumSGPRs) {
    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
  const auto [Min, Max] = NumVGPRBounds;

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
  if (Requested != Max && hasGFX90AInsts())
    Requested *= 2;

  // Make sure requested value is inside the range of possible VGPR usage.
  return std::clamp(Requested, Min, Max);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is removed.
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
  return getBaseMaxNumVGPRs(
      F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
          getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  return getMaxNumVGPRs(MF.getFunction());
}

std::pair<unsigned, unsigned>
GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
  const unsigned MaxVectorRegs = getMaxNumVGPRs(F);

  unsigned MaxNumVGPRs = MaxVectorRegs;
  unsigned MaxNumAGPRs = 0;
  unsigned NumArchVGPRs = has1024AddressableVGPRs() ? 1024 : 256;

  // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
  // a wave may have up to 512 total vector registers combining together both
  // VGPRs and AGPRs. Hence, in an entry function without calls and without
  // AGPRs used within it, it is possible to use the whole vector register
  // budget for VGPRs.
  //
  // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and
  // split the register file accordingly.
  if (hasGFX90AInsts()) {
    unsigned MinNumAGPRs = 0;
    const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();

    const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};

    // TODO: The lower bound should probably force the number of required
    // registers up, overriding amdgpu-waves-per-eu.
    std::tie(MinNumAGPRs, MaxNumAGPRs) =
        AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR,
                                        /*OnlyFirstRequired=*/true);

    if (MinNumAGPRs == DefaultNumAGPR.first) {
      // Default to splitting half the registers if AGPRs are required.
      MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
    } else {
      // Align to accum_offset's allocation granularity.
      MinNumAGPRs = alignTo(MinNumAGPRs, 4);

      MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
    }

    // Clamp values to be inbounds of our limits, and ensure min <= max.
    MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
    MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);

    MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, NumArchVGPRs);
    MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);

    assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
           MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= NumArchVGPRs &&
           "invalid register counts");
  } else if (hasMAIInsts()) {
    // On gfx908 the number of AGPRs always equals the number of VGPRs.
    MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
  }

  return std::pair(MaxNumVGPRs, MaxNumAGPRs);
}

void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

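  // Bundles have no per-operand latency in the schedule model, so the latency
  // is derived by walking the bundled instructions: for a def bundle, start
  // from the bundled instruction that last writes the register; for a use
  // bundle, shrink the def's latency by the distance to the first bundled
  // reader of the register.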
  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

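// NSA (non-sequential address) threshold resolution order: the
// -amdgpu-nsa-threshold command-line flag wins, then the function's
// "amdgpu-nsa-threshold" attribute, then the option's default. Explicit values
// are clamped to at least 2 addresses, and GFX12+ returns 0 since images no
// longer use the MIMG encoding there.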
unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return NSAThreshold;
}

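// GCNUserSGPRUsageInfo inspects the function's calling convention and its
// "amdgpu-no-*" attributes to decide which preloaded user SGPRs (private
// segment buffer, dispatch ptr/id, queue ptr, kernarg segment ptr, flat
// scratch init, ...) are needed, and tallies how many user SGPRs they consume.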
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      // FlatScratchInit cannot be true for graphics CC if enableFlatScratch()
      // is false.
      (ST.enableFlatScratch() ||
       (!AMDGPU::isGraphics(CC) &&
        !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}