GCNSubtarget.cpp
//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "GCNSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSelectionDAGInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "gcn-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool>
    EnablePowerSched("amdgpu-enable-power-sched",
                     cl::desc("Enable scheduling to minimize mAI power bursts"),
                     cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(2), cl::Hidden);

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                            StringRef GPU,
                                                            StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
    // Assume wave64 for the unknown target, if not explicitly set.
    if (getWavefrontSizeLog2() == 0)
      WavefrontSizeLog2 = 6;
  } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10,
    // these have FeatureWavefrontSize64 in their definition already. For
    // gfx10+ set wave32 as a default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
    WavefrontSizeLog2 = 5;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn && AddressableLocalMemorySize == 0)
    AddressableLocalMemorySize = 32768;

  LocalMemorySize = AddressableLocalMemorySize;
  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);

  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}

const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
  return TSInfo.get();
}

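// Maximum number of scalar (SGPR or literal) operands a single VALU
// instruction may read over the constant bus on this generation: one before
// gfx10, and two on gfx10+ except for the 64-bit shifts listed below.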
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

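// Tune the generic MachineScheduler policy for GCN: always track register
// pressure, schedule in both directions, and track lane masks whenever the
// SI machine scheduler is not in use.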
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
}

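// Number of SGPRs that must be reserved out of the allocation for VCC,
// FLAT_SCRATCH and XNACK; the count depends on the generation and on whether
// flat scratch is in use.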
unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve SGPR pair used for flat_scratch if
  // we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

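// Occupancy (waves per EU) is bounded by the most constraining of the LDS
// budget, the SGPR count and the VGPR count; a zero register count leaves
// that limit out of the calculation.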
unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
      std::min(getMaxWavesPerEU(), getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

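// Refine the latency of data dependencies computed by the generic DAG
// builder: when either endpoint is a bundle, derive the latency from the
// bundled instruction that actually writes or reads the register, and
// recompute the zero latency reported for implicit $vcc_lo operands in
// wave32 mode.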
void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

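// Post-RA DAG mutation that links independent SALU instructions into the
// shadow of long-latency MFMA instructions, so that the shadow is filled
// with scalar work rather than VALU work (see apply() below).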
namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
        if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
          ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
            DAG->canAddEdge(SUv, SU))
          DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts())
      return;
    DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add dependencies
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. That is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit *, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for (; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
            !DAG->canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
                          : nullptr;
}

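// Minimum number of address operands at which an image instruction is encoded
// using the NSA (non-sequential address) form. An explicitly given
// command-line option wins, otherwise the "amdgpu-nsa-threshold" function
// attribute, otherwise the option's default; the result is clamped to at
// least 2.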
unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return NSAThreshold;
}

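// Determine which preloaded user SGPRs (dispatch/queue pointers, kernarg
// segment pointer, flat scratch init, etc.) this function requires, based on
// its calling convention and "amdgpu-no-*" attributes, and total them up.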
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
  // FIXME: Should have analysis or something rather than attribute to detect
  // calls.
  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  // FIXME: This attribute is a hack, we just need an analysis on the function
  // to look for allocas.
  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // TODO: This could be refined a lot. The attribute is a poor way of
  // detecting calls or stack objects that may require it before argument
  // lowering.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}