AMDGPUSubtarget.cpp (LLVM 10.0.0svn)
//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));
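// The option registers an ordinary llc/opt command-line toggle; a hypothetical
// invocation (any module targeting an MAI-capable GPU such as gfx908 works):
//   llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-disable-power-sched foo.ll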

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }
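  // For example, a feature string containing "+wavefrontsize32" (and no other
  // size) gets "-wavefrontsize16,-wavefrontsize64," added to FullFS ahead of
  // the user string below, so only the explicitly requested size survives.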

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
  AMDGPUGenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  TargetTriple(TT),
  Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
  InstrItins(getInstrItineraryForCPU(GPU)),
  LDSBankCount(0),
  MaxPrivateElementSize(0),

  FastFMAF32(false),
  HalfRate64Ops(false),

  FP64FP16Denormals(false),
  FlatForGlobal(false),
  AutoWaitcntBeforeBarrier(false),
  CodeObjectV3(false),
  UnalignedScratchAccess(false),
  UnalignedBufferAccess(false),

  HasApertureRegs(false),
  EnableXNACK(false),
  DoesNotSupportXNACK(false),
  EnableCuMode(false),
  TrapHandler(false),

  EnableLoadStoreOpt(false),
  EnableUnsafeDSOffsetFolding(false),
  EnableSIScheduler(false),
  EnableDS128(false),
  EnablePRTStrictNull(false),
  DumpCode(false),

  FP64(false),
  GCN3Encoding(false),
  CIInsts(false),
  GFX8Insts(false),
  GFX9Insts(false),
  GFX10Insts(false),
  GFX7GFX8GFX9Insts(false),
  SGPRInitBug(false),
  HasSMemRealTime(false),
  HasIntClamp(false),
  HasFmaMixInsts(false),
  HasMovrel(false),
  HasVGPRIndexMode(false),
  HasScalarStores(false),
  HasScalarAtomics(false),
  HasSDWAOmod(false),
  HasSDWAScalar(false),
  HasSDWASdst(false),
  HasSDWAMac(false),
  HasSDWAOutModsVOPC(false),
  HasDPP(false),
  HasDPP8(false),
  HasR128A16(false),
  HasNSAEncoding(false),
  HasDLInsts(false),
  HasDot1Insts(false),
  HasDot2Insts(false),
  HasDot3Insts(false),
  HasDot4Insts(false),
  HasDot5Insts(false),
  HasDot6Insts(false),
  HasMAIInsts(false),
  HasPkFmacF16Inst(false),
  HasAtomicFaddInsts(false),
  EnableSRAMECC(false),
  DoesNotSupportSRAMECC(false),
  HasNoSdstCMPX(false),
  HasVscnt(false),
  HasRegisterBanking(false),
  HasVOP3Literal(false),
  HasNoDataDepHazard(false),
  FlatAddressSpace(false),
  FlatInstOffsets(false),
  FlatGlobalInsts(false),
  FlatScratchInsts(false),
  ScalarFlatScratchInsts(false),
  AddNoCarryInsts(false),
  HasUnpackedD16VMem(false),
  LDSMisalignedBug(false),
  HasMFMAInlineLiteralBug(false),

  ScalarizeGlobal(false),

  HasVcmpxPermlaneHazard(false),
  HasVMEMtoScalarWriteHazard(false),
  HasSMEMtoVectorWriteHazard(false),
  HasInstFwdPrefetchBug(false),
  HasVcmpxExecWARHazard(false),
  HasLdsBranchVmemWARHazard(false),
  HasNSAtoVMEMBug(false),
  HasOffset3fBug(false),
  HasFlatSegmentOffsetBug(false),

  FeatureDisable(false),
  InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
  TLInfo(TM, *this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

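  // All other opcodes on GFX10 may read two distinct constant-bus operands
  // (e.g. two different SGPRs); pre-GFX10 hardware and the 64-bit shifts
  // above remain limited to one.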
  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}
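// For example, with the default 32768 bytes of LDS, MaxWaves = 10, and a
// work-group size that yields (say) 8 work groups per CU, Limit is
// 32768 * 10 / 8 = 40960, so a kernel using 8192 bytes of LDS gets
// min(40960 / 8192, 10) = 5 waves.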

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
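// For example, a kernel carrying the IR attribute
//   "amdgpu-flat-work-group-size"="64,256"
// yields {64, 256} here, as long as that range stays within the subtarget's
// [getMinFlatWorkGroupSize(), getMaxFlatWorkGroupSize()] limits; otherwise the
// calling-convention default is returned.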

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
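// For example, "amdgpu-waves-per-eu"="2,4" requests an occupancy between 2 and
// 4 waves per EU; the request is honored only if it fits the subtarget's
// [getMinWavesPerEU(), getMaxWavesPerEU()] range and any minimum implied by an
// explicitly requested flat work group size.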

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
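// For example, for a kernel whose maximum flat work group size is 256, an ID
// query such as
//   %id = call i32 @llvm.amdgcn.workitem.id.x()
// is annotated with !range !{i32 0, i32 256} (IDs lie in [0, 256)), while a
// local-size query gets !range !{i32 0, i32 257} because the size itself can
// equal 256.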

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
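// For example, a kernel taking (float addrspace(1)* out, i32 n) accumulates
// ExplicitArgBytes = alignTo(0, 8) + 8 = 8 for the pointer, then
// alignTo(8, 4) + 4 = 12 for the i32, with MaxAlign = 8; with a zero explicit
// offset and no implicit arguments the segment size is alignTo(12, 4) = 12.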

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}
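// For example, assuming a 4-register allocation granule, 256 total VGPRs, and
// MaxWaves = 10, a kernel using 70 VGPRs is rounded up to 72, giving
// min(max(256 / 72, 1), 10) = 3 waves.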

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
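// For example, attaching "amdgpu-num-sgpr"="48" to a function caps MaxNumSGPRs
// at 48, provided 48 exceeds the reserved SGPR count, covers the preloaded
// input SGPRs, and is consistent with the requested waves-per-EU range; the
// "amdgpu-num-vgpr" attribute below is handled analogously.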

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}