LLVM  9.0.0svn
AMDGPUSubtarget.cpp
Go to the documentation of this file.
1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
23 #include "llvm/ADT/SmallString.h"
26 #include "llvm/IR/MDBuilder.h"
28 #include <algorithm>
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "amdgpu-subtarget"
33 
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
42 
// Out-of-line defaulted destructor. The constructor resets several
// std::unique_ptr members (CallLoweringInfo, Legalizer, RegBankInfo,
// InstSelector); defining the destructor here — presumably where those
// pointee types are complete — lets the header forward-declare them.
// NOTE(review): confirm the forward-declaration rationale against the header.
GCNSubtarget::~GCNSubtarget() = default;
44 
47  StringRef GPU, StringRef FS) {
48  SmallString<256> FullFS("+promote-alloca,");
49  FullFS += FS;
50  ParseSubtargetFeatures(GPU, FullFS);
51 
52  // FIXME: I don't think think Evergreen has any useful support for
53  // denormals, but should be checked. Should we issue a warning somewhere
54  // if someone tries to enable these?
56  FP32Denormals = false;
57  }
58 
61 
62  return *this;
63 }
64 
67  StringRef GPU, StringRef FS) {
68  // Determine default and user-specified characteristics
69  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
70  // enabled, but some instructions do not respect them and they run at the
71  // double precision rate, so don't enable by default.
72  //
73  // We want to be able to turn these off, but making this a subtarget feature
74  // for SI has the unhelpful behavior that it unsets everything else if you
75  // disable it.
76  //
77  // Similarly we want enable-prt-strict-null to be on by default and not to
78  // unset everything else if it is disabled
79 
80  // Assuming ECC is enabled is the conservative default.
81  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");
82 
83  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
84  FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
85 
86  // FIXME: I don't think think Evergreen has any useful support for
87  // denormals, but should be checked. Should we issue a warning somewhere
88  // if someone tries to enable these?
90  FullFS += "+fp64-fp16-denormals,";
91  } else {
92  FullFS += "-fp32-denormals,";
93  }
94 
95  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
96 
97  // Disable mutually exclusive bits.
98  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
99  if (FS.find_lower("wavefrontsize16") == StringRef::npos)
100  FullFS += "-wavefrontsize16,";
101  if (FS.find_lower("wavefrontsize32") == StringRef::npos)
102  FullFS += "-wavefrontsize32,";
103  if (FS.find_lower("wavefrontsize64") == StringRef::npos)
104  FullFS += "-wavefrontsize64,";
105  }
106 
107  FullFS += FS;
108 
109  ParseSubtargetFeatures(GPU, FullFS);
110 
111  // We don't support FP64 for EG/NI atm.
113 
114  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
115  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
116  // variants of MUBUF instructions.
117  if (!hasAddr64() && !FS.contains("flat-for-global")) {
118  FlatForGlobal = true;
119  }
120 
121  // Set defaults if needed.
122  if (MaxPrivateElementSize == 0)
123  MaxPrivateElementSize = 4;
124 
125  if (LDSBankCount == 0)
126  LDSBankCount = 32;
127 
128  if (TT.getArch() == Triple::amdgcn) {
129  if (LocalMemorySize == 0)
130  LocalMemorySize = 32768;
131 
132  // Do something sensible for unspecified target.
133  if (!HasMovrel && !HasVGPRIndexMode)
134  HasMovrel = true;
135  }
136 
137  // Don't crash on invalid devices.
138  if (WavefrontSize == 0)
139  WavefrontSize = 64;
140 
142 
143  if (DoesNotSupportXNACK && EnableXNACK) {
144  ToggleFeature(AMDGPU::FeatureXNACK);
145  EnableXNACK = false;
146  }
147 
148  // ECC is on by default, but turn it off if the hardware doesn't support it
149  // anyway. This matters for the gfx9 targets with d16 loads, but don't support
150  // ECC.
151  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
152  ToggleFeature(AMDGPU::FeatureSRAMECC);
153  EnableSRAMECC = false;
154  }
155 
156  return *this;
157 }
158 
160  TargetTriple(TT),
165  HasSDWA(false),
167  HasMulI24(true),
168  HasMulU24(true),
173  LocalMemorySize(0),
174  WavefrontSize(0)
175  { }
176 
178  const GCNTargetMachine &TM) :
179  AMDGPUGenSubtargetInfo(TT, GPU, FS),
180  AMDGPUSubtarget(TT),
181  TargetTriple(TT),
182  Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
183  InstrItins(getInstrItineraryForCPU(GPU)),
184  LDSBankCount(0),
185  MaxPrivateElementSize(0),
186 
187  FastFMAF32(false),
188  HalfRate64Ops(false),
189 
190  FP64FP16Denormals(false),
191  FlatForGlobal(false),
192  AutoWaitcntBeforeBarrier(false),
193  CodeObjectV3(false),
194  UnalignedScratchAccess(false),
195  UnalignedBufferAccess(false),
196 
197  HasApertureRegs(false),
198  EnableXNACK(false),
199  DoesNotSupportXNACK(false),
200  EnableCuMode(false),
201  TrapHandler(false),
202 
204  EnableUnsafeDSOffsetFolding(false),
205  EnableSIScheduler(false),
206  EnableDS128(false),
207  EnablePRTStrictNull(false),
208  DumpCode(false),
209 
210  FP64(false),
211  GCN3Encoding(false),
212  CIInsts(false),
213  GFX8Insts(false),
214  GFX9Insts(false),
215  GFX10Insts(false),
216  GFX7GFX8GFX9Insts(false),
217  SGPRInitBug(false),
218  HasSMemRealTime(false),
219  HasIntClamp(false),
220  HasFmaMixInsts(false),
221  HasMovrel(false),
222  HasVGPRIndexMode(false),
223  HasScalarStores(false),
224  HasScalarAtomics(false),
225  HasSDWAOmod(false),
226  HasSDWAScalar(false),
227  HasSDWASdst(false),
228  HasSDWAMac(false),
229  HasSDWAOutModsVOPC(false),
230  HasDPP(false),
231  HasDPP8(false),
232  HasR128A16(false),
233  HasNSAEncoding(false),
234  HasDLInsts(false),
235  HasDot1Insts(false),
236  HasDot2Insts(false),
237  HasDot5Insts(false),
238  HasDot6Insts(false),
239  EnableSRAMECC(false),
240  DoesNotSupportSRAMECC(false),
241  HasNoSdstCMPX(false),
242  HasVscnt(false),
243  HasRegisterBanking(false),
244  HasVOP3Literal(false),
245  HasNoDataDepHazard(false),
246  FlatAddressSpace(false),
247  FlatInstOffsets(false),
248  FlatGlobalInsts(false),
249  FlatScratchInsts(false),
250  ScalarFlatScratchInsts(false),
251  AddNoCarryInsts(false),
252  HasUnpackedD16VMem(false),
253  LDSMisalignedBug(false),
254 
256 
257  HasVcmpxPermlaneHazard(false),
258  HasVMEMtoScalarWriteHazard(false),
259  HasSMEMtoVectorWriteHazard(false),
260  HasInstFwdPrefetchBug(false),
261  HasVcmpxExecWARHazard(false),
262  HasLdsBranchVmemWARHazard(false),
263  HasNSAtoVMEMBug(false),
264  HasFlatSegmentOffsetBug(false),
265 
266  FeatureDisable(false),
267  InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
268  TLInfo(TM, *this),
269  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
270  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
271  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
272  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
273  InstSelector.reset(new AMDGPUInstructionSelector(
274  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
275 }
276 
277 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
278  if (getGeneration() < GFX10)
279  return 1;
280 
281  switch (Opcode) {
282  case AMDGPU::V_LSHLREV_B64:
283  case AMDGPU::V_LSHLREV_B64_gfx10:
284  case AMDGPU::V_LSHL_B64:
285  case AMDGPU::V_LSHRREV_B64:
286  case AMDGPU::V_LSHRREV_B64_gfx10:
287  case AMDGPU::V_LSHR_B64:
288  case AMDGPU::V_ASHRREV_I64:
289  case AMDGPU::V_ASHRREV_I64_gfx10:
290  case AMDGPU::V_ASHR_I64:
291  return 1;
292  }
293 
294  return 2;
295 }
296 
298  const Function &F) const {
299  if (NWaves == 1)
300  return getLocalMemorySize();
301  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
302  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
303  if (!WorkGroupsPerCu)
304  return 0;
305  unsigned MaxWaves = getMaxWavesPerEU();
306  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
307 }
308 
310  const Function &F) const {
311  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
312  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
313  if (!WorkGroupsPerCu)
314  return 0;
315  unsigned MaxWaves = getMaxWavesPerEU();
316  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
317  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
318  NumWaves = std::min(NumWaves, MaxWaves);
319  NumWaves = std::max(NumWaves, 1u);
320  return NumWaves;
321 }
322 
323 unsigned
325  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
326  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
327 }
328 
329 std::pair<unsigned, unsigned>
331  switch (CC) {
335  return std::make_pair(getWavefrontSize() * 2,
336  std::max(getWavefrontSize() * 4, 256u));
343  return std::make_pair(1, getWavefrontSize());
344  default:
345  return std::make_pair(1, 16 * getWavefrontSize());
346  }
347 }
348 
349 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
350  const Function &F) const {
351  // FIXME: 1024 if function.
352  // Default minimum/maximum flat work group sizes.
353  std::pair<unsigned, unsigned> Default =
355 
356  // Requested minimum/maximum flat work group sizes.
357  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
358  F, "amdgpu-flat-work-group-size", Default);
359 
360  // Make sure requested minimum is less than requested maximum.
361  if (Requested.first > Requested.second)
362  return Default;
363 
364  // Make sure requested values do not violate subtarget's specifications.
365  if (Requested.first < getMinFlatWorkGroupSize())
366  return Default;
367  if (Requested.second > getMaxFlatWorkGroupSize())
368  return Default;
369 
370  return Requested;
371 }
372 
373 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
374  const Function &F) const {
375  // Default minimum/maximum number of waves per execution unit.
376  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
377 
378  // Default/requested minimum/maximum flat work group sizes.
379  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
380 
381  // If minimum/maximum flat work group sizes were explicitly requested using
382  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
383  // number of waves per execution unit to values implied by requested
384  // minimum/maximum flat work group sizes.
385  unsigned MinImpliedByFlatWorkGroupSize =
386  getMaxWavesPerEU(FlatWorkGroupSizes.second);
387  bool RequestedFlatWorkGroupSize = false;
388 
389  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
390  Default.first = MinImpliedByFlatWorkGroupSize;
391  RequestedFlatWorkGroupSize = true;
392  }
393 
394  // Requested minimum/maximum number of waves per execution unit.
395  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
396  F, "amdgpu-waves-per-eu", Default, true);
397 
398  // Make sure requested minimum is less than requested maximum.
399  if (Requested.second && Requested.first > Requested.second)
400  return Default;
401 
402  // Make sure requested values do not violate subtarget's specifications.
403  if (Requested.first < getMinWavesPerEU() ||
404  Requested.first > getMaxWavesPerEU())
405  return Default;
406  if (Requested.second > getMaxWavesPerEU())
407  return Default;
408 
409  // Make sure requested values are compatible with values implied by requested
410  // minimum/maximum flat work group sizes.
411  if (RequestedFlatWorkGroupSize &&
412  Requested.first < MinImpliedByFlatWorkGroupSize)
413  return Default;
414 
415  return Requested;
416 }
417 
419  Function *Kernel = I->getParent()->getParent();
420  unsigned MinSize = 0;
421  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
422  bool IdQuery = false;
423 
424  // If reqd_work_group_size is present it narrows value down.
425  if (auto *CI = dyn_cast<CallInst>(I)) {
426  const Function *F = CI->getCalledFunction();
427  if (F) {
428  unsigned Dim = UINT_MAX;
429  switch (F->getIntrinsicID()) {
430  case Intrinsic::amdgcn_workitem_id_x:
431  case Intrinsic::r600_read_tidig_x:
432  IdQuery = true;
434  case Intrinsic::r600_read_local_size_x:
435  Dim = 0;
436  break;
437  case Intrinsic::amdgcn_workitem_id_y:
438  case Intrinsic::r600_read_tidig_y:
439  IdQuery = true;
441  case Intrinsic::r600_read_local_size_y:
442  Dim = 1;
443  break;
444  case Intrinsic::amdgcn_workitem_id_z:
445  case Intrinsic::r600_read_tidig_z:
446  IdQuery = true;
448  case Intrinsic::r600_read_local_size_z:
449  Dim = 2;
450  break;
451  default:
452  break;
453  }
454  if (Dim <= 3) {
455  if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
456  if (Node->getNumOperands() == 3)
457  MinSize = MaxSize = mdconst::extract<ConstantInt>(
458  Node->getOperand(Dim))->getZExtValue();
459  }
460  }
461  }
462 
463  if (!MaxSize)
464  return false;
465 
466  // Range metadata is [Lo, Hi). For ID query we need to pass max size
467  // as Hi. For size query we need to pass Hi + 1.
468  if (IdQuery)
469  MinSize = 0;
470  else
471  ++MaxSize;
472 
473  MDBuilder MDB(I->getContext());
474  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
475  APInt(32, MaxSize));
476  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
477  return true;
478 }
479 
481  unsigned &MaxAlign) const {
484 
485  const DataLayout &DL = F.getParent()->getDataLayout();
486  uint64_t ExplicitArgBytes = 0;
487  MaxAlign = 1;
488 
489  for (const Argument &Arg : F.args()) {
490  Type *ArgTy = Arg.getType();
491 
492  unsigned Align = DL.getABITypeAlignment(ArgTy);
493  uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
494  ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
495  MaxAlign = std::max(MaxAlign, Align);
496  }
497 
498  return ExplicitArgBytes;
499 }
500 
502  unsigned &MaxAlign) const {
503  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
504 
505  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
506 
507  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
508  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
509  if (ImplicitBytes != 0) {
510  unsigned Alignment = getAlignmentForImplicitArgPtr();
511  TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
512  }
513 
514  // Being able to dereference past the end is useful for emitting scalar loads.
515  return alignTo(TotalSize, 4);
516 }
517 
519  const TargetMachine &TM) :
520  R600GenSubtargetInfo(TT, GPU, FS),
521  AMDGPUSubtarget(TT),
522  InstrInfo(*this),
523  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
524  FMA(false),
525  CaymanISA(false),
526  CFALUBug(false),
529  FP64(false),
530  TexVTXClauseSize(0),
531  Gen(R600),
532  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
533  InstrItins(getInstrItineraryForCPU(GPU)) { }
534 
536  unsigned NumRegionInstrs) const {
537  // Track register pressure so the scheduler can try to decrease
538  // pressure once register usage is above the threshold defined by
539  // SIRegisterInfo::getRegPressureSetLimit()
540  Policy.ShouldTrackPressure = true;
541 
542  // Enabling both top down and bottom up scheduling seems to give us less
543  // register spills than just using one of these approaches on its own.
544  Policy.OnlyTopDown = false;
545  Policy.OnlyBottomUp = false;
546 
547  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
548  if (!enableSIScheduler())
549  Policy.ShouldTrackLaneMasks = true;
550 }
551 
553  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
554 }
555 
556 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
558  return 10;
559 
561  if (SGPRs <= 80)
562  return 10;
563  if (SGPRs <= 88)
564  return 9;
565  if (SGPRs <= 100)
566  return 8;
567  return 7;
568  }
569  if (SGPRs <= 48)
570  return 10;
571  if (SGPRs <= 56)
572  return 9;
573  if (SGPRs <= 64)
574  return 8;
575  if (SGPRs <= 72)
576  return 7;
577  if (SGPRs <= 80)
578  return 6;
579  return 5;
580 }
581 
582 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
583  if (VGPRs <= 24)
584  return 10;
585  if (VGPRs <= 28)
586  return 9;
587  if (VGPRs <= 32)
588  return 8;
589  if (VGPRs <= 36)
590  return 7;
591  if (VGPRs <= 40)
592  return 6;
593  if (VGPRs <= 48)
594  return 5;
595  if (VGPRs <= 64)
596  return 4;
597  if (VGPRs <= 84)
598  return 3;
599  if (VGPRs <= 128)
600  return 2;
601  return 1;
602 }
603 
607  return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
608 
609  if (MFI.hasFlatScratchInit()) {
611  return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
613  return 4; // FLAT_SCRATCH, VCC (in that order).
614  }
615 
616  if (isXNACKEnabled())
617  return 4; // XNACK, VCC (in that order).
618  return 2; // VCC.
619 }
620 
622  const Function &F = MF.getFunction();
624 
625  // Compute maximum number of SGPRs function can use using default/requested
626  // minimum number of waves per execution unit.
627  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
628  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
629  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
630 
631  // Check if maximum number of SGPRs was explicitly requested using
632  // "amdgpu-num-sgpr" attribute.
633  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
634  unsigned Requested = AMDGPU::getIntegerAttribute(
635  F, "amdgpu-num-sgpr", MaxNumSGPRs);
636 
637  // Make sure requested value does not violate subtarget's specifications.
638  if (Requested && (Requested <= getReservedNumSGPRs(MF)))
639  Requested = 0;
640 
641  // If more SGPRs are required to support the input user/system SGPRs,
642  // increase to accommodate them.
643  //
644  // FIXME: This really ends up using the requested number of SGPRs + number
645  // of reserved special registers in total. Theoretically you could re-use
646  // the last input registers for these special registers, but this would
647  // require a lot of complexity to deal with the weird aliasing.
648  unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
649  if (Requested && Requested < InputNumSGPRs)
650  Requested = InputNumSGPRs;
651 
652  // Make sure requested value is compatible with values implied by
653  // default/requested minimum/maximum number of waves per execution unit.
654  if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
655  Requested = 0;
656  if (WavesPerEU.second &&
657  Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
658  Requested = 0;
659 
660  if (Requested)
661  MaxNumSGPRs = Requested;
662  }
663 
664  if (hasSGPRInitBug())
666 
667  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
668  MaxAddressableNumSGPRs);
669 }
670 
672  const Function &F = MF.getFunction();
674 
675  // Compute maximum number of VGPRs function can use using default/requested
676  // minimum number of waves per execution unit.
677  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
678  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
679 
680  // Check if maximum number of VGPRs was explicitly requested using
681  // "amdgpu-num-vgpr" attribute.
682  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
683  unsigned Requested = AMDGPU::getIntegerAttribute(
684  F, "amdgpu-num-vgpr", MaxNumVGPRs);
685 
686  // Make sure requested value is compatible with values implied by
687  // default/requested minimum/maximum number of waves per execution unit.
688  if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
689  Requested = 0;
690  if (WavesPerEU.second &&
691  Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
692  Requested = 0;
693 
694  if (Requested)
695  MaxNumVGPRs = Requested;
696  }
697 
698  return MaxNumVGPRs;
699 }
700 
namespace {
// DAG mutation that glues consecutive memory operations of the same kind
// (VMEM/FLAT/SMRD/DS) together with artificial edges so the scheduler cannot
// move other instructions between them.
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII; // Used only to classify memory instructions.

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  // Walk the SUnits in order, remembering the previous memory op in SUa and
  // tying it to the current one when both belong to the same memory class.
  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consequent memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      // A non-memory instruction breaks the chain: forget the previous op.
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      // First memory op of a potential pair: just remember it.
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      // Only cluster ops of the same memory class (both VMEM, both FLAT,
      // both SMRD, or both DS).
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        // Make SUa an ordering predecessor of SU.
        SU.addPredBarrier(SUa);

        // Hoist SU's other predecessors above SUa so nothing can be
        // scheduled between the pair from below.
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        // Likewise push SUa's other successors below SU. ExitSU is skipped
        // since it cannot take additional artificial predecessors here.
        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      // The current op becomes the candidate for the next pair.
      SUa = &SU;
    }
  }
};
} // namespace
749 
751  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
752  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
753 }
754 
757  return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
758  else
759  return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
760 }
761 
763  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
764  return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
765  else
766  return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
767 }
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on a workitemid.* intrinsic call or load.
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:110
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:29
AMDGPU specific subclass of TargetSubtarget.
Calling convention used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
This class represents lattice values for constants.
Definition: AllocatorList.h:23
Calling convention used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const
Inverse of getMaxLocalMemWithWaveCount.
unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override
This file describes how to lower LLVM calls to machine code calls.
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:720
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.h:323
Mutate the DAG as a postpass after normal DAG building.
Metadata node.
Definition: Metadata.h:863
F(f)
uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the next integer (mod 2**64) that is greater than or equal to Value and is a multiple of Alig...
Definition: MathExtras.h:684
block Block Frequency true
InstrItineraryData InstrItins
unsigned getMaxWavesPerEU() const
Generation getGeneration() const
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:452
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:256
std::pair< int, int > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< int, int > Default, bool OnlyFirstRequired)
Calling convention used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:215
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:462
static const AMDGPUSubtarget & get(const MachineFunction &MF)
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:488
This file declares the targeting of the InstructionSelector class for AMDGPU.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:369
const HexagonInstrInfo * TII
int getLocalMemorySize() const
void getPostRAMutations(std::vector< std::unique_ptr< ScheduleDAGMutation >> &Mutations) const override
MDNode * getMetadata(unsigned KindID) const
Get the current metadata attachments for the given kind, if any.
Definition: Metadata.cpp:1440
void apply(Opt *O, const Mod &M, const Mods &... Ms)
Definition: CommandLine.h:1217
unsigned getMaxNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, bool Addressable)
static cl::opt< bool > ScalarizeGlobal("amdgpu-scalarize-global-loads", cl::desc("Enable global load scalarization"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableLoadStoreOpt("aarch64-enable-ldst-opt", cl::desc("Enable the load/store pair" " optimization pass"), cl::init(true), cl::Hidden)
ArchType getArch() const
getArch - Get the parsed architecture type of this triple.
Definition: Triple.h:295
uint64_t getExplicitKernArgSize(const Function &F, unsigned &MaxAlign) const
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU)
SUnit * getSUnit() const
Definition: ScheduleDAG.h:480
Calling convention used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:207
Scheduling dependency.
Definition: ScheduleDAG.h:49
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
Definition: MachineInstr.h:821
void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:373
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU)
Calling convention for AMDGPU code object kernels.
Definition: CallingConv.h:200
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:45
unsigned getReservedNumSGPRs(const MachineFunction &MF) const
unsigned getStackAlignment() const
void ParseSubtargetFeatures(StringRef CPU, StringRef FS)
R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const TargetMachine &TM)
bool ShouldTrackLaneMasks
Track LaneMasks to allow reordering of independent subregister writes of the same vreg...
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Generation getGeneration() const
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
const Triple & getTargetTriple() const
std::pair< unsigned, unsigned > getWavesPerEU() const
LLVM_NODISCARD bool contains(StringRef Other) const
Return true if the given string is a substring of *this, and false otherwise.
Definition: StringRef.h:432
The AMDGPU TargetMachine interface definition for hw codgen targets.
Calling convention used for AMDPAL shader stage before geometry shader if geometry is in use...
Definition: CallingConv.h:220
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1222
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:43
unsigned getKernArgSegmentSize(const Function &F, unsigned &MaxAlign) const
unsigned getWavefrontSize() const
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:212
unsigned getExplicitKernelArgOffset(const Function &F) const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument...
Information about stack frame layout on the target.
LLVM_NODISCARD size_t find_lower(char C, size_t From=0) const
Search for the first character C in the string, ignoring case.
Definition: StringRef.cpp:57
bool hasCaymanISA() const
bool addPredBarrier(SUnit *SU)
Adds a barrier edge to SU by calling addPred(), with latency 0 generally or latency 1 for a store fol...
Definition: ScheduleDAG.h:384
unsigned getAlignmentForImplicitArgPtr() const
This class provides the information for the target register banks.
Intrinsic::ID getIntrinsicID() const LLVM_READONLY
getIntrinsicID - This method returns the ID number of the specified function, or Intrinsic::not_intri...
Definition: Function.h:193
const Function & getFunction() const
Return the LLVM function that this machine code represents.
Class for arbitrary precision integers.
Definition: APInt.h:69
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM)
This file declares the targeting of the Machinelegalizer class for AMDGPU.
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const
Return the maximum number of waves per SIMD for kernels using SGPRs SGPRs.
Provides AMDGPU specific target descriptions.
A ScheduleDAG for scheduling lists of MachineInstr.
Define a generic scheduling policy for targets that don't provide their own MachineSchedStrategy.
Representation of each machine instruction.
Definition: MachineInstr.h:63
SUnit ExitSU
Special node for the region exit.
Definition: ScheduleDAG.h:564
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
static const size_t npos
Definition: StringRef.h:50
AMDGPUSubtarget(const Triple &TT)
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:106
int getIntegerAttribute(const Function &F, StringRef Name, int Default)
#define I(x, y, z)
Definition: MD5.cpp:58
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:340
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
unsigned getMaxFlatWorkGroupSize() const override
unsigned getMinFlatWorkGroupSize() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
Definition: MachineInstr.h:808
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:575
~GCNSubtarget() override
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:257
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:250
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
GCNSubtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:65
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU)
Calling convention used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
unsigned getConstantBusLimit(unsigned Opcode) const
Calling convention used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (ve...
Definition: CallingConv.h:188
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount...
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:562
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const
Return the maximum number of waves per SIMD for kernels using VGPRs VGPRs.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
unsigned getMinWavesPerEU() const override
const SITargetLowering * getTargetLowering() const override
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
R600Subtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
iterator_range< arg_iterator > args()
Definition: Function.h:705
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
const BasicBlock * getParent() const
Definition: Instruction.h:66
SPIR_KERNEL - Calling convention for SPIR kernel functions.
Definition: CallingConv.h:136
const SIRegisterInfo * getRegisterInfo() const override