//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // it should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // it should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

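  // Subtarget feature strings are order-sensitive: later entries take
  // precedence, so appending the user string FS last lets e.g.
  // "-load-store-opt" in FS override the "+load-store-opt" default above.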
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}


AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
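  // Worked example with assumed values (not taken from a real subtarget):
  // LocalMemorySize = 32768, MaxWaves = 10 and WorkGroupsPerCu = 10 give a
  // kernel running at NWaves = 5 at most 32768 * 10 / 10 / 5 = 6553 bytes of
  // LDS per work group.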
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
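  // E.g. (illustrative numbers only) Limit = 32768 and Bytes = 9000 give
  // NumWaves = 3 before the clamping below.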
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

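// The bounds below come from IR function attributes. For example, a kernel
// declared with __attribute__((amdgpu_flat_work_group_size(64, 256))) in
// clang carries "amdgpu-flat-work-group-size"="64,256" in the IR.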
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

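// Analogous to getFlatWorkGroupSizes, but driven by the "amdgpu-waves-per-eu"
// attribute, e.g. "amdgpu-waves-per-eu"="2,4" as emitted by clang for
// __attribute__((amdgpu_waves_per_eu(2, 4))).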
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

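// OpenCL kernels may carry reqd_work_group_size metadata, e.g.
//   !reqd_work_group_size !{i32 256, i32 1, i32 1}
// which pins the size of one dimension and lets the range below collapse to a
// single value.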
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass MaxSize as
  // Hi; for a size query we need to pass MaxSize + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

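  // E.g. an ID query with MaxSize = 256 gets !range [0, 256) (IDs run from 0
  // to 255), while a size query with reqd_work_group_size = 256 gets
  // !range [256, 257).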
  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }
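  // Worked example: for kernel arguments (i32, double), the i32 occupies
  // bytes 0..3 and the double is realigned to 8, so ExplicitArgBytes =
  // alignTo(4, 8) + 8 = 16 and MaxAlign = 8.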

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

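// These cutoffs follow from the per-SIMD SGPR file divided by the wave count,
// rounded down to the 8-register allocation granule: 800 SGPRs on VI and
// newer (e.g. 800 / 9 = 88.9 -> 88), 512 on SI/CI (e.g. 512 / 6 = 85.3 -> 80).
// Hardware sizes are assumed from public ISA documentation.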
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

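// Same idea for the 256-entry VGPR file with a 4-register allocation granule:
// occupancy N allows at most 256 / N VGPRs, e.g. 256 / 7 = 36.6 -> 36 and
// 256 / 3 = 85.3 -> 84.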
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}