LLVM  9.0.0svn
AMDGPUSubtarget.cpp
Go to the documentation of this file.
1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
23 #include "llvm/ADT/SmallString.h"
26 #include "llvm/IR/MDBuilder.h"
28 #include <algorithm>
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "amdgpu-subtarget"
33 
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
42 
// Out-of-line defaulted destructor. NOTE(review): presumably defined here (not
// in the header) so the unique_ptr members created below (CallLoweringInfo,
// Legalizer, RegBankInfo, InstSelector) can delete complete types — confirm.
43 GCNSubtarget::~GCNSubtarget() = default;
44 
// R600Subtarget::initializeSubtargetDependencies — forces "+promote-alloca"
// on, parses the feature string, then sanitizes denormal flags.
// NOTE(review): this extraction dropped source lines 45-46 (the signature's
// first lines), 55 (the condition guarding the FP32Denormals reset) and
// 59-60; restore them from upstream LLVM before editing this function.
47  StringRef GPU, StringRef FS) {
48  SmallString<256> FullFS("+promote-alloca,");
49  FullFS += FS;
50  ParseSubtargetFeatures(GPU, FullFS);
51 
52  // FIXME: I don't think Evergreen has any useful support for
53  // denormals, but should be checked. Should we issue a warning somewhere
54  // if someone tries to enable these?
56  FP32Denormals = false;
57  }
58 
61 
62  return *this;
63 }
64 
// GCNSubtarget::initializeSubtargetDependencies — builds the effective
// feature string (defaults first, user features last so they win), parses it,
// then patches up inconsistent or unset feature flags.
// NOTE(review): this extraction dropped source lines 65-66 (signature),
// 89 (the condition selecting "+fp64-fp16-denormals"), 102 and 131; restore
// them from upstream LLVM before editing this function.
67  StringRef GPU, StringRef FS) {
68  // Determine default and user-specified characteristics
69  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
70  // enabled, but some instructions do not respect them and they run at the
71  // double precision rate, so don't enable by default.
72  //
73  // We want to be able to turn these off, but making this a subtarget feature
74  // for SI has the unhelpful behavior that it unsets everything else if you
75  // disable it.
76  //
77  // Similarly we want enable-prt-strict-null to be on by default and not to
78  // unset everything else if it is disabled
79 
80  // Assuming ECC is enabled is the conservative default.
81  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,");
82 
83  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
84  FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
85 
86  // FIXME: I don't think Evergreen has any useful support for
87  // denormals, but should be checked. Should we issue a warning somewhere
88  // if someone tries to enable these?
90  FullFS += "+fp64-fp16-denormals,";
91  } else {
92  FullFS += "-fp32-denormals,";
93  }
94 
95  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
96 
97  FullFS += FS;
98 
99  ParseSubtargetFeatures(GPU, FullFS);
100 
101  // We don't support FP64 for EG/NI atm.
103 
104  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
105  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
106  // variants of MUBUF instructions.
107  if (!hasAddr64() && !FS.contains("flat-for-global")) {
108  FlatForGlobal = true;
109  }
110 
111  // Set defaults if needed.
112  if (MaxPrivateElementSize == 0)
113  MaxPrivateElementSize = 4;
114 
115  if (LDSBankCount == 0)
116  LDSBankCount = 32;
117 
118  if (TT.getArch() == Triple::amdgcn) {
119  if (LocalMemorySize == 0)
120  LocalMemorySize = 32768;
121 
122  // Do something sensible for unspecified target.
123  if (!HasMovrel && !HasVGPRIndexMode)
124  HasMovrel = true;
125  }
126 
127  // Don't crash on invalid devices.
128  if (WavefrontSize == 0)
129  WavefrontSize = 64;
130 
132 
133  // ECC is on by default, but turn it off if the hardware doesn't support it
134  // anyway. This matters for the gfx9 targets with d16 loads, but don't support
135  // ECC.
136  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
137  ToggleFeature(AMDGPU::FeatureSRAMECC);
138  EnableSRAMECC = false;
139  }
140 
141  return *this;
142 }
143 
// AMDGPUSubtarget(const Triple &TT) — base-class constructor; the initializer
// list sets the feature fields shared by GCN and R600 subtargets.
// NOTE(review): initializers on source lines 146-149, 151 and 154-157 were
// dropped by this extraction; the list below is incomplete.
145  TargetTriple(TT),
150  HasSDWA(false),
152  HasMulI24(true),
153  HasMulU24(true),
158  LocalMemorySize(0),
159  WavefrontSize(0)
160  { }
161 
// GCNSubtarget constructor — initializes the TableGen'd subtarget info, all
// feature flags (overwritten by initializeSubtargetDependencies, which runs
// as part of the InstrInfo member's initializer), and the GlobalISel support
// objects (call lowering, legalizer, register-bank info, instruction
// selector).
// NOTE(review): source lines 162 (signature's first line), 187 and 227 were
// dropped by this extraction; restore from upstream LLVM before editing.
163  const GCNTargetMachine &TM) :
164  AMDGPUGenSubtargetInfo(TT, GPU, FS),
165  AMDGPUSubtarget(TT),
166  TargetTriple(TT),
167  Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
168  InstrItins(getInstrItineraryForCPU(GPU)),
169  LDSBankCount(0),
170  MaxPrivateElementSize(0),
171 
172  FastFMAF32(false),
173  HalfRate64Ops(false),
174 
175  FP64FP16Denormals(false),
176  FlatForGlobal(false),
177  AutoWaitcntBeforeBarrier(false),
178  CodeObjectV3(false),
179  UnalignedScratchAccess(false),
180  UnalignedBufferAccess(false),
181 
182  HasApertureRegs(false),
183  EnableXNACK(false),
184  TrapHandler(false),
185 
186  EnableHugePrivateBuffer(false),
188  EnableUnsafeDSOffsetFolding(false),
189  EnableSIScheduler(false),
190  EnableDS128(false),
191  EnablePRTStrictNull(false),
192  DumpCode(false),
193 
194  FP64(false),
195  GCN3Encoding(false),
196  CIInsts(false),
197  GFX8Insts(false),
198  GFX9Insts(false),
199  GFX7GFX8GFX9Insts(false),
200  SGPRInitBug(false),
201  HasSMemRealTime(false),
202  HasIntClamp(false),
203  HasFmaMixInsts(false),
204  HasMovrel(false),
205  HasVGPRIndexMode(false),
206  HasScalarStores(false),
207  HasScalarAtomics(false),
208  HasSDWAOmod(false),
209  HasSDWAScalar(false),
210  HasSDWASdst(false),
211  HasSDWAMac(false),
212  HasSDWAOutModsVOPC(false),
213  HasDPP(false),
214  HasR128A16(false),
215  HasDLInsts(false),
216  HasDot1Insts(false),
217  HasDot2Insts(false),
218  EnableSRAMECC(false),
219  DoesNotSupportSRAMECC(false),
220  FlatAddressSpace(false),
221  FlatInstOffsets(false),
222  FlatGlobalInsts(false),
223  FlatScratchInsts(false),
224  AddNoCarryInsts(false),
225  HasUnpackedD16VMem(false),
226 
228 
229  FeatureDisable(false),
230  InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
231  TLInfo(TM, *this),
232  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
233  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
234  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
235  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
236  InstSelector.reset(new AMDGPUInstructionSelector(
237  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
238 }
239 
// AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount — per the declaration in
// the header: "Return the amount of LDS that can be used that will not
// restrict the occupancy lower than WaveCount" (NWaves here).
// NOTE(review): source line 240 (signature's first line, with the NWaves
// parameter) is missing from this extraction.
241  const Function &F) const {
242  if (NWaves == 1)
243  return getLocalMemorySize();  // A single wave may use all of LDS.
244  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
245  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
246  unsigned MaxWaves = getMaxWavesPerEU();
247  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
248 }
249 
// AMDGPUSubtarget::getOccupancyWithLocalMemSize — inverse of
// getMaxLocalMemSizeWithWaveCount: the number of waves achievable when a
// kernel uses `Bytes` bytes of LDS, clamped to [1, MaxWaves].
// NOTE(review): source line 250 (signature's first line, with the Bytes
// parameter) is missing from this extraction.
251  const Function &F) const {
252  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
253  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
254  unsigned MaxWaves = getMaxWavesPerEU();
255  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
256  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);  // Guard Bytes == 0.
257  NumWaves = std::min(NumWaves, MaxWaves);
258  NumWaves = std::max(NumWaves, 1u);
259  return NumWaves;
260 }
261 
262 unsigned
// Convenience overload: forwards the MachineFunction's recorded LDS usage to
// the (Bytes, Function) overload above.
// NOTE(review): source line 263 (the name/parameter line) is missing.
264  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
265  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
266 }
267 
268 std::pair<unsigned, unsigned>
// AMDGPUSubtarget::getDefaultFlatWorkGroupSize — selects a default
// {min, max} flat work group size range per calling convention.
// NOTE(review): source lines 269 (name and CallingConv::ID parameter) and
// 271-273 / 275-280 (the case labels for the individual calling conventions)
// were dropped by this extraction; only the case bodies survive below.
270  switch (CC) {
274  return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
281  return std::make_pair(1, getWavefrontSize());
282  default:
283  return std::make_pair(1, 16 * getWavefrontSize());
284  }
285 }
286 
// Returns the {min, max} flat work group sizes for function F: the
// per-calling-convention default, optionally narrowed by the
// "amdgpu-max-work-group-size" / "amdgpu-flat-work-group-size" attributes,
// falling back to the default whenever the request is inconsistent or
// violates the subtarget limits.
// NOTE(review): source line 292 (the right-hand side of Default's
// initializer — presumably getDefaultFlatWorkGroupSize(F.getCallingConv());
// confirm upstream) is missing from this extraction.
287 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
288  const Function &F) const {
289  // FIXME: 1024 if function.
290  // Default minimum/maximum flat work group sizes.
291  std::pair<unsigned, unsigned> Default =
293 
294  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
295  // starts using "amdgpu-flat-work-group-size" attribute.
296  Default.second = AMDGPU::getIntegerAttribute(
297  F, "amdgpu-max-work-group-size", Default.second);
298  Default.first = std::min(Default.first, Default.second);
299 
300  // Requested minimum/maximum flat work group sizes.
301  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
302  F, "amdgpu-flat-work-group-size", Default);
303 
304  // Make sure requested minimum is less than requested maximum.
305  if (Requested.first > Requested.second)
306  return Default;
307 
308  // Make sure requested values do not violate subtarget's specifications.
309  if (Requested.first < getMinFlatWorkGroupSize())
310  return Default;
311  if (Requested.second > getMaxFlatWorkGroupSize())
312  return Default;
313 
314  return Requested;
315 }
316 
317 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
318  const Function &F) const {
319  // Default minimum/maximum number of waves per execution unit.
320  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
321 
322  // Default/requested minimum/maximum flat work group sizes.
323  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
324 
325  // If minimum/maximum flat work group sizes were explicitly requested using
326  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
327  // number of waves per execution unit to values implied by requested
328  // minimum/maximum flat work group sizes.
329  unsigned MinImpliedByFlatWorkGroupSize =
330  getMaxWavesPerEU(FlatWorkGroupSizes.second);
331  bool RequestedFlatWorkGroupSize = false;
332 
333  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
334  // starts using "amdgpu-flat-work-group-size" attribute.
335  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
336  F.hasFnAttribute("amdgpu-flat-work-group-size")) {
337  Default.first = MinImpliedByFlatWorkGroupSize;
338  RequestedFlatWorkGroupSize = true;
339  }
340 
341  // Requested minimum/maximum number of waves per execution unit.
342  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
343  F, "amdgpu-waves-per-eu", Default, true);
344 
345  // Make sure requested minimum is less than requested maximum.
346  if (Requested.second && Requested.first > Requested.second)
347  return Default;
348 
349  // Make sure requested values do not violate subtarget's specifications.
350  if (Requested.first < getMinWavesPerEU() ||
351  Requested.first > getMaxWavesPerEU())
352  return Default;
353  if (Requested.second > getMaxWavesPerEU())
354  return Default;
355 
356  // Make sure requested values are compatible with values implied by requested
357  // minimum/maximum flat work group sizes.
358  if (RequestedFlatWorkGroupSize &&
359  Requested.first < MinImpliedByFlatWorkGroupSize)
360  return Default;
361 
362  return Requested;
363 }
364 
// AMDGPUSubtarget::makeLIDRangeMetadata — per the header declaration:
// "Creates value range metadata on a workitemid.* intrinsic call or load".
// Attaches !range metadata bounding the value by the kernel's flat work group
// size (narrowed by reqd_work_group_size when present); returns true if
// metadata was attached.
// NOTE(review): this extraction dropped source line 365 (the signature,
// `bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const` per the
// header) and lines 380, 387 and 394 between the id/size case labels —
// presumably LLVM_FALLTHROUGH markers; confirm against upstream.
366  Function *Kernel = I->getParent()->getParent();
367  unsigned MinSize = 0;
368  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
369  bool IdQuery = false;
370 
371  // If reqd_work_group_size is present it narrows value down.
372  if (auto *CI = dyn_cast<CallInst>(I)) {
373  const Function *F = CI->getCalledFunction();
374  if (F) {
375  unsigned Dim = UINT_MAX;
376  switch (F->getIntrinsicID()) {
377  case Intrinsic::amdgcn_workitem_id_x:
378  case Intrinsic::r600_read_tidig_x:
379  IdQuery = true;
381  case Intrinsic::r600_read_local_size_x:
382  Dim = 0;
383  break;
384  case Intrinsic::amdgcn_workitem_id_y:
385  case Intrinsic::r600_read_tidig_y:
386  IdQuery = true;
388  case Intrinsic::r600_read_local_size_y:
389  Dim = 1;
390  break;
391  case Intrinsic::amdgcn_workitem_id_z:
392  case Intrinsic::r600_read_tidig_z:
393  IdQuery = true;
395  case Intrinsic::r600_read_local_size_z:
396  Dim = 2;
397  break;
398  default:
399  break;
400  }
401  if (Dim <= 3) {
402  if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
403  if (Node->getNumOperands() == 3)
404  MinSize = MaxSize = mdconst::extract<ConstantInt>(
405  Node->getOperand(Dim))->getZExtValue();
406  }
407  }
408  }
409 
410  if (!MaxSize)
411  return false;
412 
413  // Range metadata is [Lo, Hi). For ID query we need to pass max size
414  // as Hi. For size query we need to pass Hi + 1.
415  if (IdQuery)
416  MinSize = 0;
417  else
418  ++MaxSize;
419 
420  MDBuilder MDB(I->getContext());
421  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
422  APInt(32, MaxSize));
423  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
424  return true;
425 }
426 
// AMDGPUSubtarget::getExplicitKernArgSize — sums the ABI-aligned allocation
// sizes of F's explicit arguments; MaxAlign (out-param) receives the largest
// ABI alignment seen (at least 1).
// NOTE(review): source lines 427 (signature's first line) and 429-430
// (presumably an assert on the calling convention; confirm upstream) are
// missing from this extraction.
428  unsigned &MaxAlign) const {
431 
432  const DataLayout &DL = F.getParent()->getDataLayout();
433  uint64_t ExplicitArgBytes = 0;
434  MaxAlign = 1;
435 
436  for (const Argument &Arg : F.args()) {
437  Type *ArgTy = Arg.getType();
438 
439  unsigned Align = DL.getABITypeAlignment(ArgTy);
440  uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
441  ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
442  MaxAlign = std::max(MaxAlign, Align);
443  }
444 
445  return ExplicitArgBytes;
446 }
447 
// AMDGPUSubtarget::getKernArgSegmentSize — total kernarg segment size:
// explicit args (offset by the target's explicit-arg base offset) plus any
// implicit trailing bytes, rounded up to 4.
// NOTE(review): source line 448 (signature's first line) is missing.
// NOTE(review): when ImplicitBytes != 0 the recomputation on line 458 starts
// from ExplicitArgBytes, dropping ExplicitOffset from TotalSize — looks
// intentional upstream but worth confirming before touching.
449  unsigned &MaxAlign) const {
450  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
451 
452  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
453 
454  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
455  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
456  if (ImplicitBytes != 0) {
457  unsigned Alignment = getAlignmentForImplicitArgPtr();
458  TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
459  }
460 
461  // Being able to dereference past the end is useful for emitting scalar loads.
462  return alignTo(TotalSize, 4);
463 }
464 
// R600Subtarget constructor — initializes the TableGen'd subtarget info and
// R600 feature flags; initializeSubtargetDependencies runs inside TLInfo's
// initializer so parsed features are final before lowering is set up.
// NOTE(review): source lines 465 (signature's first line) and 474-475
// (initializers between CFALUBug and FP64) were dropped by this extraction.
466  const TargetMachine &TM) :
467  R600GenSubtargetInfo(TT, GPU, FS),
468  AMDGPUSubtarget(TT),
469  InstrInfo(*this),
470  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
471  FMA(false),
472  CaymanISA(false),
473  CFALUBug(false),
476  FP64(false),
477  TexVTXClauseSize(0),
478  Gen(R600),
479  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
480  InstrItins(getInstrItineraryForCPU(GPU)) { }
481 
// GCNSubtarget::overrideSchedPolicy — tunes the generic machine scheduler:
// track register pressure, schedule in both directions, and track lane masks
// except under the SI scheduler (which it crashes).
// NOTE(review): source line 482 (signature's first line, with the Policy
// parameter) is missing from this extraction.
483  unsigned NumRegionInstrs) const {
484  // Track register pressure so the scheduler can try to decrease
485  // pressure once register usage is above the threshold defined by
486  // SIRegisterInfo::getRegPressureSetLimit()
487  Policy.ShouldTrackPressure = true;
488 
489  // Enabling both top down and bottom up scheduling seems to give us less
490  // register spills than just using one of these approaches on its own.
491  Policy.OnlyTopDown = false;
492  Policy.OnlyBottomUp = false;
493 
494  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
495  if (!enableSIScheduler())
496  Policy.ShouldTrackLaneMasks = true;
497 }
498 
// Return the maximum number of waves per SIMD for kernels using `SGPRs`
// SGPRs; two threshold tables, selected by the missing guard below.
// NOTE(review): source line 500 — the `if (...) {` matched by the brace on
// line 508, presumably a generation check selecting the first table; confirm
// against upstream — was dropped by this extraction.
499 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
501  if (SGPRs <= 80)
502  return 10;
503  if (SGPRs <= 88)
504  return 9;
505  if (SGPRs <= 100)
506  return 8;
507  return 7;
508  }
509  if (SGPRs <= 48)
510  return 10;
511  if (SGPRs <= 56)
512  return 9;
513  if (SGPRs <= 64)
514  return 8;
515  if (SGPRs <= 72)
516  return 7;
517  if (SGPRs <= 80)
518  return 6;
519  return 5;
520 }
521 
522 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
523  if (VGPRs <= 24)
524  return 10;
525  if (VGPRs <= 28)
526  return 9;
527  if (VGPRs <= 32)
528  return 8;
529  if (VGPRs <= 36)
530  return 7;
531  if (VGPRs <= 40)
532  return 6;
533  if (VGPRs <= 48)
534  return 5;
535  if (VGPRs <= 64)
536  return 4;
537  if (VGPRs <= 84)
538  return 3;
539  if (VGPRs <= 128)
540  return 2;
541  return 1;
542 }
543 
// GCNSubtarget::getReservedNumSGPRs — number of SGPRs reserved for special
// registers (FLAT_SCRATCH, XNACK, VCC), driven by flat-scratch init and XNACK.
// NOTE(review): source lines 544-545 (signature and the MFI local) and
// 547/549 (the condition choosing between the 6- and 4-register returns —
// presumably an XNACK check; confirm upstream) were dropped.
546  if (MFI.hasFlatScratchInit()) {
548  return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
550  return 4; // FLAT_SCRATCH, VCC (in that order).
551  }
552 
553  if (isXNACKEnabled())
554  return 4; // XNACK, VCC (in that order).
555  return 2; // VCC.
556 }
557 
// GCNSubtarget::getMaxNumSGPRs — maximum SGPRs the function may use, from
// the waves-per-EU constraint, optionally overridden by a validated
// "amdgpu-num-sgpr" attribute, minus the reserved special registers.
// NOTE(review): source lines 558 (signature), 560 (the MFI local used below)
// and 602 (the body of the hasSGPRInitBug() if) were dropped by this
// extraction; restore from upstream before editing.
559  const Function &F = MF.getFunction();
561 
562  // Compute maximum number of SGPRs function can use using default/requested
563  // minimum number of waves per execution unit.
564  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
565  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
566  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
567 
568  // Check if maximum number of SGPRs was explicitly requested using
569  // "amdgpu-num-sgpr" attribute.
570  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
571  unsigned Requested = AMDGPU::getIntegerAttribute(
572  F, "amdgpu-num-sgpr", MaxNumSGPRs);
573 
574  // Make sure requested value does not violate subtarget's specifications.
575  if (Requested && (Requested <= getReservedNumSGPRs(MF)))
576  Requested = 0;
577 
578  // If more SGPRs are required to support the input user/system SGPRs,
579  // increase to accommodate them.
580  //
581  // FIXME: This really ends up using the requested number of SGPRs + number
582  // of reserved special registers in total. Theoretically you could re-use
583  // the last input registers for these special registers, but this would
584  // require a lot of complexity to deal with the weird aliasing.
585  unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
586  if (Requested && Requested < InputNumSGPRs)
587  Requested = InputNumSGPRs;
588 
589  // Make sure requested value is compatible with values implied by
590  // default/requested minimum/maximum number of waves per execution unit.
591  if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
592  Requested = 0;
593  if (WavesPerEU.second &&
594  Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
595  Requested = 0;
596 
597  if (Requested)
598  MaxNumSGPRs = Requested;
599  }
600 
601  if (hasSGPRInitBug())
603 
604  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
605  MaxAddressableNumSGPRs);
606 }
607 
// GCNSubtarget::getMaxNumVGPRs — maximum VGPRs the function may use, from
// the waves-per-EU constraint, optionally overridden by a validated
// "amdgpu-num-vgpr" attribute. Mirrors getMaxNumSGPRs above.
// NOTE(review): source lines 608 (signature) and 610 (the MFI local used
// below) were dropped by this extraction.
609  const Function &F = MF.getFunction();
611 
612  // Compute maximum number of VGPRs function can use using default/requested
613  // minimum number of waves per execution unit.
614  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
615  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
616 
617  // Check if maximum number of VGPRs was explicitly requested using
618  // "amdgpu-num-vgpr" attribute.
619  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
620  unsigned Requested = AMDGPU::getIntegerAttribute(
621  F, "amdgpu-num-vgpr", MaxNumVGPRs);
622 
623  // Make sure requested value is compatible with values implied by
624  // default/requested minimum/maximum number of waves per execution unit.
625  if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
626  Requested = 0;
627  if (WavesPerEU.second &&
628  Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
629  Requested = 0;
630 
631  if (Requested)
632  MaxNumVGPRs = Requested;
633  }
634 
635  return MaxNumVGPRs;
636 }
637 
638 namespace {
639 struct MemOpClusterMutation : ScheduleDAGMutation {
640  const SIInstrInfo *TII;
641 
642  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
643 
644  void apply(ScheduleDAGInstrs *DAG) override {
645  SUnit *SUa = nullptr;
646  // Search for two consequent memory operations and link them
647  // to prevent scheduler from moving them apart.
648  // In DAG pre-process SUnits are in the original order of
649  // the instructions before scheduling.
650  for (SUnit &SU : DAG->SUnits) {
651  MachineInstr &MI2 = *SU.getInstr();
652  if (!MI2.mayLoad() && !MI2.mayStore()) {
653  SUa = nullptr;
654  continue;
655  }
656  if (!SUa) {
657  SUa = &SU;
658  continue;
659  }
660 
661  MachineInstr &MI1 = *SUa->getInstr();
662  if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
663  (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
664  (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
665  (TII->isDS(MI1) && TII->isDS(MI2))) {
666  SU.addPredBarrier(SUa);
667 
668  for (const SDep &SI : SU.Preds) {
669  if (SI.getSUnit() != SUa)
670  SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
671  }
672 
673  if (&SU != &DAG->ExitSU) {
674  for (const SDep &SI : SUa->Succs) {
675  if (SI.getSUnit() != &SU)
676  SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
677  }
678  }
679  }
680 
681  SUa = &SU;
682  }
683  }
684 };
685 } // namespace
686 
// GCNSubtarget::getPostRAMutations — registers the memory-op clustering
// mutation (above) with the post-RA scheduler.
// NOTE(review): source line 687 (signature's first line) is missing; the
// header declares it as getPostRAMutations(std::vector<...> &) const override.
688  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
689  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
690 }
691 
// static AMDGPUSubtarget::get(const MachineFunction &) — returns the common
// base subtarget, dispatching to GCNSubtarget or R600Subtarget.
// NOTE(review): source lines 692-693 (the signature and the `if` matched by
// the `else` below — presumably a Triple::amdgcn arch check as in the
// overload that follows; confirm upstream) were dropped by this extraction.
694  return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
695  else
696  return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
697 }
698 
// static AMDGPUSubtarget::get(const TargetMachine &, const Function &) —
// same dispatch as the MachineFunction overload, keyed on the target triple.
// NOTE(review): source line 699 (the signature's first line) is missing.
700  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
701  return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
702  else
703  return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
704 }
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on a workitemid.* intrinsic call or load.
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:110
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:29
AMDGPU specific subclass of TargetSubtarget.
This class represents lattice values for constants.
Definition: AllocatorList.h:23
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const
Inverse of getMaxLocalMemWithWaveCount.
unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override
This file describes how to lower LLVM calls to machine code calls.
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:709
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.h:320
Mutate the DAG as a postpass after normal DAG building.
Metadata node.
Definition: Metadata.h:863
F(f)
uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the next integer (mod 2**64) that is greater than or equal to Value and is a multiple of Alig...
Definition: MathExtras.h:684
block Block Frequency true
InstrItineraryData InstrItins
unsigned getMaxWavesPerEU() const
Calling convention used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
Generation getGeneration() const
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:438
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:256
std::pair< int, int > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< int, int > Default, bool OnlyFirstRequired)
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:448
static const AMDGPUSubtarget & get(const MachineFunction &MF)
Calling convention used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:474
This file declares the targeting of the InstructionSelector class for AMDGPU.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:369
const HexagonInstrInfo * TII
int getLocalMemorySize() const
void getPostRAMutations(std::vector< std::unique_ptr< ScheduleDAGMutation >> &Mutations) const override
MDNode * getMetadata(unsigned KindID) const
Get the current metadata attachments for the given kind, if any.
Definition: Metadata.cpp:1443
void apply(Opt *O, const Mod &M, const Mods &... Ms)
Definition: CommandLine.h:1190
unsigned getMaxNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, bool Addressable)
static cl::opt< bool > ScalarizeGlobal("amdgpu-scalarize-global-loads", cl::desc("Enable global load scalarization"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableLoadStoreOpt("aarch64-enable-ldst-opt", cl::desc("Enable the load/store pair" " optimization pass"), cl::init(true), cl::Hidden)
ArchType getArch() const
getArch - Get the parsed architecture type of this triple.
Definition: Triple.h:291
uint64_t getExplicitKernArgSize(const Function &F, unsigned &MaxAlign) const
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU)
SUnit * getSUnit() const
Definition: ScheduleDAG.h:480
Scheduling dependency.
Definition: ScheduleDAG.h:49
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
Definition: MachineInstr.h:819
void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:373
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:45
unsigned getReservedNumSGPRs(const MachineFunction &MF) const
unsigned getStackAlignment() const
void ParseSubtargetFeatures(StringRef CPU, StringRef FS)
R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const TargetMachine &TM)
Calling convention used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:207
bool ShouldTrackLaneMasks
Track LaneMasks to allow reordering of independent subregister writes of the same vreg...
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
const Triple & getTargetTriple() const
std::pair< unsigned, unsigned > getWavesPerEU() const
LLVM_NODISCARD bool contains(StringRef Other) const
Return true if the given string is a substring of *this, and false otherwise.
Definition: StringRef.h:432
The AMDGPU TargetMachine interface definition for hw codgen targets.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1225
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:43
unsigned getKernArgSegmentSize(const Function &F, unsigned &MaxAlign) const
Calling convention used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (ve...
Definition: CallingConv.h:188
unsigned getWavefrontSize() const
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:212
unsigned getExplicitKernelArgOffset(const Function &F) const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument...
Information about stack frame layout on the target.
Calling convention used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
bool hasCaymanISA() const
bool addPredBarrier(SUnit *SU)
Adds a barrier edge to SU by calling addPred(), with latency 0 generally or latency 1 for a store fol...
Definition: ScheduleDAG.h:384
unsigned getAlignmentForImplicitArgPtr() const
This class provides the information for the target register banks.
Intrinsic::ID getIntrinsicID() const LLVM_READONLY
getIntrinsicID - This method returns the ID number of the specified function, or Intrinsic::not_intri...
Definition: Function.h:193
const Function & getFunction() const
Return the LLVM function that this machine code represents.
Class for arbitrary precision integers.
Definition: APInt.h:69
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM)
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Calling convention for AMDGPU code object kernels.
Definition: CallingConv.h:200
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const
Return the maximum number of waves per SIMD for kernels using SGPRs SGPRs.
Calling convention used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:215
Provides AMDGPU specific target descriptions.
A ScheduleDAG for scheduling lists of MachineInstr.
Define a generic scheduling policy for targets that don't provide their own MachineSchedStrategy.
Representation of each machine instruction.
Definition: MachineInstr.h:63
SUnit ExitSU
Special node for the region exit.
Definition: ScheduleDAG.h:564
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPUSubtarget(const Triple &TT)
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:106
int getIntegerAttribute(const Function &F, StringRef Name, int Default)
#define I(x, y, z)
Definition: MD5.cpp:58
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:334
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
unsigned getMaxFlatWorkGroupSize() const override
unsigned getMinFlatWorkGroupSize() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
Definition: MachineInstr.h:806
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:565
~GCNSubtarget() override
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:257
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:250
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
GCNSubtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:65
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU)
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount...
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:562
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const
Return the maximum number of waves per SIMD for kernels using VGPRs VGPRs.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
unsigned getMinWavesPerEU() const override
const SITargetLowering * getTargetLowering() const override
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
R600Subtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
Calling convention used for AMDPAL shader stage before geometry shader if geometry is in use...
Definition: CallingConv.h:220
SPIR_KERNEL - Calling convention for SPIR kernel functions.
Definition: CallingConv.h:136
iterator_range< arg_iterator > args()
Definition: Function.h:691
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
const BasicBlock * getParent() const
Definition: Instruction.h:66
const SIRegisterInfo * getRegisterInfo() const override