//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}
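
// Illustrative example (assumed feature string, not from the original
// source): the string handed to ParseSubtargetFeatures above is the default
// prefix followed by the user features, e.g. for FS = "+fma":
//
//   "+promote-alloca,+dx10-clamp,+fma"
//
// Later entries take precedence, so a user-supplied "-promote-alloca"
// would override the default.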

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}
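
// Illustrative example (reconstructed from the code above, not a quote from
// the original source): on an amdhsa target the default string assembled
// here, before the user features are appended, is
//
//   "+promote-alloca,+dx10-clamp,+load-store-opt,+flat-address-space,"
//   "+flat-for-global,+unaligned-buffer-access,+trap-handler,"
//   "+fp64-fp16-denormals,"
//
// so each default can still be negated by the FS string appended after it.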

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
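
// Worked example (assumed values, for illustration only): with
// LocalMemorySize = 65536, MaxWaves = 10, WorkGroupsPerCu = 4 and
// NWaves = 8, a workgroup may use up to 65536 * 10 / 4 / 8 = 20480 bytes
// of LDS without restricting occupancy below 8 waves.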

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}
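
// Worked example (assumed values, for illustration only): with
// LocalMemorySize = 65536, MaxWaves = 10 and WorkGroupsPerCu = 4,
// Limit = 163840, so a kernel using Bytes = 32768 of LDS gets
// NumWaves = 163840 / 32768 = 5 after clamping to [1, MaxWaves].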

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_HS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}
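
// Illustrative note: with the usual wavefront size of 64 this yields
// (128, 256) for the compute/kernel conventions above, (1, 64) for the
// graphics shader conventions, and (1, 1024) for everything else.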

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
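
// Illustrative IR (hypothetical kernel, not from this file): the attribute
// consulted above is attached to a kernel as
//
//   define amdgpu_kernel void @k() #0 { ... }
//   attributes #0 = { "amdgpu-flat-work-group-size"="64,256" }
//
// A request with min > max, or one outside the subtarget's limits, falls
// back to the defaults.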

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
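
// Illustrative IR (hypothetical kernel): the corresponding attribute is
//
//   attributes #0 = { "amdgpu-waves-per-eu"="2,4" }
//
// and, as above, a request that contradicts the flat work group size or the
// subtarget's limits is discarded in favor of the defaults.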

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
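
// Illustrative example (hypothetical kernel): for a workitem id query in a
// kernel carrying !reqd_work_group_size = !{i32 256, i32 1, i32 1}, the
// x-dimension call is annotated with
//
//   !range !0
//   !0 = !{i32 0, i32 256}
//
// i.e. [0, 256) for the ID, while the matching size query would get
// [256, 257) because MaxSize is incremented first.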

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}
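
// Worked example (assumed kernel signature): for a kernel taking
// (i32, double), the i32 occupies bytes [0, 4), the double is aligned up to
// offset 8 and occupies [8, 16), so ExplicitArgBytes = 16 and MaxAlign = 8.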

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}
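
// Illustrative note: a kernel using 96 SGPRs falls in the "<= 100" bucket
// and gets 8 waves on VI and newer, but only 5 waves on the earlier
// generations, whose per-wave SGPR budget is smaller.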

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}
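
// Illustrative note: these buckets reflect a 256-register VGPR file divided
// among waves; e.g. 33-36 VGPRs still allow 7 waves, while anything over
// 128 VGPRs limits a SIMD to a single wave.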

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
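
// Illustrative IR (hypothetical kernel): the SGPR budget can be narrowed
// with
//
//   attributes #0 = { "amdgpu-num-sgpr"="32" }
//
// subject to the reserved-register and waves-per-EU checks above.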

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}