//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless flat-for-global is explicitly enabled or disabled in the feature
  // string, turn it on for all OSes on VI and newer hardware to avoid
  // assertion failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}
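
// For illustration: on an amdhsa target the default string above expands to
//   "+promote-alloca,+dx10-clamp,+load-store-opt,+flat-address-space,
//    +flat-for-global,+unaligned-buffer-access,+trap-handler,
//    +fp64-fp16-denormals," + FS
// Features are applied in order during parsing, so a user-supplied FS such as
// "-promote-alloca" still overrides the defaults prepended here.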

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
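
// Worked example (illustrative numbers, not tied to a specific subtarget):
// with 32768 bytes of LDS, MaxWaves of 10, and 2 work groups per CU, a kernel
// that must sustain 5 waves can use at most 32768 * 10 / 2 / 5 = 32768 bytes
// of LDS.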

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}
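
// Worked example (same illustrative configuration as above: 32768 bytes of
// LDS, MaxWaves = 10, 2 work groups per CU): the limit is 32768 * 10 / 2 =
// 163840, so a kernel using 40000 bytes of LDS gets
// clamp(163840 / 40000, 1, 10) = 4 waves.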

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}
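
// For illustration, with the common wavefront size of 64 this yields defaults
// of (128, 256) for compute kernels, (1, 64) for the graphics shader calling
// conventions, and (1, 1024) otherwise.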

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
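
// For illustration, in source these bounds typically come from the clang
// attribute, e.g.:
//   __attribute__((amdgpu_flat_work_group_size(64, 256)))
// which lowers to the IR function attribute
// "amdgpu-flat-work-group-size"="64,256" parsed here.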

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure the requested values are compatible with the values implied by
  // the requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
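
// For illustration, the bounds parsed here typically originate from the clang
// attribute, e.g.:
//   __attribute__((amdgpu_waves_per_eu(2, 4)))
// which lowers to the IR function attribute "amdgpu-waves-per-eu"="2,4".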

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID queries we need to pass max size
  // as Hi. For size queries we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
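
// For illustration: on a kernel carrying
//   !reqd_work_group_size !{i32 256, i32 1, i32 1}
// a call to llvm.amdgcn.workitem.id.x is annotated with
// !range !{i32 0, i32 256}, while a local-size query for the same dimension
// gets !range !{i32 256, i32 257}.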

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}
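
// Worked example (assuming the usual ABI alignments from the AMDGPU data
// layout): for a kernel taking (i32, <4 x float>), the i32 occupies bytes
// [0, 4), the vector is aligned up to offset 16 and occupies bytes [16, 32),
// so ExplicitArgBytes is 32 and MaxAlign is 16.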

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}
487 1715 : unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
488 1715 : if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
489 1145 : if (SGPRs <= 80)
490 : return 10;
491 63 : if (SGPRs <= 88)
492 : return 9;
493 63 : if (SGPRs <= 100)
494 : return 8;
495 56 : return 7;
496 : }
497 570 : if (SGPRs <= 48)
498 : return 10;
499 135 : if (SGPRs <= 56)
500 : return 9;
501 120 : if (SGPRs <= 64)
502 : return 8;
503 105 : if (SGPRs <= 72)
504 : return 7;
505 71 : if (SGPRs <= 80)
506 2 : return 6;
507 : return 5;
508 : }
509 :
510 1715 : unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
511 1715 : if (VGPRs <= 24)
512 : return 10;
513 1078 : if (VGPRs <= 28)
514 : return 9;
515 1059 : if (VGPRs <= 32)
516 : return 8;
517 826 : if (VGPRs <= 36)
518 : return 7;
519 392 : if (VGPRs <= 40)
520 : return 6;
521 364 : if (VGPRs <= 48)
522 : return 5;
523 352 : if (VGPRs <= 64)
524 : return 4;
525 323 : if (VGPRs <= 84)
526 : return 3;
527 201 : if (VGPRs <= 128)
528 27 : return 2;
529 : return 1;
530 : }
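
// For illustration: a function using 90 SGPRs is limited to 8 waves per EU on
// VI and newer (90 <= 100) but only 5 waves on SI/CI (90 > 80); one using 90
// VGPRs is limited to 2 waves on either (84 < 90 <= 128).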

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
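
// For illustration, a register budget can be requested per function with the
// clang attribute, e.g.:
//   __attribute__((amdgpu_num_sgpr(32)))
// which lowers to the IR attribute "amdgpu-num-sgpr"="32"; the checks above
// drop requests that are infeasible for the subtarget.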

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-processing, SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace
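
// For illustration: given consecutive DS instructions A; B, the mutation adds
// a barrier edge A -> B, makes every other predecessor of B also a
// predecessor of A, and every other successor of A also a successor of B, so
// nothing can be scheduled between the pair.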

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}