//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIMachineFunctionInfo.h"
#include "AMDGPUSubtarget.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
#include <cassert>
#include <optional>
#include <vector>

enum { MAX_LANES = 64 };

using namespace llvm;

// TODO -- delete this flag once we have more robust mechanisms to allocate the
// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
// where it is better to produce the VGPR form (e.g. if there are VGPR users
// of the MFMA result).
static cl::opt<bool, true> MFMAVGPRFormOpt(
    "amdgpu-mfma-vgpr-form",
    cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
             "unspecified, default to compiler heuristics"),
    cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(false),
    cl::Hidden);

const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
  const SITargetLowering *TLI = STI->getTargetLowering();
  return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
}

SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
                                             const GCNSubtarget *STI)
    : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
      UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false),
      WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
      PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
      WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
      GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0),
      IsWholeWaveFunction(F.getCallingConv() ==
                          CallingConv::AMDGPU_Gfx_WholeWave) {
  const GCNSubtarget &ST = *STI;
  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
  WavesPerEU = ST.getWavesPerEU(F);
  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
  assert(MaxNumWorkGroups.size() == 3);

  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is completely removed.
  DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
    DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();

  Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
  CallingConv::ID CC = F.getCallingConv();

  VRegFlags.reserve(1024);

  const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
                        CC == CallingConv::SPIR_KERNEL;

  if (IsKernel) {
    WorkGroupIDX = true;
    WorkItemIDX = true;
  } else if (CC == CallingConv::AMDGPU_PS) {
    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
  }

  if (ST.hasGFX90AInsts()) {
    // FIXME: Extract logic out of getMaxNumVectorRegs; we need to apply the
    // allocation granule and clamping.
    auto [MinNumAGPRAttr, MaxNumAGPRAttr] =
        AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u},
                                        /*OnlyFirstRequired=*/true);
    MinNumAGPRs = MinNumAGPRAttr;
  }

  if (AMDGPU::isChainCC(CC)) {
    // Chain functions don't receive an SP from their caller, but are free to
    // set one up. For now, we can use s32 to match what amdgpu_gfx functions
    // would use if called, but this can be revisited.
    // FIXME: Only reserve this if we actually need it.
    StackPtrOffsetReg = AMDGPU::SGPR32;

    ScratchRSrcReg = AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51;

    ArgInfo.PrivateSegmentBuffer =
        ArgDescriptor::createRegister(ScratchRSrcReg);

    ImplicitArgPtr = false;
  } else if (!isEntryFunction()) {
    if (CC != CallingConv::AMDGPU_Gfx &&
        CC != CallingConv::AMDGPU_Gfx_WholeWave)
      ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

    FrameOffsetReg = AMDGPU::SGPR33;
    StackPtrOffsetReg = AMDGPU::SGPR32;

    if (!ST.enableFlatScratch()) {
      // Non-entry functions have no special inputs for now, other than the
      // registers required for scratch access.
      ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;

      ArgInfo.PrivateSegmentBuffer =
          ArgDescriptor::createRegister(ScratchRSrcReg);
    }

    if (!F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
      ImplicitArgPtr = true;
  } else {
    ImplicitArgPtr = false;
    MaxKernArgAlign =
        std::max(ST.getAlignmentForImplicitArgPtr(), MaxKernArgAlign);
  }

  if (!AMDGPU::isGraphics(CC) ||
      ((CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_Gfx) &&
       ST.hasArchitectedSGPRs())) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x") ||
        !F.hasFnAttribute("amdgpu-no-cluster-id-x"))
      WorkGroupIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-y") ||
        !F.hasFnAttribute("amdgpu-no-cluster-id-y"))
      WorkGroupIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z") ||
        !F.hasFnAttribute("amdgpu-no-cluster-id-z"))
      WorkGroupIDZ = true;
  }

  if (!AMDGPU::isGraphics(CC)) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x"))
      WorkItemIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-y") &&
        ST.getMaxWorkitemID(F, 1) != 0)
      WorkItemIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-z") &&
        ST.getMaxWorkitemID(F, 2) != 0)
      WorkItemIDZ = true;

    if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
      LDSKernelId = true;
  }

  if (isEntryFunction()) {
    // X, XY, and XYZ are the only supported combinations, so make sure Y is
    // enabled if Z is.
    if (WorkItemIDZ)
      WorkItemIDY = true;

    if (!ST.flatScratchIsArchitected()) {
      PrivateSegmentWaveByteOffset = true;

      // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
      if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
          (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
        ArgInfo.PrivateSegmentWaveByteOffset =
            ArgDescriptor::createRegister(AMDGPU::SGPR5);
    }
  }

  Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
  StringRef S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, GITPtrHigh);

  A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
  S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, HighBitsOf32BitAddress);

  MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
      "amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit);

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times. For now, reserve the highest available VGPR.
  // After RA, shift it to the lowest available unused VGPR if one exists.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    VGPRForAGPRCopy =
        AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1);
  }

  ClusterDims = AMDGPU::ClusterDimsAttr::get(F);
}

void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
  limitOccupancy(getMaxWavesPerEU());
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  limitOccupancy(ST.getOccupancyWithWorkGroupSizes(MF).second);
}
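
// The add* helpers below allocate the next unallocated user SGPRs for a
// particular preloaded value, record the resulting register (or register
// tuple) in ArgInfo, and bump NumUserSGPRs accordingly.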
219
221 const SIRegisterInfo &TRI) {
222 ArgInfo.PrivateSegmentBuffer =
223 ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
224 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SGPR_128RegClass));
225 NumUserSGPRs += 4;
226 return ArgInfo.PrivateSegmentBuffer.getRegister();
227}

Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchPtr.getRegister();
}

Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
  ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.QueuePtr.getRegister();
}

Register SIMachineFunctionInfo::addKernargSegmentPtr(
    const SIRegisterInfo &TRI) {
  ArgInfo.KernargSegmentPtr =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.KernargSegmentPtr.getRegister();
}

Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchID.getRegister();
}

Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
  ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(
      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0,
                              &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.FlatScratchInit.getRegister();
}

Register SIMachineFunctionInfo::addPrivateSegmentSize(
    const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentSize = ArgDescriptor::createRegister(getNextUserSGPR());
  NumUserSGPRs += 1;
  return ArgInfo.PrivateSegmentSize.getRegister();
}

Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
  ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(
      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0,
                              &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.ImplicitBufferPtr.getRegister();
}

Register SIMachineFunctionInfo::addLDSKernelId() {
  ArgInfo.LDSKernelId = ArgDescriptor::createRegister(getNextUserSGPR());
  NumUserSGPRs += 1;
  return ArgInfo.LDSKernelId.getRegister();
}

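// Allocate user SGPRs to hold a kernel argument that the hardware preloads,
// using the matching register tuple when one lines up with the next user SGPR
// (for 32- and 64-bit arguments) and otherwise recording the individual SGPRs
// so they can be merged later.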
SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
    const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
  auto [It, Inserted] = ArgInfo.PreloadKernArgs.try_emplace(KernArgIdx);
  assert(Inserted && "Preload kernel argument allocated twice.");
  NumUserSGPRs += PaddingSGPRs;
  // If the available register tuples are aligned with the kernarg to be
  // preloaded, use that register; otherwise we need to use a set of SGPRs and
  // merge them.
  if (!ArgInfo.FirstKernArgPreloadReg)
    ArgInfo.FirstKernArgPreloadReg = getNextUserSGPR();
  Register PreloadReg =
      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
  auto &Regs = It->second.Regs;
  if (PreloadReg &&
      (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
    Regs.push_back(PreloadReg);
    NumUserSGPRs += AllocSizeDWord;
  } else {
    Regs.reserve(AllocSizeDWord);
    for (unsigned I = 0; I < AllocSizeDWord; ++I) {
      Regs.push_back(getNextUserSGPR());
      NumUserSGPRs++;
    }
  }

  // Track the actual number of SGPRs that HW will preload to.
  UserSGPRInfo.allocKernargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs);
  return &Regs;
}

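// Create (and remember) a spill slot for the inactive lanes of a WWM register,
// unless this is an entry function, the register already has a slot, or this
// is a chain function that never needs its inactive lanes restored.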
void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                             uint64_t Size, Align Alignment) {
  // Skip if it is an entry function or the register is already added.
  if (isEntryFunction() || WWMSpills.count(VGPR))
    return;

  // Skip if this is a function with the amdgpu_cs_chain or
  // amdgpu_cs_chain_preserve calling convention and this is a scratch register.
  // We never need to allocate a spill for these because we don't even need to
  // restore the inactive lanes for them (they're scratchier than the usual
  // scratch registers). We only need to do this if we have calls to
  // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
  // chain functions do not return) and the function did not contain a call to
  // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
  // when entering the function).
  if (isChainFunction() &&
      (isChainScratchRegister(VGPR) || !MF.getFrameInfo().hasTailCall() ||
       hasInitWholeWave()))
    return;

  WWMSpills.insert(std::make_pair(
      VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
}

// Separate out the callee-saved and scratch registers.
void SIMachineFunctionInfo::splitWWMSpillRegisters(
    MachineFunction &MF,
    SmallVectorImpl<std::pair<Register, int>> &CalleeSavedRegs,
    SmallVectorImpl<std::pair<Register, int>> &ScratchRegs) const {
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (auto &Reg : WWMSpills) {
    if (isCalleeSavedReg(CSRegs, Reg.first))
      CalleeSavedRegs.push_back(Reg);
    else
      ScratchRegs.push_back(Reg);
  }
}

bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
                                             MCPhysReg Reg) const {
  for (unsigned I = 0; CSRegs[I]; ++I) {
    if (CSRegs[I] == Reg)
      return true;
  }

  return false;
}

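// After register allocation, try to move each WWM VGPR down to the lowest
// unused VGPR so the reserved registers stay compact, updating the reserved
// set, the spill tables, and block live-ins to match.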
void SIMachineFunctionInfo::shiftWwmVGPRsToLowestRange(
    MachineFunction &MF, SmallVectorImpl<Register> &WWMVGPRs,
    BitVector &SavedVGPRs) {
  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned I = 0, E = WWMVGPRs.size(); I < E; ++I) {
    Register Reg = WWMVGPRs[I];
    Register NewReg =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (!NewReg || NewReg >= Reg)
      break;

    MRI.replaceRegWith(Reg, NewReg);

    // Update various tables with the new VGPR.
    WWMVGPRs[I] = NewReg;
    WWMReservedRegs.remove(Reg);
    WWMReservedRegs.insert(NewReg);
    MRI.reserveReg(NewReg, TRI);

    // Replace the register in SpillPhysVGPRs. This is needed to look for free
    // lanes while spilling special SGPRs like FP, BP, etc. during PEI.
    auto *RegItr = llvm::find(SpillPhysVGPRs, Reg);
    if (RegItr != SpillPhysVGPRs.end()) {
      unsigned Idx = std::distance(SpillPhysVGPRs.begin(), RegItr);
      SpillPhysVGPRs[Idx] = NewReg;
    }

    // The generic `determineCalleeSaves` might have set the old register if it
    // is in the CSR range.
    SavedVGPRs.reset(Reg);

    for (MachineBasicBlock &MBB : MF) {
      MBB.removeLiveIn(Reg);
      MBB.sortUniqueLiveIns();
    }

    Reg = NewReg;
  }
}

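// Assign one lane for an SGPR spill: lane 0 creates a fresh virtual VGPR,
// later lanes reuse the most recently created one.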
bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    SpillVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillVGPRs.back();
  }

  SGPRSpillsToVirtualVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
  return true;
}

bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    // Find the highest available register if called before RA to ensure the
    // lowest registers are available for allocation. The LaneVGPR, in that
    // case, will be shifted back to the lowest range after VGPR allocation.
    LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF,
                                       !IsPrologEpilog);
    if (LaneVGPR == AMDGPU::NoRegister) {
      // We have no VGPRs left for spilling SGPRs. Reset because we will not
      // partially spill the SGPR to VGPRs.
      SGPRSpillsToPhysicalVGPRLanes.erase(FI);
      return false;
    }

    if (IsPrologEpilog)
      allocateWWMSpill(MF, LaneVGPR);

    reserveWWMRegister(LaneVGPR);
    for (MachineBasicBlock &MBB : MF) {
      MBB.addLiveIn(LaneVGPR);
      MBB.sortUniqueLiveIns();
    }
    SpillPhysVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillPhysVGPRs.back();
  }

  SGPRSpillsToPhysicalVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
  return true;
}

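// Map every 32-bit word of the SGPR spill slot FI onto VGPR lanes (virtual or
// physical); returns false if the slot is too large for a wave or no VGPR can
// be found.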
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
    MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
    bool IsPrologEpilog) {
  std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
      SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
                          : SGPRSpillsToVirtualVGPRLanes[FI];

  // This has already been allocated.
  if (!SpillLanes.empty())
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  unsigned WaveSize = ST.getWavefrontSize();

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;

  if (NumLanes > WaveSize)
    return false;

  assert(Size >= 4 && "invalid sgpr spill size");
  assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
         "not spilling SGPRs to VGPRs");

  unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
                                                : NumVirtualVGPRSpillLanes;

  for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
    unsigned LaneIndex = (NumSpillLanes % WaveSize);

    bool Allocated = SpillToPhysVGPRLane
                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
                                                             IsPrologEpilog)
                         : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
    if (!Allocated) {
      NumSpillLanes -= I;
      return false;
    }
  }

  return true;
}

/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
/// Either AGPR is spilled to VGPR or vice versa.
/// Returns true if a \p FI can be eliminated completely.
bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
                                                    int FI,
                                                    bool isAGPRtoVGPR) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));

  auto &Spill = VGPRToAGPRSpills[FI];

  // This has already been allocated.
  if (!Spill.Lanes.empty())
    return Spill.FullyAllocated;

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;
  Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister);

  const TargetRegisterClass &RC =
      isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
  auto Regs = RC.getRegisters();

  auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  Spill.FullyAllocated = true;

  // FIXME: Move allocation logic out of MachineFunctionInfo and initialize
  // once.
  BitVector OtherUsedRegs;
  OtherUsedRegs.resize(TRI->getNumRegs());

  const uint32_t *CSRMask =
      TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
  if (CSRMask)
    OtherUsedRegs.setBitsInMask(CSRMask);

  // TODO: Should include register tuples, but doesn't matter with current
  // usage.
  for (MCPhysReg Reg : SpillAGPR)
    OtherUsedRegs.set(Reg);
  for (MCPhysReg Reg : SpillVGPR)
    OtherUsedRegs.set(Reg);

  SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
  for (int I = NumLanes - 1; I >= 0; --I) {
    NextSpillReg = std::find_if(
        NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
          return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
                 !OtherUsedRegs[Reg];
        });

    if (NextSpillReg == Regs.end()) { // Registers exhausted
      Spill.FullyAllocated = false;
      break;
    }

    OtherUsedRegs.set(*NextSpillReg);
    SpillRegs.push_back(*NextSpillReg);
    MRI.reserveReg(*NextSpillReg, TRI);
    Spill.Lanes[I] = *NextSpillReg++;
  }

  return Spill.FullyAllocated;
}

bool SIMachineFunctionInfo::removeDeadFrameIndices(
    MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
  // Remove dead frame indices from the function frame, but keep FP & BP since
  // spills for them haven't been inserted yet. Also make sure to remove the
  // frame indices from the SGPRSpillsToVirtualVGPRLanes data structure;
  // otherwise a later pass such as "stack slot coloring" could re-map the
  // freed frame indices and cause unexpected side effects.
  for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
    MFI.RemoveStackObject(R.first);
    SGPRSpillsToVirtualVGPRLanes.erase(R.first);
  }

  // Remove the dead frame indices of CSR SGPRs which are spilled to physical
  // VGPR lanes during SILowerSGPRSpills pass.
  if (!ResetSGPRSpillStackIDs) {
    for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
      MFI.RemoveStackObject(R.first);
      SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
    }
  }
  bool HaveSGPRToMemory = false;

  if (ResetSGPRSpillStackIDs) {
    // All other SGPRs must be allocated on the default stack, so reset the
    // stack ID.
    for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E;
         ++I) {
      if (!checkIndexInPrologEpilogSGPRSpills(I)) {
        if (MFI.getStackID(I) == TargetStackID::SGPRSpill) {
          MFI.setStackID(I, TargetStackID::Default);
          HaveSGPRToMemory = true;
        }
      }
    }
  }

  for (auto &R : VGPRToAGPRSpills) {
    if (R.second.IsDead)
      MFI.RemoveStackObject(R.first);
  }

  return HaveSGPRToMemory;
}

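// Lazily create the stack object used by the register scavenger as an
// emergency SGPR spill slot, reusing it on subsequent queries.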
int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
                                         const SIRegisterInfo &TRI) {
  if (ScavengeFI)
    return *ScavengeFI;

  ScavengeFI =
      MFI.CreateStackObject(TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
                            TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false);
  return *ScavengeFI;
}

MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
  assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
  return AMDGPU::SGPR0 + NumUserSGPRs;
}

MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
  return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}

void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) {
  VRegFlags.grow(Reg);
}

void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg,
                                                         Register SrcReg) {
  VRegFlags.grow(NewReg);
  VRegFlags[NewReg] = VRegFlags[SrcReg];
}

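// Return the SGPR holding the low half of the GIT (global information table)
// pointer on amdpal; merged HS/GS shaders on gfx9+ receive it in s8 instead of
// s0.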
Register
SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.isAmdPalOS())
    return Register();
  Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
  if (ST.hasMergedShaders()) {
    switch (MF.getFunction().getCallingConv()) {
    case CallingConv::AMDGPU_HS:
    case CallingConv::AMDGPU_GS:
      // Low GIT address is passed in s8 rather than s0 for an LS+HS or
      // ES+GS merged shader on gfx9+.
      GitPtrLo = AMDGPU::SGPR8;
      return GitPtrLo;
    default:
      return GitPtrLo;
    }
  }
  return GitPtrLo;
}

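// Helpers for serializing the target-specific machine function info into its
// MIR YAML form.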
static yaml::StringValue regToString(Register Reg,
                                     const TargetRegisterInfo &TRI) {
  yaml::StringValue Dest;
  {
    raw_string_ostream OS(Dest.Value);
    OS << printReg(Reg, &TRI);
  }
  return Dest;
}

static std::optional<yaml::SIArgumentInfo>
convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
                    const TargetRegisterInfo &TRI) {
  yaml::SIArgumentInfo AI;

  auto convertArg = [&](std::optional<yaml::SIArgument> &A,
                        const ArgDescriptor &Arg) {
    if (!Arg)
      return false;

    // Create a register or stack argument.
    yaml::SIArgument SA = yaml::SIArgument::createArgument(Arg.isRegister());
    if (Arg.isRegister()) {
      raw_string_ostream OS(SA.RegisterName.Value);
      OS << printReg(Arg.getRegister(), &TRI);
    } else
      SA.StackOffset = Arg.getStackOffset();
    // Check and update the optional mask.
    if (Arg.isMasked())
      SA.Mask = Arg.getMask();

    A = SA;
    return true;
  };

  // TODO: Need to serialize kernarg preloads.
  bool Any = false;
  Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
  Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
  Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr);
  Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
  Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
  Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
  Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId);
  Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
  Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
  Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
  Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ);
  Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo);
  Any |= convertArg(AI.PrivateSegmentWaveByteOffset,
                    ArgInfo.PrivateSegmentWaveByteOffset);
  Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr);
  Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr);
  Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX);
  Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
  Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);

  if (Any)
    return AI;

  return std::nullopt;
}

yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
    const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI,
    const llvm::MachineFunction &MF)
    : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
      MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
      GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
      IsEntryFunction(MFI.isEntryFunction()),
      NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
      MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
      HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
      HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
      NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
      NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
      HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
      Occupancy(MFI.getOccupancy()),
      ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
      FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
      StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
      BytesInStackArgArea(MFI.getBytesInStackArgArea()),
      ReturnsVoid(MFI.returnsVoid()),
      ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
      PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
      MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
      Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
      IsWholeWaveFunction(MFI.isWholeWaveFunction()),
      DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
      ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
  for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
    SpillPhysVGPRS.push_back(regToString(Reg, TRI));

  for (Register Reg : MFI.getWWMReservedRegs())
    WWMReservedRegs.push_back(regToString(Reg, TRI));

  if (MFI.getLongBranchReservedReg())
    LongBranchReservedReg = regToString(MFI.getLongBranchReservedReg(), TRI);
  if (MFI.getVGPRForAGPRCopy())
    VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI);

  if (MFI.getSGPRForEXECCopy())
    SGPRForEXECCopy = regToString(MFI.getSGPRForEXECCopy(), TRI);

  auto SFI = MFI.getOptionalScavengeFI();
  if (SFI)
    ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo());
}

void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
  MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this);
}

bool SIMachineFunctionInfo::initializeBaseYamlFields(
    const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
    PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) {
  ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
  MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
  LDSSize = YamlMFI.LDSSize;
  GDSSize = YamlMFI.GDSSize;
  DynLDSAlign = YamlMFI.DynLDSAlign;
  PSInputAddr = YamlMFI.PSInputAddr;
  PSInputEnable = YamlMFI.PSInputEnable;
  MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
  HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
  Occupancy = YamlMFI.Occupancy;
  IsEntryFunction = YamlMFI.IsEntryFunction;
  NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
  MemoryBound = YamlMFI.MemoryBound;
  WaveLimiter = YamlMFI.WaveLimiter;
  HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
  HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
  NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
  NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
  BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
  ReturnsVoid = YamlMFI.ReturnsVoid;
  IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;

  if (YamlMFI.ScavengeFI) {
    auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
    if (!FIOrErr) {
      // Create a diagnostic for the frame index.
      const MemoryBuffer &Buffer =
          *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());

      Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1,
                           SourceMgr::DK_Error, toString(FIOrErr.takeError()),
                           "", {}, {});
      SourceRange = YamlMFI.ScavengeFI->SourceRange;
      return true;
    }
    ScavengeFI = *FIOrErr;
  } else {
    ScavengeFI = std::nullopt;
  }
  return false;
}

bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
  auto [MinNumAGPR, MaxNumAGPR] =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u},
                                      /*OnlyFirstRequired=*/true);
  return MinNumAGPR != 0u;
}