LLVM 20.0.0git
AMDGPUCallLowering.cpp
Go to the documentation of this file.
1//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements the lowering of LLVM calls to machine code calls for
11/// GlobalISel.
12///
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUCallLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPULegalizerInfo.h"
18#include "AMDGPUTargetMachine.h"
20#include "SIRegisterInfo.h"
25#include "llvm/IR/IntrinsicsAMDGPU.h"
26
27#define DEBUG_TYPE "amdgpu-call-lowering"
28
29using namespace llvm;
30
31namespace {
32
33/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
34static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
35 Register ValVReg, const CCValAssign &VA) {
36 if (VA.getLocVT().getSizeInBits() < 32) {
37 // 16-bit types are reported as legal for 32-bit registers. We need to
38 // extend and do a 32-bit copy to avoid the verifier complaining about it.
39 return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
40 }
41
42 return Handler.extendRegister(ValVReg, VA);
43}
44
// Handler for outgoing values that are passed in registers. assignValueToReg
// widens each value to at least 32 bits (extendRegisterMin32), copies it into
// the assigned physical register and adds that register as an implicit use on
// MIB. Both stack-related hooks visible below are llvm_unreachable.
//
// NOTE(review): listing lines 47, 50 and 52-53 are absent from this extract --
// presumably the constructor's MIB parameter, the MIB member declaration and
// the start of the getStackAddress signature. Restore them from upstream
// before building.
45struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
46 AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
48 : OutgoingValueHandler(B, MRI), MIB(MIB) {}
49
51
54 ISD::ArgFlagsTy Flags) override {
55 llvm_unreachable("not implemented");
56 }
57
58 void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
59 const MachinePointerInfo &MPO,
60 const CCValAssign &VA) override {
61 llvm_unreachable("not implemented");
62 }
63
// Copy ValVReg (extended to >= 32 bits) into PhysReg. When PhysReg is an
// SGPR the value is first routed through amdgcn_readfirstlane, which is
// emitted on an s32 value; 32-bit pointers are converted with ptrtoint and
// other 32-bit non-s32 types with a bitcast.
64 void assignValueToReg(Register ValVReg, Register PhysReg,
65 const CCValAssign &VA) override {
66 Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
67
68 // If this is a scalar return, insert a readfirstlane just in case the value
69 // ends up in a VGPR.
70 // FIXME: Assert this is a shader return.
71 const SIRegisterInfo *TRI
72 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
73 if (TRI->isSGPRReg(MRI, PhysReg)) {
74 LLT Ty = MRI.getType(ExtReg);
75 LLT S32 = LLT::scalar(32);
76 if (Ty != S32) {
77 // FIXME: We should probably support readfirstlane intrinsics with all
78 // legal 32-bit types.
79 assert(Ty.getSizeInBits() == 32);
80 if (Ty.isPointer())
81 ExtReg = MIRBuilder.buildPtrToInt(S32, ExtReg).getReg(0);
82 else
83 ExtReg = MIRBuilder.buildBitcast(S32, ExtReg).getReg(0);
84 }
85
86 auto ToSGPR = MIRBuilder
87 .buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
88 {MRI.getType(ExtReg)})
89 .addReg(ExtReg);
90 ExtReg = ToSGPR.getReg(0);
91 }
92
93 MIRBuilder.buildCopy(PhysReg, ExtReg);
// Attach PhysReg to MIB as an implicit use.
94 MIB.addUse(PhysReg, RegState::Implicit);
95 }
96};
97
// Common base for handlers of incoming values. Subclasses supply
// markPhysRegUsed to say how a physical register carrying an incoming value
// is recorded. StackUsed tracks the largest Size + Offset seen by the
// stack-address hook.
//
// NOTE(review): listing lines 104-105, 115 and 146 are absent from this
// extract -- presumably the start of the getStackAddress signature, the
// arguments of buildFrameIndex and the leading arguments of
// getMachineMemOperand. Restore them from upstream before building.
98struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
99 uint64_t StackUsed = 0;
100
101 AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
102 : IncomingValueHandler(B, MRI) {}
103
// Stack-address hook: creates a fixed frame object of Size bytes at Offset
// (immutable unless the argument is byval), points MPO at it, and returns
// the register produced by the frame-index instruction.
106 ISD::ArgFlagsTy Flags) override {
107 auto &MFI = MIRBuilder.getMF().getFrameInfo();
108
109 // Byval is assumed to be writable memory, but other stack passed arguments
110 // are not.
111 const bool IsImmutable = !Flags.isByVal();
112 int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
113 MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
114 auto AddrReg = MIRBuilder.buildFrameIndex(
116 StackUsed = std::max(StackUsed, Size + Offset);
117 return AddrReg.getReg(0);
118 }
119
// Read an incoming value out of PhysReg into ValVReg. Locations narrower
// than 32 bits are copied as s32, annotated via buildExtensionHint and then
// truncated; everything else defers to the base class.
120 void assignValueToReg(Register ValVReg, Register PhysReg,
121 const CCValAssign &VA) override {
122 markPhysRegUsed(PhysReg);
123
124 if (VA.getLocVT().getSizeInBits() < 32) {
125 // 16-bit types are reported as legal for 32-bit registers. We need to do
126 // a 32-bit copy, and truncate to avoid the verifier complaining about it.
127 auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
128
129 // If we have signext/zeroext, it applies to the whole 32-bit register
130 // before truncation.
131 auto Extended =
132 buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT()));
133 MIRBuilder.buildTrunc(ValVReg, Extended);
134 return;
135 }
136
137 IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
138 }
139
// Load an incoming stack-passed value from Addr into ValVReg. The alignment
// of the memory operand is inferred from MPO.
140 void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
141 const MachinePointerInfo &MPO,
142 const CCValAssign &VA) override {
143 MachineFunction &MF = MIRBuilder.getMF();
144
145 auto MMO = MF.getMachineMemOperand(
147 inferAlignFromPtrInfo(MF, MPO));
148 MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
149 }
150
151 /// How the physical register gets marked varies between formal
152 /// parameters (it's a basic-block live-in), and a call instruction
153 /// (it's an implicit-def of the BL).
154 virtual void markPhysRegUsed(unsigned PhysReg) = 0;
155};
156
157struct FormalArgHandler : public AMDGPUIncomingArgHandler {
159 : AMDGPUIncomingArgHandler(B, MRI) {}
160
161 void markPhysRegUsed(unsigned PhysReg) override {
162 MIRBuilder.getMBB().addLiveIn(PhysReg);
163 }
164};
165
166struct CallReturnHandler : public AMDGPUIncomingArgHandler {
167 CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
169 : AMDGPUIncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}
170
171 void markPhysRegUsed(unsigned PhysReg) override {
172 MIB.addDef(PhysReg, RegState::Implicit);
173 }
174
176};
177
// Handler for outgoing call arguments. Register arguments are widened to at
// least 32 bits and copied into their physical register; stack arguments are
// stored relative to a cached stack-pointer value, or into a fixed frame
// object when lowering a tail call.
//
// NOTE(review): listing lines 189, 195, 205, 209 and 229 are absent from this
// extract -- presumably the remaining constructor parameters (MRI, MIB), the
// MPO parameter of getStackAddress, the MPO assignments on both paths and the
// declaration of the MFI pointer used below. Restore them from upstream
// before building.
178struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
179 /// For tail calls, the byte offset of the call's argument area from the
180 /// callee's. Unused elsewhere.
181 int FPDiff;
182
183 // Cache the SP register vreg if we need it more than once in this call site.
184 Register SPReg;
185
186 bool IsTailCall;
187
188 AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
190 bool IsTailCall = false, int FPDiff = 0)
191 : AMDGPUOutgoingValueHandler(MIRBuilder, MRI, MIB), FPDiff(FPDiff),
192 IsTailCall(IsTailCall) {}
193
// Return a register holding the address at which an outgoing stack argument
// of Size bytes at Offset is stored. Addresses are 32-bit pointers in the
// private address space.
194 Register getStackAddress(uint64_t Size, int64_t Offset,
196 ISD::ArgFlagsTy Flags) override {
197 MachineFunction &MF = MIRBuilder.getMF();
198 const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
199 const LLT S32 = LLT::scalar(32);
200
// Tail calls address the argument through a fixed frame object at
// Offset + FPDiff instead of going through the stack pointer.
201 if (IsTailCall) {
202 Offset += FPDiff;
203 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
204 auto FIReg = MIRBuilder.buildFrameIndex(PtrTy, FI);
206 return FIReg.getReg(0);
207 }
208
210
// Materialize the stack pointer once per handler and reuse it afterwards.
211 if (!SPReg) {
212 const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>();
213 if (ST.enableFlatScratch()) {
214 // The stack is accessed unswizzled, so we can use a regular copy.
215 SPReg = MIRBuilder.buildCopy(PtrTy,
216 MFI->getStackPtrOffsetReg()).getReg(0);
217 } else {
218 // The address we produce here, without knowing the use context, is going
219 // to be interpreted as a vector address, so we need to convert to a
220 // swizzled address.
221 SPReg = MIRBuilder.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {PtrTy},
222 {MFI->getStackPtrOffsetReg()}).getReg(0);
223 }
224 }
225
226 auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);
227
228 auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
230 return AddrReg.getReg(0);
231 }
232
// Register-passed argument: add the implicit use to the call, then copy the
// value (widened to at least 32 bits) into PhysReg.
233 void assignValueToReg(Register ValVReg, Register PhysReg,
234 const CCValAssign &VA) override {
235 MIB.addUse(PhysReg, RegState::Implicit);
236 Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
237 MIRBuilder.buildCopy(PhysReg, ExtReg);
238 }
239
// Stack-passed argument: store ValVReg to Addr. The store alignment is the
// common alignment of the subtarget stack alignment and the location's
// memory offset.
240 void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
241 const MachinePointerInfo &MPO,
242 const CCValAssign &VA) override {
243 MachineFunction &MF = MIRBuilder.getMF();
244 uint64_t LocMemOffset = VA.getLocMemOffset();
245 const auto &ST = MF.getSubtarget<GCNSubtarget>();
246
247 auto MMO = MF.getMachineMemOperand(
248 MPO, MachineMemOperand::MOStore, MemTy,
249 commonAlignment(ST.getStackAlignment(), LocMemOffset));
250 MIRBuilder.buildStore(ValVReg, Addr, *MMO);
251 }
252
// ArgInfo overload: extends the selected register according to VA unless the
// location info is FPExt, then forwards to the register-based overload.
253 void assignValueToAddress(const CallLowering::ArgInfo &Arg,
254 unsigned ValRegIndex, Register Addr, LLT MemTy,
255 const MachinePointerInfo &MPO,
256 const CCValAssign &VA) override {
257 Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
258 ? extendRegister(Arg.Regs[ValRegIndex], VA)
259 : Arg.Regs[ValRegIndex];
260 assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA);
261 }
262};
263} // anonymous namespace
264
// Constructor tail: passes the address of TLI to the CallLowering base class.
// NOTE(review): listing line 265 (the constructor declarator, presumably
// AMDGPUCallLowering::AMDGPUCallLowering taking a target-lowering reference
// named TLI) is absent from this extract -- restore it from upstream.
266 : CallLowering(&TLI) {
267}
268
269// FIXME: Compatibility shim
271 switch (MIOpc) {
272 case TargetOpcode::G_SEXT:
273 return ISD::SIGN_EXTEND;
274 case TargetOpcode::G_ZEXT:
275 return ISD::ZERO_EXTEND;
276 case TargetOpcode::G_ANYEXT:
277 return ISD::ANY_EXTEND;
278 default:
279 llvm_unreachable("not an extend opcode");
280 }
281}
282
// Report whether the return values described by Outs can be lowered for
// CallConv. Entry-function calling conventions always return true; otherwise
// the answer comes from checkReturn using the return assignment function for
// this calling convention.
// NOTE(review): listing lines 285 and 291 are absent from this extract --
// presumably the Outs parameter and the ArgLocs declaration. Restore them
// from upstream before building.
283bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
284 CallingConv::ID CallConv,
286 bool IsVarArg) const {
287 // For shaders. Vector types should be explicitly handled by CC.
288 if (AMDGPU::isEntryFunctionCC(CallConv))
289 return true;
290
292 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
293 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
294 MF.getFunction().getContext());
295
296 return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
297}
298
299/// Lower the return value for the already existing \p Ret. This assumes that
300/// \p B's insertion point is correct.
///
/// Returns true immediately when \p Val is null. Otherwise each split value
/// type is paired with its vreg, scalar integers are extended to the type
/// chosen by getTypeForExtReturn, and the pieces are assigned and copied out
/// through AMDGPUOutgoingValueHandler.
///
/// NOTE(review): listing lines 327 and 351 are absent from this extract --
/// presumably the setArgFlags calls that populate and later reset
/// RetInfo.Flags. Restore them from upstream before building.
301bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
302 const Value *Val, ArrayRef<Register> VRegs,
303 MachineInstrBuilder &Ret) const {
304 if (!Val)
305 return true;
306
307 auto &MF = B.getMF();
308 const auto &F = MF.getFunction();
309 const DataLayout &DL = MF.getDataLayout();
310 MachineRegisterInfo *MRI = B.getMRI();
311 LLVMContext &Ctx = F.getContext();
312
313 CallingConv::ID CC = F.getCallingConv();
314 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
315
316 SmallVector<EVT, 8> SplitEVTs;
317 ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
318 assert(VRegs.size() == SplitEVTs.size() &&
319 "For each split Type there should be exactly one VReg.");
320
321 SmallVector<ArgInfo, 8> SplitRetInfos;
322
323 for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
324 EVT VT = SplitEVTs[i];
325 Register Reg = VRegs[i];
326 ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx), 0);
328
// Scalar integers may need widening: the extension kind follows the
// sext/zext flag (any-extend otherwise) and the target picks the width.
329 if (VT.isScalarInteger()) {
330 unsigned ExtendOp = TargetOpcode::G_ANYEXT;
331 if (RetInfo.Flags[0].isSExt()) {
332 assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
333 ExtendOp = TargetOpcode::G_SEXT;
334 } else if (RetInfo.Flags[0].isZExt()) {
335 assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
336 ExtendOp = TargetOpcode::G_ZEXT;
337 }
338
339 EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
340 extOpcodeToISDExtOpcode(ExtendOp));
341 if (ExtVT != VT) {
342 RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
343 LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
344 Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
345 }
346 }
347
348 if (Reg != RetInfo.Regs[0]) {
349 RetInfo.Regs[0] = Reg;
350 // Reset the arg flags after modifying Reg.
352 }
353
354 splitToValueTypes(RetInfo, SplitRetInfos, DL, CC);
355 }
356
357 CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
358
359 OutgoingValueAssigner Assigner(AssignFn);
360 AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
361 return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
362 CC, F.isVarArg());
363}
364
// Lower a return. Kernels, and shaders that return void, end the wave with
// S_ENDPGM. Otherwise a return instruction is built without inserting it
// (SI_RETURN_TO_EPILOG for shaders, SI_RETURN for other functions), the value
// is lowered either through sret stores or lowerReturnVal, and the return
// instruction is inserted last.
// NOTE(review): listing lines 365 and 370 are absent from this extract --
// presumably the declarator (AMDGPUCallLowering::lowerReturn with the builder
// B and the returned Value *Val) and the declaration of MFI. Restore them
// from upstream before building.
366 ArrayRef<Register> VRegs,
367 FunctionLoweringInfo &FLI) const {
368
369 MachineFunction &MF = B.getMF();
371 MFI->setIfReturnsVoid(!Val);
372
373 assert(!Val == VRegs.empty() && "Return value without a vreg");
374
375 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
376 const bool IsShader = AMDGPU::isShader(CC);
377 const bool IsWaveEnd =
378 (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
379 if (IsWaveEnd) {
380 B.buildInstr(AMDGPU::S_ENDPGM)
381 .addImm(0);
382 return true;
383 }
384
385 unsigned ReturnOpc =
386 IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN;
// Built detached so the value copies emitted below are placed before it.
387 auto Ret = B.buildInstrNoInsert(ReturnOpc);
388
389 if (!FLI.CanLowerReturn)
390 insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
391 else if (!lowerReturnVal(B, Val, VRegs, Ret))
392 return false;
393
394 // TODO: Handle CalleeSavedRegsViaCopy.
395
396 B.insertInstr(Ret);
397 return true;
398}
399
// Define DstReg as the kernarg segment pointer plus a 64-bit constant Offset.
// The base pointer is the live-in virtual register already associated with
// the kernarg segment pointer physical register.
// NOTE(review): listing lines 403, 404 and 406 are absent from this extract --
// presumably the declarations of MRI and the machine-function info, and the
// initializer of KernArgSegmentPtr. Restore them from upstream before
// building.
400void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
401 uint64_t Offset) const {
402 MachineFunction &MF = B.getMF();
405 Register KernArgSegmentPtr =
407 Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
408
409 auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);
410
411 B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
412}
413
// Load one kernel argument from the kernarg segment. OrigArg is split into
// its value types; each piece gets a pointer at Offset plus its field offset
// and is loaded into its single register with alignment
// commonAlignment(Alignment, field offset).
// NOTE(review): listing lines 415, 420, 422, 442, 444 and 445 are absent from
// this extract -- presumably the Offset parameter, the PtrInfo and PtrTy
// declarations, and the start of the getMachineMemOperand call including its
// flags. Restore them from upstream before building.
414void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
416 Align Alignment) const {
417 MachineFunction &MF = B.getMF();
418 const Function &F = MF.getFunction();
419 const DataLayout &DL = F.getDataLayout();
421
423
424 SmallVector<ArgInfo, 32> SplitArgs;
425 SmallVector<uint64_t> FieldOffsets;
426 splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv(), &FieldOffsets);
427
428 unsigned Idx = 0;
429 for (ArgInfo &SplitArg : SplitArgs) {
430 Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
431 lowerParameterPtr(PtrReg, B, Offset + FieldOffsets[Idx]);
432
433 LLT ArgTy = getLLTForType(*SplitArg.Ty, DL);
434 if (SplitArg.Flags[0].isPointer()) {
435 // Compensate for losing pointeriness in splitToValueTypes.
436 LLT PtrTy = LLT::pointer(SplitArg.Flags[0].getPointerAddrSpace(),
437 ArgTy.getScalarSizeInBits());
438 ArgTy = ArgTy.isVector() ? LLT::vector(ArgTy.getElementCount(), PtrTy)
439 : PtrTy;
440 }
441
443 PtrInfo,
446 ArgTy, commonAlignment(Alignment, FieldOffsets[Idx]));
447
448 assert(SplitArg.Regs.size() == 1);
449
450 B.buildLoad(SplitArg.Regs[0], PtrReg, *MMO);
451 ++Idx;
452 }
453}
454
455// Allocate special inputs passed in user SGPRs.
// For each input that UserSGPRInfo reports as present, the register is added
// to Info, marked live-in and allocated in CCInfo. The kernarg segment
// pointer is additionally copied into a fresh virtual register.
// NOTE(review): listing lines 457, 477, 484 and 486 are absent from this
// extract -- presumably the MachineIRBuilder parameter B, the second half of
// the queue-pointer condition (which uses M), and the MRI and P4 declarations
// used in the kernarg block. Restore them from upstream before building.
456static void allocateHSAUserSGPRs(CCState &CCInfo,
458 MachineFunction &MF,
459 const SIRegisterInfo &TRI,
460 SIMachineFunctionInfo &Info) {
461 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
462 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
463 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
464 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
465 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
466 CCInfo.AllocateReg(PrivateSegmentBufferReg);
467 }
468
469 if (UserSGPRInfo.hasDispatchPtr()) {
470 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
471 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
472 CCInfo.AllocateReg(DispatchPtrReg);
473 }
474
475 const Module *M = MF.getFunction().getParent();
476 if (UserSGPRInfo.hasQueuePtr() &&
478 Register QueuePtrReg = Info.addQueuePtr(TRI);
479 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
480 CCInfo.AllocateReg(QueuePtrReg);
481 }
482
483 if (UserSGPRInfo.hasKernargSegmentPtr()) {
485 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
487 Register VReg = MRI.createGenericVirtualRegister(P4);
// Register the (physreg, vreg) live-in pair so later code can look the
// virtual register up, and copy the incoming value into it.
488 MRI.addLiveIn(InputPtrReg, VReg);
489 B.getMBB().addLiveIn(InputPtrReg);
490 B.buildCopy(VReg, InputPtrReg);
491 CCInfo.AllocateReg(InputPtrReg);
492 }
493
494 if (UserSGPRInfo.hasDispatchID()) {
495 Register DispatchIDReg = Info.addDispatchID(TRI);
496 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
497 CCInfo.AllocateReg(DispatchIDReg);
498 }
499
500 if (UserSGPRInfo.hasFlatScratchInit()) {
501 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
502 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
503 CCInfo.AllocateReg(FlatScratchInitReg);
504 }
505
506 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
507 // these from the dispatch pointer.
508}
509
// Lower the formal arguments of a kernel. After the HSA user SGPRs are
// allocated, each argument with non-zero alloc size is given an offset in the
// kernarg segment. Used arguments are then either materialized as a pointer
// into the segment (byref) or loaded from it (everything else).
// NOTE(review): listing lines 510, 515, 516 and 521 are absent from this
// extract -- presumably the declarator
// (AMDGPUCallLowering::lowerFormalArgumentsKernel) and the MRI, Info and
// ArgLocs declarations. Restore them from upstream before building.
511 MachineIRBuilder &B, const Function &F,
512 ArrayRef<ArrayRef<Register>> VRegs) const {
513 MachineFunction &MF = B.getMF();
514 const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
517 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
518 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
519 const DataLayout &DL = F.getDataLayout();
520
522 CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
523
524 allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);
525
526 unsigned i = 0;
527 const Align KernArgBaseAlign(16);
528 const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
529 uint64_t ExplicitArgOffset = 0;
530
531 // TODO: Align down to dword alignment and extract bits for extending loads.
532 for (auto &Arg : F.args()) {
533 const bool IsByRef = Arg.hasByRefAttr();
534 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
535 unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
// NOTE(review): zero-sized arguments are skipped without incrementing i,
// unlike the unused-argument case below -- confirm VRegs indexing agrees.
536 if (AllocSize == 0)
537 continue;
538
539 MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
540 Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
541
// ArgOffset is this argument's aligned position including BaseOffset;
// ExplicitArgOffset advances past it for the next argument.
542 uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
543 ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
544
// Unused arguments still consume kernarg space (offsets were updated above)
// but nothing is emitted for them.
545 if (Arg.use_empty()) {
546 ++i;
547 continue;
548 }
549
550 Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
551
552 if (IsByRef) {
553 unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();
554
555 assert(VRegs[i].size() == 1 &&
556 "expected only one register for byval pointers");
557 if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
558 lowerParameterPtr(VRegs[i][0], B, ArgOffset);
559 } else {
// The kernarg pointer is a 64-bit constant-address-space pointer; cast it
// to the address space the byref argument was declared with.
560 const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
561 Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
562 lowerParameterPtr(PtrReg, B, ArgOffset);
563
564 B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
565 }
566 } else {
567 ArgInfo OrigArg(VRegs[i], Arg, i);
568 const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex;
569 setArgFlags(OrigArg, OrigArgIdx, DL, F);
570 lowerParameter(B, OrigArg, ArgOffset, Alignment);
571 }
572
573 ++i;
574 }
575
576 TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
577 TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
578 return true;
579}
580
// Lower the formal arguments of a function. One calling-convention case
// (condition missing from this extract) is delegated to
// lowerFormalArgumentsKernel. Otherwise the arguments are split into value
// types, special inputs are allocated, and the pieces are assigned and copied
// in through FormalArgHandler. Returns false for swiftself, swifterror and
// nest arguments, and when assignment or handling fails.
// NOTE(review): listing lines 581, 582, 589, 597, 598, 603 and 627 are absent
// from this extract -- presumably the declarator
// (AMDGPUCallLowering::lowerFormalArguments with B, F and VRegs), the
// condition guarding the kernel path, the MRI, Info and ArgLocs declarations,
// and the statement controlled by `if (!FLI.CanLowerReturn)`. Restore them
// from upstream before building.
583 FunctionLoweringInfo &FLI) const {
584 CallingConv::ID CC = F.getCallingConv();
585
586 // The infrastructure for normal calling convention lowering is essentially
587 // useless for kernels. We want to avoid any kind of legalization or argument
588 // splitting.
590 return lowerFormalArgumentsKernel(B, F, VRegs);
591
592 const bool IsGraphics = AMDGPU::isGraphics(CC);
593 const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
594
595 MachineFunction &MF = B.getMF();
596 MachineBasicBlock &MBB = B.getMBB();
599 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
600 const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
601 const DataLayout &DL = F.getDataLayout();
602
604 CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
605 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
606
607 if (UserSGPRInfo.hasImplicitBufferPtr()) {
608 Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
609 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
610 CCInfo.AllocateReg(ImplicitBufferPtrReg);
611 }
612
613 // FIXME: This probably isn't defined for mesa
614 if (UserSGPRInfo.hasFlatScratchInit() && !Subtarget.isAmdPalOS()) {
615 Register FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
616 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
617 CCInfo.AllocateReg(FlatScratchInitReg);
618 }
619
620 SmallVector<ArgInfo, 32> SplitArgs;
621 unsigned Idx = 0;
622 unsigned PSInputNum = 0;
623
624 // Insert the hidden sret parameter if the return value won't fit in the
625 // return registers.
626 if (!FLI.CanLowerReturn)
628
629 for (auto &Arg : F.args()) {
630 if (DL.getTypeStoreSize(Arg.getType()) == 0)
631 continue;
632
633 const bool InReg = Arg.hasAttribute(Attribute::InReg);
634
// These argument attributes are not supported here; returning false reports
// that the arguments were not lowered.
635 if (Arg.hasAttribute(Attribute::SwiftSelf) ||
636 Arg.hasAttribute(Attribute::SwiftError) ||
637 Arg.hasAttribute(Attribute::Nest))
638 return false;
639
// Pixel-shader inputs: the first 16 non-inreg arguments are tracked as PS
// inputs. An unused input that is not already allocated is skipped and its
// vregs are defined as undef.
640 if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
641 const bool ArgUsed = !Arg.use_empty();
642 bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);
643
644 if (!SkipArg) {
645 Info->markPSInputAllocated(PSInputNum);
646 if (ArgUsed)
647 Info->markPSInputEnabled(PSInputNum);
648 }
649
650 ++PSInputNum;
651
652 if (SkipArg) {
653 for (Register R : VRegs[Idx])
654 B.buildUndef(R);
655
656 ++Idx;
657 continue;
658 }
659 }
660
661 ArgInfo OrigArg(VRegs[Idx], Arg, Idx);
662 const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
663 setArgFlags(OrigArg, OrigArgIdx, DL, F);
664
665 splitToValueTypes(OrigArg, SplitArgs, DL, CC);
666 ++Idx;
667 }
668
669 // At least one interpolation mode must be enabled or else the GPU will
670 // hang.
671 //
672 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
673 // set PSInputAddr, the user wants to enable some bits after the compilation
674 // based on run-time states. Since we can't know what the final PSInputEna
675 // will look like, so we shouldn't do anything here and the user should take
676 // responsibility for the correct programming.
677 //
678 // Otherwise, the following restrictions apply:
679 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
680 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
681 // enabled too.
682 if (CC == CallingConv::AMDGPU_PS) {
683 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
684 ((Info->getPSInputAddr() & 0xF) == 0 &&
685 Info->isPSInputAllocated(11))) {
686 CCInfo.AllocateReg(AMDGPU::VGPR0);
687 CCInfo.AllocateReg(AMDGPU::VGPR1);
688 Info->markPSInputAllocated(0);
689 Info->markPSInputEnabled(0);
690 }
691
692 if (Subtarget.isAmdPalOS()) {
693 // For isAmdPalOS, the user does not enable some bits after compilation
694 // based on run-time states; the register values being generated here are
695 // the final ones set in hardware. Therefore we need to apply the
696 // workaround to PSInputAddr and PSInputEnable together. (The case where
697 // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
698 // set up an input arg for a particular interpolation mode, but nothing
699 // uses that input arg. Really we should have an earlier pass that removes
700 // such an arg.)
701 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
702 if ((PsInputBits & 0x7F) == 0 ||
703 ((PsInputBits & 0xF) == 0 &&
704 (PsInputBits >> 11 & 1)))
705 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
706 }
707 }
708
709 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
710 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());
711
// Emit argument copies ahead of anything already in the block.
712 if (!MBB.empty())
713 B.setInstr(*MBB.begin());
714
715 if (!IsEntryFunc && !IsGraphics) {
716 // For the fixed ABI, pass workitem IDs in the last argument register.
717 TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
718
719 if (!Subtarget.enableFlatScratch())
720 CCInfo.AllocateReg(Info->getScratchRSrcReg());
721 TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
722 }
723
724 IncomingValueAssigner Assigner(AssignFn);
725 if (!determineAssignments(Assigner, SplitArgs, CCInfo))
726 return false;
727
728 FormalArgHandler Handler(B, MRI);
729 if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
730 return false;
731
732 uint64_t StackSize = Assigner.StackSize;
733
734 // Start adding system SGPRs.
735 if (IsEntryFunc)
736 TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
737
738 // When we tail call, we need to check if the callee's arguments will fit on
739 // the caller's stack. So, whenever we lower formal arguments, we should keep
740 // track of this information, since we might lower a tail call in this
741 // function later.
742 Info->setBytesInStackArgArea(StackSize);
743
744 // Move back to the end of the basic block.
745 B.setMBB(MBB);
746
747 return true;
748}
749
// Set up the implicit ("special") inputs for a call. For every entry of
// InputRegs whose matching attribute is not on the call site, the value is
// materialized in the caller and the pair (callee physreg, vreg) is appended
// to ArgRegs. The workitem IDs are then combined into a single s32 value,
// with Y shifted left by 10 and Z by 20. Returns true when there is no call
// site (Info.CB is null). Returns false when an outgoing location is not a
// register or no workitem-ID outgoing argument is found.
// NOTE(review): listing lines 750, 762, 764, 771-779, 793, 828, 857, 860, 863
// and 929 are absent from this extract -- presumably the declarator
// (AMDGPUCallLowering::passSpecialInputs taking MIRBuilder), the initializer
// of CalleeArgInfo, the MFI declaration, the InputRegs table, the MRI
// declaration, the LDS kernel id lookup, the three getPreloadedValue calls
// for the callee's workitem IDs and the construction of IncomingArg. Restore
// them from upstream before building.
751 CCState &CCInfo,
752 SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
753 CallLoweringInfo &Info) const {
754 MachineFunction &MF = MIRBuilder.getMF();
755
756 // If there's no call site, this doesn't correspond to a call from the IR and
757 // doesn't need implicit inputs.
758 if (!Info.CB)
759 return true;
760
761 const AMDGPUFunctionArgInfo *CalleeArgInfo
763
765 const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();
766
767
768 // TODO: Unify with private memory register handling. This is complicated by
769 // the fact that at least in kernels, the input argument is not necessarily
770 // in the same location as the input.
780 };
781
// Indexed in lockstep with InputRegs by the counter I in the loop below.
782 static constexpr StringLiteral ImplicitAttrNames[] = {
783 "amdgpu-no-dispatch-ptr",
784 "amdgpu-no-queue-ptr",
785 "amdgpu-no-implicitarg-ptr",
786 "amdgpu-no-dispatch-id",
787 "amdgpu-no-workgroup-id-x",
788 "amdgpu-no-workgroup-id-y",
789 "amdgpu-no-workgroup-id-z",
790 "amdgpu-no-lds-kernel-id",
791 };
792
794
795 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
796 const AMDGPULegalizerInfo *LI
797 = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());
798
799 unsigned I = 0;
800 for (auto InputID : InputRegs) {
801 const ArgDescriptor *OutgoingArg;
802 const TargetRegisterClass *ArgRC;
803 LLT ArgTy;
804
805 // If the callee does not use the attribute value, skip copying the value.
806 if (Info.CB->hasFnAttr(ImplicitAttrNames[I++]))
807 continue;
808
809 std::tie(OutgoingArg, ArgRC, ArgTy) =
810 CalleeArgInfo->getPreloadedValue(InputID);
811 if (!OutgoingArg)
812 continue;
813
814 const ArgDescriptor *IncomingArg;
815 const TargetRegisterClass *IncomingArgRC;
816 std::tie(IncomingArg, IncomingArgRC, ArgTy) =
817 CallerArgInfo.getPreloadedValue(InputID);
818 assert(IncomingArgRC == ArgRC);
819
820 Register InputReg = MRI.createGenericVirtualRegister(ArgTy);
821
// Source of the value: the caller's own incoming argument if it has one,
// otherwise a computed implicit-arg pointer or LDS kernel id, otherwise
// undef.
822 if (IncomingArg) {
823 LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
824 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
825 LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
826 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
827 std::optional<uint32_t> Id =
829 if (Id) {
830 MIRBuilder.buildConstant(InputReg, *Id);
831 } else {
832 MIRBuilder.buildUndef(InputReg);
833 }
834 } else {
835 // We may have proven the input wasn't needed, although the ABI is
836 // requiring it. We just need to allocate the register appropriately.
837 MIRBuilder.buildUndef(InputReg);
838 }
839
840 if (OutgoingArg->isRegister()) {
841 ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
842 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
843 report_fatal_error("failed to allocate implicit input argument");
844 } else {
845 LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
846 return false;
847 }
848 }
849
850 // Pack workitem IDs into a single register or pass it as is if already
851 // packed.
852 const ArgDescriptor *OutgoingArg;
853 const TargetRegisterClass *ArgRC;
854 LLT ArgTy;
855
856 std::tie(OutgoingArg, ArgRC, ArgTy) =
858 if (!OutgoingArg)
859 std::tie(OutgoingArg, ArgRC, ArgTy) =
861 if (!OutgoingArg)
862 std::tie(OutgoingArg, ArgRC, ArgTy) =
864 if (!OutgoingArg)
865 return false;
866
867 auto WorkitemIDX =
868 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
869 auto WorkitemIDY =
870 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
871 auto WorkitemIDZ =
872 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
873
874 const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
875 const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
876 const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
877 const LLT S32 = LLT::scalar(32);
878
879 const bool NeedWorkItemIDX = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-x");
880 const bool NeedWorkItemIDY = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-y");
881 const bool NeedWorkItemIDZ = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-z");
882
883 // If incoming ids are not packed we need to pack them.
884 // FIXME: Should consider known workgroup size to eliminate known 0 cases.
885 Register InputReg;
// X occupies the low bits unshifted; a known maximum ID of 0 becomes the
// constant 0.
886 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
887 NeedWorkItemIDX) {
888 if (ST.getMaxWorkitemID(MF.getFunction(), 0) != 0) {
889 InputReg = MRI.createGenericVirtualRegister(S32);
890 LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
891 std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
892 } else {
893 InputReg = MIRBuilder.buildConstant(S32, 0).getReg(0);
894 }
895 }
896
// Y is shifted left by 10 and OR-ed into the packed value.
897 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
898 NeedWorkItemIDY && ST.getMaxWorkitemID(MF.getFunction(), 1) != 0) {
899 Register Y = MRI.createGenericVirtualRegister(S32);
900 LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
901 std::get<2>(WorkitemIDY));
902
903 Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
904 InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
905 }
906
// Z is shifted left by 20 and OR-ed into the packed value.
907 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
908 NeedWorkItemIDZ && ST.getMaxWorkitemID(MF.getFunction(), 2) != 0) {
909 Register Z = MRI.createGenericVirtualRegister(S32);
910 LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
911 std::get<2>(WorkitemIDZ));
912
913 Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
914 InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
915 }
916
917 if (!InputReg &&
918 (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
919 InputReg = MRI.createGenericVirtualRegister(S32);
920 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
921 // We're in a situation where the outgoing function requires the workitem
922 // ID, but the calling function does not have it (e.g. a graphics function
923 // calling a C calling convention function). This is illegal, but we need
924 // to produce something.
925 MIRBuilder.buildUndef(InputReg);
926 } else {
927 // Workitem ids are already packed, any of present incoming arguments will
928 // carry all required fields.
930 IncomingArgX ? *IncomingArgX :
931 IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
932 LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
933 &AMDGPU::VGPR_32RegClass, S32);
934 }
935 }
936
// The outgoing register is allocated even when no value was produced for it.
937 if (OutgoingArg->isRegister()) {
938 if (InputReg)
939 ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
940
941 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
942 report_fatal_error("failed to allocate implicit input argument");
943 } else {
944 LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
945 return false;
946 }
947
948 return true;
949}
950
951/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
952/// CC.
953static std::pair<CCAssignFn *, CCAssignFn *>
955 return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
956}
957
// Choose the opcode for a call. Non-tail calls use G_SI_CALL. Tail calls use
// SI_CS_CHAIN_TC_W32/W64 (picked by isWave32) under a condition missing from
// this extract, and otherwise SI_TCRETURN_GFX for AMDGPU_Gfx or SI_TCRETURN.
// NOTE(review): listing lines 960 and 968 are absent from this extract --
// presumably the CC parameter and the `if` guarding the chain-call return
// (likely a chain calling-convention test -- confirm against upstream).
// Without that guard the final return is unreachable as written.
958static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
959 bool IsTailCall, bool isWave32,
961 // For calls to amdgpu_cs_chain functions, the address is known to be uniform.
962 assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) &&
963 "Indirect calls can't be tail calls, "
964 "because the address can be divergent");
965 if (!IsTailCall)
966 return AMDGPU::G_SI_CALL;
967
969 return isWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
970
971 return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
972 AMDGPU::SI_TCRETURN;
973}
974
975// Add operands to call instruction to track the callee.
977 MachineIRBuilder &MIRBuilder,
979 if (Info.Callee.isReg()) {
980 CallInst.addReg(Info.Callee.getReg());
981 CallInst.addImm(0);
982 } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
983 // The call lowering lightly assumed we can directly encode a call target in
984 // the instruction, which is not the case. Materialize the address here.
985 const GlobalValue *GV = Info.Callee.getGlobal();
986 auto Ptr = MIRBuilder.buildGlobalValue(
987 LLT::pointer(GV->getAddressSpace(), 64), GV);
988 CallInst.addReg(Ptr.getReg(0));
989 CallInst.add(Info.Callee);
990 } else
991 return false;
992
993 return true;
994}
995
998 SmallVectorImpl<ArgInfo> &InArgs) const {
999 const Function &CallerF = MF.getFunction();
1000 CallingConv::ID CalleeCC = Info.CallConv;
1001 CallingConv::ID CallerCC = CallerF.getCallingConv();
1002
1003 // If the calling conventions match, then everything must be the same.
1004 if (CalleeCC == CallerCC)
1005 return true;
1006
1007 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1008
1009 // Make sure that the caller and callee preserve all of the same registers.
1010 auto TRI = ST.getRegisterInfo();
1011
1012 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1013 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
1014 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
1015 return false;
1016
1017 // Check if the caller and callee will handle arguments in the same way.
1018 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1019 CCAssignFn *CalleeAssignFnFixed;
1020 CCAssignFn *CalleeAssignFnVarArg;
1021 std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) =
1022 getAssignFnsForCC(CalleeCC, TLI);
1023
1024 CCAssignFn *CallerAssignFnFixed;
1025 CCAssignFn *CallerAssignFnVarArg;
1026 std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
1027 getAssignFnsForCC(CallerCC, TLI);
1028
1029 // FIXME: We are not accounting for potential differences in implicitly passed
1030 // inputs, but only the fixed ABI is supported now anyway.
1031 IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed,
1032 CalleeAssignFnVarArg);
1033 IncomingValueAssigner CallerAssigner(CallerAssignFnFixed,
1034 CallerAssignFnVarArg);
1035 return resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner);
1036}
1037
1040 SmallVectorImpl<ArgInfo> &OutArgs) const {
1041 // If there are no outgoing arguments, then we are done.
1042 if (OutArgs.empty())
1043 return true;
1044
1045 const Function &CallerF = MF.getFunction();
1046 CallingConv::ID CalleeCC = Info.CallConv;
1047 CallingConv::ID CallerCC = CallerF.getCallingConv();
1048 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1049
1050 CCAssignFn *AssignFnFixed;
1051 CCAssignFn *AssignFnVarArg;
1052 std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
1053
1054 // We have outgoing arguments. Make sure that we can tail call with them.
1056 CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
1057 OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1058
1059 if (!determineAssignments(Assigner, OutArgs, OutInfo)) {
1060 LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
1061 return false;
1062 }
1063
1064 // Make sure that they can fit on the caller's stack.
1065 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1066 if (OutInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) {
1067 LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
1068 return false;
1069 }
1070
1071 // Verify that the parameters in callee-saved registers match.
1072 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1073 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1074 const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
1076 return parametersInCSRMatch(MRI, CallerPreservedMask, OutLocs, OutArgs);
1077}
1078
1079/// Return true if the calling convention is one that we can guarantee TCO for.
1081 return CC == CallingConv::Fast;
1082}
1083
1084/// Return true if we might ever do TCO for calls with this calling convention.
1086 switch (CC) {
1087 case CallingConv::C:
1089 return true;
1090 default:
1091 return canGuaranteeTCO(CC);
1092 }
1093}
1094
1097 SmallVectorImpl<ArgInfo> &InArgs, SmallVectorImpl<ArgInfo> &OutArgs) const {
1098 // Must pass all target-independent checks in order to tail call optimize.
1099 if (!Info.IsTailCall)
1100 return false;
1101
1102 // Indirect calls can't be tail calls, because the address can be divergent.
1103 // TODO Check divergence info if the call really is divergent.
1104 if (Info.Callee.isReg())
1105 return false;
1106
1107 MachineFunction &MF = B.getMF();
1108 const Function &CallerF = MF.getFunction();
1109 CallingConv::ID CalleeCC = Info.CallConv;
1110 CallingConv::ID CallerCC = CallerF.getCallingConv();
1111
1112 const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1113 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1114 // Kernels aren't callable, and don't have a live in return address so it
1115 // doesn't make sense to do a tail call with entry functions.
1116 if (!CallerPreserved)
1117 return false;
1118
1119 if (!mayTailCallThisCC(CalleeCC)) {
1120 LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
1121 return false;
1122 }
1123
1124 if (any_of(CallerF.args(), [](const Argument &A) {
1125 return A.hasByValAttr() || A.hasSwiftErrorAttr();
1126 })) {
1127 LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval "
1128 "or swifterror arguments\n");
1129 return false;
1130 }
1131
1132 // If we have -tailcallopt, then we're done.
1134 return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv();
1135
1136 // Verify that the incoming and outgoing arguments from the callee are
1137 // safe to tail call.
1138 if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
1139 LLVM_DEBUG(
1140 dbgs()
1141 << "... Caller and callee have incompatible calling conventions.\n");
1142 return false;
1143 }
1144
1145 if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
1146 return false;
1147
1148 LLVM_DEBUG(dbgs() << "... Call is eligible for tail call optimization.\n");
1149 return true;
1150}
1151
1152// Insert outgoing implicit arguments for a call, by inserting copies to the
1153// implicit argument registers and adding the necessary implicit uses to the
1154// call instruction.
1157 const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
1158 CallingConv::ID CalleeCC,
1159 ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
1160 if (!ST.enableFlatScratch()) {
1161 // Insert copies for the SRD. In the HSA case, this should be an identity
1162 // copy.
1163 auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32),
1164 FuncInfo.getScratchRSrcReg());
1165
1166 auto CalleeRSrcReg = AMDGPU::isChainCC(CalleeCC)
1167 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
1168 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
1169
1170 MIRBuilder.buildCopy(CalleeRSrcReg, ScratchRSrcReg);
1171 CallInst.addReg(CalleeRSrcReg, RegState::Implicit);
1172 }
1173
1174 for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
1175 MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
1176 CallInst.addReg(ArgReg.first, RegState::Implicit);
1177 }
1178}
1179
1181 MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
1182 SmallVectorImpl<ArgInfo> &OutArgs) const {
1183 MachineFunction &MF = MIRBuilder.getMF();
1184 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1186 const Function &F = MF.getFunction();
1188 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1189
1190 // True when we're tail calling, but without -tailcallopt.
1191 bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
1192
1193 // Find out which ABI gets to decide where things go.
1194 CallingConv::ID CalleeCC = Info.CallConv;
1195 CCAssignFn *AssignFnFixed;
1196 CCAssignFn *AssignFnVarArg;
1197 std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
1198
1199 MachineInstrBuilder CallSeqStart;
1200 if (!IsSibCall)
1201 CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);
1202
1203 unsigned Opc =
1204 getCallOpcode(MF, Info.Callee.isReg(), true, ST.isWave32(), CalleeCC);
1205 auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
1206 if (!addCallTargetOperands(MIB, MIRBuilder, Info))
1207 return false;
1208
1209 // Byte offset for the tail call. When we are sibcalling, this will always
1210 // be 0.
1211 MIB.addImm(0);
1212
1213 // If this is a chain call, we need to pass in the EXEC mask.
1214 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1215 if (AMDGPU::isChainCC(Info.CallConv)) {
1216 ArgInfo ExecArg = Info.OrigArgs[1];
1217 assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");
1218
1219 if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize()))
1220 return false;
1221
1222 if (auto CI = dyn_cast<ConstantInt>(ExecArg.OrigValue)) {
1223 MIB.addImm(CI->getSExtValue());
1224 } else {
1225 MIB.addReg(ExecArg.Regs[0]);
1226 unsigned Idx = MIB->getNumOperands() - 1;
1227 MIB->getOperand(Idx).setReg(constrainOperandRegClass(
1228 MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
1229 MIB->getDesc(), MIB->getOperand(Idx), Idx));
1230 }
1231 }
1232
1233 // Tell the call which registers are clobbered.
1234 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
1235 MIB.addRegMask(Mask);
1236
1237 // FPDiff is the byte offset of the call's argument area from the callee's.
1238 // Stores to callee stack arguments will be placed in FixedStackSlots offset
1239 // by this amount for a tail call. In a sibling call it must be 0 because the
1240 // caller will deallocate the entire stack and the callee still expects its
1241 // arguments to begin at SP+0.
1242 int FPDiff = 0;
1243
1244 // This will be 0 for sibcalls, potentially nonzero for tail calls produced
1245 // by -tailcallopt. For sibcalls, the memory operands for the call are
1246 // already available in the caller's incoming argument space.
1247 unsigned NumBytes = 0;
1248 if (!IsSibCall) {
1249 // We aren't sibcalling, so we need to compute FPDiff. We need to do this
1250 // before handling assignments, because FPDiff must be known for memory
1251 // arguments.
1252 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
1254 CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
1255
1256 // FIXME: Not accounting for callee implicit inputs
1257 OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg);
1258 if (!determineAssignments(CalleeAssigner, OutArgs, OutInfo))
1259 return false;
1260
1261 // The callee will pop the argument stack as a tail call. Thus, we must
1262 // keep it 16-byte aligned.
1263 NumBytes = alignTo(OutInfo.getStackSize(), ST.getStackAlignment());
1264
1265 // FPDiff will be negative if this tail call requires more space than we
1266 // would automatically have in our incoming argument space. Positive if we
1267 // actually shrink the stack.
1268 FPDiff = NumReusableBytes - NumBytes;
1269
1270 // The stack pointer must be 16-byte aligned at all times it's used for a
1271 // memory operation, which in practice means at *all* times and in
1272 // particular across call boundaries. Therefore our own arguments started at
1273 // a 16-byte aligned SP and the delta applied for the tail call should
1274 // satisfy the same constraint.
1275 assert(isAligned(ST.getStackAlignment(), FPDiff) &&
1276 "unaligned stack on tail call");
1277 }
1278
1280 CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1281
1282 // We could pass MIB and directly add the implicit uses to the call
1283 // now. However, as an aesthetic choice, place implicit argument operands
1284 // after the ordinary user argument registers.
1286
1287 if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
1288 !AMDGPU::isChainCC(Info.CallConv)) {
1289 // With a fixed ABI, allocate fixed registers before user arguments.
1290 if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
1291 return false;
1292 }
1293
1294 OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1295
1296 if (!determineAssignments(Assigner, OutArgs, CCInfo))
1297 return false;
1298
1299 // Do the actual argument marshalling.
1300 AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, true, FPDiff);
1301 if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
1302 return false;
1303
1304 if (Info.ConvergenceCtrlToken) {
1305 MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
1306 }
1307 handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC,
1308 ImplicitArgRegs);
1309
1310 // If we have -tailcallopt, we need to adjust the stack. We'll do the call
1311 // sequence start and end here.
1312 if (!IsSibCall) {
1313 MIB->getOperand(1).setImm(FPDiff);
1314 CallSeqStart.addImm(NumBytes).addImm(0);
1315 // End the call sequence *before* emitting the call. Normally, we would
1316 // tidy the frame up after the call. However, here, we've laid out the
1317 // parameters so that when SP is reset, they will be in the correct
1318 // location.
1319 MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(0);
1320 }
1321
1322 // Now we can add the actual call instruction to the correct basic block.
1323 MIRBuilder.insertInstr(MIB);
1324
1325 // If Callee is a reg, since it is used by a target specific
1326 // instruction, it must have a register class matching the
1327 // constraint of that instruction.
1328
1329 // FIXME: We should define regbankselectable call instructions to handle
1330 // divergent call targets.
1331 if (MIB->getOperand(0).isReg()) {
1333 MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
1334 MIB->getDesc(), MIB->getOperand(0), 0));
1335 }
1336
1338 Info.LoweredTailCall = true;
1339 return true;
1340}
1341
1342/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
1344 CallLoweringInfo &Info) const {
1345 ArgInfo Callee = Info.OrigArgs[0];
1346 ArgInfo SGPRArgs = Info.OrigArgs[2];
1347 ArgInfo VGPRArgs = Info.OrigArgs[3];
1348 ArgInfo Flags = Info.OrigArgs[4];
1349
1350 assert(cast<ConstantInt>(Flags.OrigValue)->isZero() &&
1351 "Non-zero flags aren't supported yet.");
1352 assert(Info.OrigArgs.size() == 5 && "Additional args aren't supported yet.");
1353
1354 MachineFunction &MF = MIRBuilder.getMF();
1355 const Function &F = MF.getFunction();
1356 const DataLayout &DL = F.getDataLayout();
1357
1358 // The function to jump to is actually the first argument, so we'll change the
1359 // Callee and other info to match that before using our existing helper.
1360 const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
1361 if (const Function *F = dyn_cast<Function>(CalleeV)) {
1362 Info.Callee = MachineOperand::CreateGA(F, 0);
1363 Info.CallConv = F->getCallingConv();
1364 } else {
1365 assert(Callee.Regs.size() == 1 && "Too many regs for the callee");
1366 Info.Callee = MachineOperand::CreateReg(Callee.Regs[0], false);
1367 Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
1368 // behaves the same here.
1369 }
1370
1371 // The function that we're calling cannot be vararg (only the intrinsic is).
1372 Info.IsVarArg = false;
1373
1374 assert(std::all_of(SGPRArgs.Flags.begin(), SGPRArgs.Flags.end(),
1375 [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
1376 "SGPR arguments should be marked inreg");
1377 assert(std::none_of(VGPRArgs.Flags.begin(), VGPRArgs.Flags.end(),
1378 [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
1379 "VGPR arguments should not be marked inreg");
1380
1382 splitToValueTypes(SGPRArgs, OutArgs, DL, Info.CallConv);
1383 splitToValueTypes(VGPRArgs, OutArgs, DL, Info.CallConv);
1384
1385 Info.IsMustTailCall = true;
1386 return lowerTailCall(MIRBuilder, Info, OutArgs);
1387}
1388
1390 CallLoweringInfo &Info) const {
1391 if (Function *F = Info.CB->getCalledFunction())
1392 if (F->isIntrinsic()) {
1393 assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
1394 "Unexpected intrinsic");
1395 return lowerChainCall(MIRBuilder, Info);
1396 }
1397
1398 if (Info.IsVarArg) {
1399 LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
1400 return false;
1401 }
1402
1403 MachineFunction &MF = MIRBuilder.getMF();
1404 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1405 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1406
1407 const Function &F = MF.getFunction();
1409 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1410 const DataLayout &DL = F.getDataLayout();
1411
1413 for (auto &OrigArg : Info.OrigArgs)
1414 splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);
1415
1417 if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
1418 splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);
1419
1420 // If we can lower as a tail call, do that instead.
1421 bool CanTailCallOpt =
1422 isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs);
1423
1424 // We must emit a tail call if we have musttail.
1425 if (Info.IsMustTailCall && !CanTailCallOpt) {
1426 LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
1427 return false;
1428 }
1429
1430 Info.IsTailCall = CanTailCallOpt;
1431 if (CanTailCallOpt)
1432 return lowerTailCall(MIRBuilder, Info, OutArgs);
1433
1434 // Find out which ABI gets to decide where things go.
1435 CCAssignFn *AssignFnFixed;
1436 CCAssignFn *AssignFnVarArg;
1437 std::tie(AssignFnFixed, AssignFnVarArg) =
1438 getAssignFnsForCC(Info.CallConv, TLI);
1439
1440 MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
1441 .addImm(0)
1442 .addImm(0);
1443
1444 // Create a temporarily-floating call instruction so we can add the implicit
1445 // uses of arg registers.
1446 unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, ST.isWave32(),
1447 Info.CallConv);
1448
1449 auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
1450 MIB.addDef(TRI->getReturnAddressReg(MF));
1451
1452 if (!Info.IsConvergent)
1454
1455 if (!addCallTargetOperands(MIB, MIRBuilder, Info))
1456 return false;
1457
1458 // Tell the call which registers are clobbered.
1459 const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
1460 MIB.addRegMask(Mask);
1461
1463 CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1464
1465 // We could pass MIB and directly add the implicit uses to the call
1466 // now. However, as an aesthetic choice, place implicit argument operands
1467 // after the ordinary user argument registers.
1469
1470 if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
1471 // With a fixed ABI, allocate fixed registers before user arguments.
1472 if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
1473 return false;
1474 }
1475
1476 // Do the actual argument marshalling.
1477 SmallVector<Register, 8> PhysRegs;
1478
1479 OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1480 if (!determineAssignments(Assigner, OutArgs, CCInfo))
1481 return false;
1482
1483 AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false);
1484 if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
1485 return false;
1486
1488
1489 if (Info.ConvergenceCtrlToken) {
1490 MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
1491 }
1492 handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv,
1493 ImplicitArgRegs);
1494
1495 // Get a count of how many bytes are to be pushed on the stack.
1496 unsigned NumBytes = CCInfo.getStackSize();
1497
1498 // If Callee is a reg, since it is used by a target specific
1499 // instruction, it must have a register class matching the
1500 // constraint of that instruction.
1501
1502 // FIXME: We should define regbankselectable call instructions to handle
1503 // divergent call targets.
1504 if (MIB->getOperand(1).isReg()) {
1505 MIB->getOperand(1).setReg(constrainOperandRegClass(
1506 MF, *TRI, MRI, *ST.getInstrInfo(),
1507 *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
1508 1));
1509 }
1510
1511 // Now we can add the actual call instruction to the correct position.
1512 MIRBuilder.insertInstr(MIB);
1513
1514 // Finally we can copy the returned value back into its virtual-register. In
1515 // symmetry with the arguments, the physical register must be an
1516 // implicit-define of the call instruction.
1517 if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
1518 CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
1519 Info.IsVarArg);
1520 IncomingValueAssigner Assigner(RetAssignFn);
1521 CallReturnHandler Handler(MIRBuilder, MRI, MIB);
1522 if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
1523 Info.CallConv, Info.IsVarArg))
1524 return false;
1525 }
1526
1527 uint64_t CalleePopBytes = NumBytes;
1528
1529 MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
1530 .addImm(0)
1531 .addImm(CalleePopBytes);
1532
1533 if (!Info.CanLowerReturn) {
1534 insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
1535 Info.DemoteRegister, Info.DemoteStackIndex);
1536 }
1537
1538 return true;
1539}
unsigned const MachineRegisterInfo * MRI
static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, bool IsTailCall, std::optional< CallLowering::PtrAuthInfo > &PAI, MachineRegisterInfo &MRI)
static std::pair< CCAssignFn *, CCAssignFn * > getAssignFnsForCC(CallingConv::ID CC, const AArch64TargetLowering &TLI)
Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for CC.
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static bool addCallTargetOperands(MachineInstrBuilder &CallInst, MachineIRBuilder &MIRBuilder, AMDGPUCallLowering::CallLoweringInfo &Info)
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc)
static void allocateHSAUserSGPRs(CCState &CCInfo, MachineIRBuilder &B, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
This file describes how to lower LLVM calls to machine code calls.
static const LLT S32
This file declares the targeting of the Machinelegalizer class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file declares the MachineIRBuilder class.
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, SmallVectorImpl< ArgInfo > &OutArgs) const
bool isEligibleForTailCallOptimization(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, SmallVectorImpl< ArgInfo > &InArgs, SmallVectorImpl< ArgInfo > &OutArgs) const
Returns true if the call can be lowered as a tail call.
bool lowerFormalArgumentsKernel(MachineIRBuilder &B, const Function &F, ArrayRef< ArrayRef< Register > > VRegs) const
bool lowerReturn(MachineIRBuilder &B, const Value *Val, ArrayRef< Register > VRegs, FunctionLoweringInfo &FLI) const override
This hook behaves as the extended lowerReturn function, but for targets that do not support swifterro...
void handleImplicitCallArguments(MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst, const GCNSubtarget &ST, const SIMachineFunctionInfo &MFI, CallingConv::ID CalleeCC, ArrayRef< std::pair< MCRegister, Register > > ImplicitArgRegs) const
bool areCalleeOutgoingArgsTailCallable(CallLoweringInfo &Info, MachineFunction &MF, SmallVectorImpl< ArgInfo > &OutArgs) const
bool lowerChainCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const
Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
bool passSpecialInputs(MachineIRBuilder &MIRBuilder, CCState &CCInfo, SmallVectorImpl< std::pair< MCRegister, Register > > &ArgRegs, CallLoweringInfo &Info) const
bool lowerFormalArguments(MachineIRBuilder &B, const Function &F, ArrayRef< ArrayRef< Register > > VRegs, FunctionLoweringInfo &FLI) const override
This hook must be implemented to lower the incoming (formal) arguments, described by VRegs,...
bool lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const override
This hook must be implemented to lower the given call instruction, including argument and return valu...
bool doCallerAndCalleePassArgsTheSameWay(CallLoweringInfo &Info, MachineFunction &MF, SmallVectorImpl< ArgInfo > &InArgs) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
CCState - This class holds information needed while lowering arguments and return values.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
This class represents a function call, abstracting a target machine's calling convention.
bool handleAssignments(ValueHandler &Handler, SmallVectorImpl< ArgInfo > &Args, CCState &CCState, SmallVectorImpl< CCValAssign > &ArgLocs, MachineIRBuilder &MIRBuilder, ArrayRef< Register > ThisReturnRegs=std::nullopt) const
Use Handler to insert code to handle the argument/return values represented by Args.
void insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy, ArrayRef< Register > VRegs, Register DemoteReg, int FI) const
Load the returned value from the stack into virtual registers in VRegs.
bool determineAndHandleAssignments(ValueHandler &Handler, ValueAssigner &Assigner, SmallVectorImpl< ArgInfo > &Args, MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, bool IsVarArg, ArrayRef< Register > ThisReturnRegs=std::nullopt) const
Invoke ValueAssigner::assignArg on each of the given Args and then use Handler to move them to the as...
bool resultsCompatible(CallLoweringInfo &Info, MachineFunction &MF, SmallVectorImpl< ArgInfo > &InArgs, ValueAssigner &CalleeAssigner, ValueAssigner &CallerAssigner) const
void splitToValueTypes(const ArgInfo &OrigArgInfo, SmallVectorImpl< ArgInfo > &SplitArgs, const DataLayout &DL, CallingConv::ID CallConv, SmallVectorImpl< uint64_t > *Offsets=nullptr) const
Break OrigArgInfo into one or more pieces the calling convention can process, returned in SplitArgs.
void insertSRetIncomingArgument(const Function &F, SmallVectorImpl< ArgInfo > &SplitArgs, Register &DemoteReg, MachineRegisterInfo &MRI, const DataLayout &DL) const
Insert the hidden sret ArgInfo to the beginning of SplitArgs.
void insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy, ArrayRef< Register > VRegs, Register DemoteReg) const
Store the return value given by VRegs into stack starting at the offset specified in DemoteReg.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ArgInfo > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
bool determineAssignments(ValueAssigner &Assigner, SmallVectorImpl< ArgInfo > &Args, CCState &CCInfo) const
Analyze the argument list in Args, using Assigner to populate CCInfo.
bool checkReturn(CCState &CCInfo, SmallVectorImpl< BaseArgInfo > &Outs, CCAssignFn *Fn) const
void setArgFlags(ArgInfo &Arg, unsigned OpIdx, const DataLayout &DL, const FuncInfoTy &FuncInfo) const
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
bool CanLowerReturn
CanLowerReturn - true iff the function's return value can be lowered to registers.
iterator_range< arg_iterator > args()
Definition: Function.h:890
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:281
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:380
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:278
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
Definition: LowLevelType.h:64
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
constexpr bool isVector() const
Definition: LowLevelType.h:148
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
constexpr bool isPointer() const
Definition: LowLevelType.h:149
constexpr ElementCount getElementCount() const
Definition: LowLevelType.h:184
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
void setHasTailCall(bool V=true)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual register for it.
Helper class to build MachineInstr.
MachineInstrBuilder insertInstr(MachineInstrBuilder MIB)
Insert an existing instruction at the insertion point.
MachineInstrBuilder buildGlobalValue(const DstOp &Res, const GlobalValue *GV)
Build and insert Res = G_GLOBAL_VALUE GV.
MachineInstrBuilder buildUndef(const DstOp &Res)
Build and insert Res = IMPLICIT_DEF.
MachineInstrBuilder buildPtrAdd(const DstOp &Res, const SrcOp &Op0, const SrcOp &Op1, std::optional< unsigned > Flags=std::nullopt)
Build and insert Res = G_PTR_ADD Op0, Op1.
MachineInstrBuilder buildShl(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1, std::optional< unsigned > Flags=std::nullopt)
MachineInstrBuilder buildStore(const SrcOp &Val, const SrcOp &Addr, MachineMemOperand &MMO)
Build and insert G_STORE Val, Addr, MMO.
MachineInstrBuilder buildInstr(unsigned Opcode)
Build and insert <empty> = Opcode <empty>.
MachineInstrBuilder buildFrameIndex(const DstOp &Res, int Idx)
Build and insert Res = G_FRAME_INDEX Idx.
MachineFunction & getMF()
Getter for the function we currently build.
MachineInstrBuilder buildAnyExt(const DstOp &Res, const SrcOp &Op)
Build and insert Res = G_ANYEXT Op0.
MachineInstrBuilder buildOr(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1, std::optional< unsigned > Flags=std::nullopt)
Build and insert Res = G_OR Op0, Op1.
MachineInstrBuilder buildInstrNoInsert(unsigned Opcode)
Build but don't insert <empty> = Opcode <empty>.
MachineInstrBuilder buildCopy(const DstOp &Res, const SrcOp &Op)
Build and insert Res = COPY Op.
virtual MachineInstrBuilder buildConstant(const DstOp &Res, const ConstantInt &Val)
Build and insert Res = G_CONSTANT Val.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
MCRegister getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const
AMDGPUFunctionArgInfo & getArgInfo()
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool empty() const
Definition: SmallVector.h:95
size_t size() const
Definition: SmallVector.h:92
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:587
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs with the length computed at compile time.
Definition: StringRef.h:838
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:224
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ PRIVATE_ADDRESS
Address space for private memory.
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isShader(CallingConv::ID cc)
bool isGraphics(CallingConv::ID cc)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:813
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:804
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:810
@ Implicit
Not emitted register (e.g. carry, or temporary result).
Reg
All possible values of the reg field in the ModR/M byte.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed as an argument (RegClass).
Definition: Utils.cpp:56
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
bool isAligned(Align Lhs, uint64_t SizeInBytes)
Checks that SizeInBytes is a multiple of the alignment.
Definition: Alignment.h:145
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
LLT getLLTForType(Type &Ty, const DataLayout &DL)
Construct a low-level type based on an LLVM type.
Align inferAlignFromPtrInfo(MachineFunction &MF, const MachinePointerInfo &MPO)
Definition: Utils.cpp:893
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
const Value * OrigValue
Optionally track the original IR value for the argument.
Definition: CallLowering.h:73
SmallVector< Register, 4 > Regs
Definition: CallLowering.h:63
SmallVector< ISD::ArgFlagsTy, 4 > Flags
Definition: CallLowering.h:51
Base class for ValueHandlers used for arguments coming into the current function, or for return value...
Definition: CallLowering.h:331
void assignValueToReg(Register ValVReg, Register PhysReg, const CCValAssign &VA) override
Provides a default implementation for argument handling.
Register buildExtensionHint(const CCValAssign &VA, Register SrcReg, LLT NarrowTy)
Insert G_ASSERT_ZEXT/G_ASSERT_SEXT or other hint instruction based on VA, returning the new register if a hint was inserted.
Base class for ValueHandlers used for arguments passed to a function call, or for return values.
Definition: CallLowering.h:347
uint64_t StackSize
The size of the currently allocated portion of the stack.
Definition: CallLowering.h:217
virtual Register getStackAddress(uint64_t MemSize, int64_t Offset, MachinePointerInfo &MPO, ISD::ArgFlagsTy Flags)=0
Materialize a VReg containing the address of the specified stack-based object.
virtual void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy, const MachinePointerInfo &MPO, const CCValAssign &VA)=0
The specified value has been assigned to a stack location.
Register extendRegister(Register ValReg, const CCValAssign &VA, unsigned MaxSizeBits=0)
Extend a register to the location type given in VA, capped at extending to at most MaxSize bits.
virtual void assignValueToReg(Register ValVReg, Register PhysReg, const CCValAssign &VA)=0
The specified value has been assigned to a physical register, handle the appropriate COPY (either to or from) and mark any relevant uses/defines as needed.
Extended Value Type.
Definition: ValueTypes.h:34
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:203
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
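This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.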
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117