LLVM 12.0.0git
AMDGPUCallLowering.cpp
1 //===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements the lowering of LLVM calls to machine code calls for
11 /// GlobalISel.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPU.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "SIRegisterInfo.h"
22 #include "llvm/CodeGen/Analysis.h"
25 #include "llvm/IR/IntrinsicsAMDGPU.h"
26 
27 #define DEBUG_TYPE "amdgpu-call-lowering"
28 
29 using namespace llvm;
30 
31 namespace {
32 
33 struct AMDGPUValueHandler : public CallLowering::ValueHandler {
34  AMDGPUValueHandler(bool IsIncoming, MachineIRBuilder &B,
35  MachineRegisterInfo &MRI, CCAssignFn *AssignFn)
36  : ValueHandler(IsIncoming, B, MRI, AssignFn) {}
37 
38  /// Wrapper around extendRegister to ensure we extend to a full 32-bit
39  /// register.
40  Register extendRegisterMin32(Register ValVReg, CCValAssign &VA) {
41  if (VA.getLocVT().getSizeInBits() < 32) {
42  // 16-bit types are reported as legal for 32-bit registers. We need to
43  // extend and do a 32-bit copy to avoid the verifier complaining about it.
44  return MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
45  }
46 
47  return extendRegister(ValVReg, VA);
48  }
49 };
50 
51 struct AMDGPUOutgoingValueHandler : public AMDGPUValueHandler {
52  AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
53  MachineInstrBuilder MIB, CCAssignFn *AssignFn)
54  : AMDGPUValueHandler(false, B, MRI, AssignFn), MIB(MIB) {}
55 
56  MachineInstrBuilder MIB;
57 
58  Register getStackAddress(uint64_t Size, int64_t Offset,
59  MachinePointerInfo &MPO) override {
60  llvm_unreachable("not implemented");
61  }
62 
63  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
64  MachinePointerInfo &MPO, CCValAssign &VA) override {
65  llvm_unreachable("not implemented");
66  }
67 
68  void assignValueToReg(Register ValVReg, Register PhysReg,
69  CCValAssign &VA) override {
70  Register ExtReg = extendRegisterMin32(ValVReg, VA);
71 
72  // If this is a scalar return, insert a readfirstlane just in case the value
73  // ends up in a VGPR.
74  // FIXME: Assert this is a shader return.
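  // Note: amdgcn_readfirstlane reads the value from the first active lane and
  // yields a wave-uniform result that can legally live in an SGPR, which is
  // why copying its result into an SGPR return register below is safe.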
75  const SIRegisterInfo *TRI
76  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
77  if (TRI->isSGPRReg(MRI, PhysReg)) {
78  auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
79  {MRI.getType(ExtReg)}, false)
80  .addReg(ExtReg);
81  ExtReg = ToSGPR.getReg(0);
82  }
83 
84  MIRBuilder.buildCopy(PhysReg, ExtReg);
85  MIB.addUse(PhysReg, RegState::Implicit);
86  }
87 
88  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
89  CCValAssign::LocInfo LocInfo,
90  const CallLowering::ArgInfo &Info,
91  ISD::ArgFlagsTy Flags,
92  CCState &State) override {
93  return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
94  }
95 };
96 
97 struct AMDGPUIncomingArgHandler : public AMDGPUValueHandler {
98  uint64_t StackUsed = 0;
99 
100  AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
101  CCAssignFn *AssignFn)
102  : AMDGPUValueHandler(true, B, MRI, AssignFn) {}
103 
104  Register getStackAddress(uint64_t Size, int64_t Offset,
105  MachinePointerInfo &MPO) override {
106  auto &MFI = MIRBuilder.getMF().getFrameInfo();
107  int FI = MFI.CreateFixedObject(Size, Offset, true);
108  MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
109  auto AddrReg = MIRBuilder.buildFrameIndex(
110  LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
111  StackUsed = std::max(StackUsed, Size + Offset);
112  return AddrReg.getReg(0);
113  }
114 
115  void assignValueToReg(Register ValVReg, Register PhysReg,
116  CCValAssign &VA) override {
117  markPhysRegUsed(PhysReg);
118 
119  if (VA.getLocVT().getSizeInBits() < 32) {
120  // 16-bit types are reported as legal for 32-bit registers. We need to do
121  // a 32-bit copy, and truncate to avoid the verifier complaining about it.
122  auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
123  MIRBuilder.buildTrunc(ValVReg, Copy);
124  return;
125  }
126 
127  switch (VA.getLocInfo()) {
128  case CCValAssign::LocInfo::SExt:
129  case CCValAssign::LocInfo::ZExt:
130  case CCValAssign::LocInfo::AExt: {
131  auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
132  MIRBuilder.buildTrunc(ValVReg, Copy);
133  break;
134  }
135  default:
136  MIRBuilder.buildCopy(ValVReg, PhysReg);
137  break;
138  }
139  }
140 
141  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
142  MachinePointerInfo &MPO, CCValAssign &VA) override {
143  MachineFunction &MF = MIRBuilder.getMF();
144 
145  // The reported memory location may be wider than the value.
146  const LLT RegTy = MRI.getType(ValVReg);
147  MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);
148 
149  // FIXME: Get alignment
150  auto MMO = MF.getMachineMemOperand(
151  MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize,
152  inferAlignFromPtrInfo(MF, MPO));
153  MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
154  }
155 
156  /// How the physical register gets marked varies between formal
157  /// parameters (it's a basic-block live-in), and a call instruction
158  /// (it's an implicit-def of the call instruction).
159  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
160 };
161 
162 struct FormalArgHandler : public AMDGPUIncomingArgHandler {
163  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
164  CCAssignFn *AssignFn)
165  : AMDGPUIncomingArgHandler(B, MRI, AssignFn) {}
166 
167  void markPhysRegUsed(unsigned PhysReg) override {
168  MIRBuilder.getMBB().addLiveIn(PhysReg);
169  }
170 };
171 
172 struct CallReturnHandler : public AMDGPUIncomingArgHandler {
173  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
174  MachineInstrBuilder MIB, CCAssignFn *AssignFn)
175  : AMDGPUIncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
176 
177  void markPhysRegUsed(unsigned PhysReg) override {
178  MIB.addDef(PhysReg, RegState::Implicit);
179  }
180 
181  MachineInstrBuilder MIB;
182 };
183 
184 struct AMDGPUOutgoingArgHandler : public AMDGPUValueHandler {
185  MachineInstrBuilder MIB;
186  CCAssignFn *AssignFnVarArg;
187 
188  /// For tail calls, the byte offset of the call's argument area from the
189  /// callee's. Unused elsewhere.
190  int FPDiff;
191 
192  // Cache the SP register vreg if we need it more than once in this call site.
193  Register SPReg;
194 
195  bool IsTailCall;
196 
197  AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
198  MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
199  CCAssignFn *AssignFn, CCAssignFn *AssignFnVarArg,
200  bool IsTailCall = false, int FPDiff = 0)
201  : AMDGPUValueHandler(false, MIRBuilder, MRI, AssignFn), MIB(MIB),
202  AssignFnVarArg(AssignFnVarArg), FPDiff(FPDiff), IsTailCall(IsTailCall) {
203  }
204 
205  Register getStackAddress(uint64_t Size, int64_t Offset,
206  MachinePointerInfo &MPO) override {
207  MachineFunction &MF = MIRBuilder.getMF();
208  const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
209  const LLT S32 = LLT::scalar(32);
210 
211  if (IsTailCall) {
212  llvm_unreachable("implement me");
213  }
214 
215  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
216 
217  if (!SPReg)
218  SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);
219 
220  auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);
221 
222  auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
223  MPO = MachinePointerInfo::getStack(MF, Offset);
224  return AddrReg.getReg(0);
225  }
226 
227  void assignValueToReg(Register ValVReg, Register PhysReg,
228  CCValAssign &VA) override {
229  MIB.addUse(PhysReg, RegState::Implicit);
230  Register ExtReg = extendRegisterMin32(ValVReg, VA);
231  MIRBuilder.buildCopy(PhysReg, ExtReg);
232  }
233 
234  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
235  MachinePointerInfo &MPO, CCValAssign &VA) override {
236  MachineFunction &MF = MIRBuilder.getMF();
237  uint64_t LocMemOffset = VA.getLocMemOffset();
238  const auto &ST = MF.getSubtarget<GCNSubtarget>();
239 
240  auto MMO = MF.getMachineMemOperand(
241  MPO, MachineMemOperand::MOStore, Size,
242  commonAlignment(ST.getStackAlignment(), LocMemOffset));
243  MIRBuilder.buildStore(ValVReg, Addr, *MMO);
244  }
245 
246  void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr,
247  uint64_t MemSize, MachinePointerInfo &MPO,
248  CCValAssign &VA) override {
249  Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
250  ? extendRegister(Arg.Regs[0], VA)
251  : Arg.Regs[0];
252 
253  // If we extended the value type we might need to adjust the MMO's
254  // Size. This happens if ComputeValueVTs widened a small type value to a
255  // legal register type (e.g. s8->s16)
256  const LLT RegTy = MRI.getType(ValVReg);
257  MemSize = std::min(MemSize, (uint64_t)RegTy.getSizeInBytes());
258  assignValueToAddress(ValVReg, Addr, MemSize, MPO, VA);
259  }
260 };
261 }
262 
263 AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
264  : CallLowering(&TLI) {
265 }
266 
267 // FIXME: Compatibility shim
268 static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
269  switch (MIOpc) {
270  case TargetOpcode::G_SEXT:
271  return ISD::SIGN_EXTEND;
272  case TargetOpcode::G_ZEXT:
273  return ISD::ZERO_EXTEND;
274  case TargetOpcode::G_ANYEXT:
275  return ISD::ANY_EXTEND;
276  default:
277  llvm_unreachable("not an extend opcode");
278  }
279 }
280 
281 // FIXME: This should move to generic code.
282 void AMDGPUCallLowering::splitToValueTypes(MachineIRBuilder &B,
283  const ArgInfo &OrigArg,
284  SmallVectorImpl<ArgInfo> &SplitArgs,
285  const DataLayout &DL,
286  CallingConv::ID CallConv) const {
287  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
288  LLVMContext &Ctx = OrigArg.Ty->getContext();
289 
290  SmallVector<EVT, 4> SplitVTs;
291  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);
292 
293  assert(OrigArg.Regs.size() == SplitVTs.size());
294 
295  if (SplitVTs.size() == 0)
296  return;
297 
298  if (SplitVTs.size() == 1) {
299  // No splitting to do, but we want to replace the original type (e.g. [1 x
300  // double] -> double).
301  SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
302  OrigArg.Flags[0], OrigArg.IsFixed);
303  return;
304  }
305 
306  // Create one ArgInfo for each virtual register in the original ArgInfo.
307  assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch");
308 
309  bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters(
310  OrigArg.Ty, CallConv, false);
311  for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
312  Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
313  SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0],
314  OrigArg.IsFixed);
315  if (NeedsRegBlock)
316  SplitArgs.back().Flags[0].setInConsecutiveRegs();
317  }
318 
319  SplitArgs.back().Flags[0].setInConsecutiveRegsLast();
320 }
321 
322 void AMDGPUCallLowering::processSplitArgs(
323  MachineIRBuilder &B, const ArgInfo &OrigArg,
324  const SmallVectorImpl<ArgInfo> &SplitArg,
325  SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL,
326  CallingConv::ID CallConv, bool IsOutgoing,
327  SplitArgTy PerformArgSplit) const {
328  LLVMContext &Ctx = OrigArg.Ty->getContext();
329  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
330 
331  // FIXME: This is mostly nasty pre-processing before handleAssignments. Most
332  // of this should be performed by handleAssignments.
333 
334  for (int SplitIdx = 0, e = SplitArg.size(); SplitIdx != e; ++SplitIdx) {
335  const ArgInfo &CurSplitArg = SplitArg[SplitIdx];
336  Register Reg = OrigArg.Regs[SplitIdx];
337  EVT VT = EVT::getEVT(CurSplitArg.Ty);
338  LLT LLTy = getLLTForType(*CurSplitArg.Ty, DL);
339 
340  unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
341  MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
342 
343  if (NumParts == 1) {
344  // No splitting to do, but we want to replace the original type (e.g. [1 x
345  // double] -> double).
346  SplitArgs.emplace_back(Reg, CurSplitArg.Ty, OrigArg.Flags,
347  OrigArg.IsFixed);
348  continue;
349  }
350 
351  SmallVector<Register, 8> SplitRegs;
352  Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
353  LLT PartLLT = getLLTForType(*PartTy, DL);
354  MachineRegisterInfo &MRI = *B.getMRI();
355 
356  // FIXME: Should we be reporting all of the part registers for a single
357  // argument, and let handleAssignments take care of the repacking?
358  for (unsigned i = 0; i < NumParts; ++i) {
359  Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
360  SplitRegs.push_back(PartReg);
361  SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
362  }
363 
364  PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);
365  }
366 }
367 
368 // TODO: Move to generic code
369 static void unpackRegsToOrigType(MachineIRBuilder &B,
370  ArrayRef<Register> DstRegs,
371  Register SrcReg,
372  const CallLowering::ArgInfo &Info,
373  LLT SrcTy,
374  LLT PartTy) {
375  assert(DstRegs.size() > 1 && "Nothing to unpack");
376 
377  const unsigned PartSize = PartTy.getSizeInBits();
378 
379  if (SrcTy.isVector() && !PartTy.isVector() &&
380  PartSize > SrcTy.getElementType().getSizeInBits()) {
381  // Vector was scalarized, and the elements extended.
382  auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
383  for (int i = 0, e = DstRegs.size(); i != e; ++i)
384  B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
385  return;
386  }
387 
388  LLT GCDTy = getGCDType(SrcTy, PartTy);
389  if (GCDTy == PartTy) {
390  // If this is already evenly divisible, we can create a simple unmerge.
391  B.buildUnmerge(DstRegs, SrcReg);
392  return;
393  }
394 
395  MachineRegisterInfo &MRI = *B.getMRI();
396  LLT DstTy = MRI.getType(DstRegs[0]);
397  LLT LCMTy = getLCMType(SrcTy, PartTy);
398 
399  const unsigned LCMSize = LCMTy.getSizeInBits();
400  const unsigned DstSize = DstTy.getSizeInBits();
401  const unsigned SrcSize = SrcTy.getSizeInBits();
402 
403  Register UnmergeSrc = SrcReg;
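  // At this point SrcTy does not split evenly into PartTy pieces: widen the
  // source up to the least common multiple type by merging in undef padding,
  // then unmerge that into DstTy pieces, leaving the excess results dead.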
404  if (LCMSize != SrcSize) {
405  // Widen to the common type.
406  Register Undef = B.buildUndef(SrcTy).getReg(0);
407  SmallVector<Register, 8> MergeParts(1, SrcReg);
408  for (unsigned Size = SrcSize; Size != LCMSize; Size += SrcSize)
409  MergeParts.push_back(Undef);
410 
411  UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0);
412  }
413 
414  // Unmerge to the original registers and pad with dead defs.
415  SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end());
416  for (unsigned Size = DstSize * DstRegs.size(); Size != LCMSize;
417  Size += DstSize) {
418  UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy));
419  }
420 
421  B.buildUnmerge(UnmergeResults, UnmergeSrc);
422 }
423 
424 bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
425  CallingConv::ID CallConv,
426  SmallVectorImpl<BaseArgInfo> &Outs,
427  bool IsVarArg) const {
428  // For shaders. Vector types should be explicitly handled by CC.
429  if (AMDGPU::isEntryFunctionCC(CallConv))
430  return true;
431 
432  SmallVector<CCValAssign, 16> ArgLocs;
433  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
434  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
435  MF.getFunction().getContext());
436 
437  return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
438 }
439 
440 /// Lower the return value for the already existing \p Ret. This assumes that
441 /// \p B's insertion point is correct.
442 bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
443  const Value *Val, ArrayRef<Register> VRegs,
444  MachineInstrBuilder &Ret) const {
445  if (!Val)
446  return true;
447 
448  auto &MF = B.getMF();
449  const auto &F = MF.getFunction();
450  const DataLayout &DL = MF.getDataLayout();
451  MachineRegisterInfo *MRI = B.getMRI();
452  LLVMContext &Ctx = F.getContext();
453 
454  CallingConv::ID CC = F.getCallingConv();
455  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
456 
457  SmallVector<EVT, 8> SplitEVTs;
458  ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
459  assert(VRegs.size() == SplitEVTs.size() &&
460  "For each split Type there should be exactly one VReg.");
461 
462  // We pre-process the return value decomposed into EVTs.
463  SmallVector<ArgInfo, 8> PreSplitRetInfos;
464 
465  // Further processing is applied to split the arguments from PreSplitRetInfos
466  // into 32-bit pieces in SplitRetInfos before passing off to
467  // handleAssignments.
468  SmallVector<ArgInfo, 8> SplitRetInfos;
469 
470  for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
471  EVT VT = SplitEVTs[i];
472  Register Reg = VRegs[i];
473  ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx));
474  setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
475 
476  if (VT.isScalarInteger()) {
477  unsigned ExtendOp = TargetOpcode::G_ANYEXT;
478  if (RetInfo.Flags[0].isSExt()) {
479  assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
480  ExtendOp = TargetOpcode::G_SEXT;
481  } else if (RetInfo.Flags[0].isZExt()) {
482  assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
483  ExtendOp = TargetOpcode::G_ZEXT;
484  }
485 
486  EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
487  extOpcodeToISDExtOpcode(ExtendOp));
488  if (ExtVT != VT) {
489  RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
490  LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
491  Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
492  }
493  }
494 
495  if (Reg != RetInfo.Regs[0]) {
496  RetInfo.Regs[0] = Reg;
497  // Reset the arg flags after modifying Reg.
498  setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
499  }
500 
501  splitToValueTypes(B, RetInfo, PreSplitRetInfos, DL, CC);
502 
503  // FIXME: This splitting should mostly be done by handleAssignments
504  processSplitArgs(B, RetInfo,
505  PreSplitRetInfos, SplitRetInfos, DL, CC, true,
506  [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy,
507  LLT PartLLT, int VTSplitIdx) {
508  unpackRegsToOrigType(B, Regs, SrcReg,
509  PreSplitRetInfos[VTSplitIdx], LLTy,
510  PartLLT);
511  });
512  PreSplitRetInfos.clear();
513  }
514 
515  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
516  AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
517  return handleAssignments(B, SplitRetInfos, RetHandler);
518 }
519 
520 bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
521  ArrayRef<Register> VRegs,
522  FunctionLoweringInfo &FLI) const {
523 
524  MachineFunction &MF = B.getMF();
525  MachineRegisterInfo &MRI = MF.getRegInfo();
526  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
527  MFI->setIfReturnsVoid(!Val);
528 
529  assert(!Val == VRegs.empty() && "Return value without a vreg");
530 
531  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
532  const bool IsShader = AMDGPU::isShader(CC);
533  const bool IsWaveEnd =
534  (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
535  if (IsWaveEnd) {
536  B.buildInstr(AMDGPU::S_ENDPGM)
537  .addImm(0);
538  return true;
539  }
540 
541  auto const &ST = MF.getSubtarget<GCNSubtarget>();
542 
543  unsigned ReturnOpc =
544  IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;
545 
546  auto Ret = B.buildInstrNoInsert(ReturnOpc);
547  Register ReturnAddrVReg;
548  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
549  ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
550  Ret.addUse(ReturnAddrVReg);
551  }
552 
553  if (!FLI.CanLowerReturn)
554  insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
555  else if (!lowerReturnVal(B, Val, VRegs, Ret))
556  return false;
557 
558  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
559  const SIRegisterInfo *TRI = ST.getRegisterInfo();
560  Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
561  &AMDGPU::SGPR_64RegClass);
562  B.buildCopy(ReturnAddrVReg, LiveInReturn);
563  }
564 
565  // TODO: Handle CalleeSavedRegsViaCopy.
566 
567  B.insertInstr(Ret);
568  return true;
569 }
570 
571 void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
572  Type *ParamTy,
573  uint64_t Offset) const {
574  MachineFunction &MF = B.getMF();
575  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
576  MachineRegisterInfo &MRI = MF.getRegInfo();
577  Register KernArgSegmentPtr =
578  MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
579  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
580 
581  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);
582 
583  B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
584 }
585 
586 void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
587  uint64_t Offset, Align Alignment,
588  Register DstReg) const {
589  MachineFunction &MF = B.getMF();
590  const Function &F = MF.getFunction();
591  const DataLayout &DL = F.getParent()->getDataLayout();
592  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
593  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
594 
595  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
596  Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
597  lowerParameterPtr(PtrReg, B, ParamTy, Offset);
598 
599  MachineMemOperand *MMO = MF.getMachineMemOperand(
600  PtrInfo,
601  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
602  MachineMemOperand::MOInvariant,
603  TypeSize, Alignment);
604 
605  B.buildLoad(DstReg, PtrReg, *MMO);
606 }
607 
608 // Allocate special inputs passed in user SGPRs.
609 static void allocateHSAUserSGPRs(CCState &CCInfo,
610  MachineIRBuilder &B,
611  MachineFunction &MF,
612  const SIRegisterInfo &TRI,
613  SIMachineFunctionInfo &Info) {
614  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
615  if (Info.hasPrivateSegmentBuffer()) {
616  Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
617  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
618  CCInfo.AllocateReg(PrivateSegmentBufferReg);
619  }
620 
621  if (Info.hasDispatchPtr()) {
622  Register DispatchPtrReg = Info.addDispatchPtr(TRI);
623  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
624  CCInfo.AllocateReg(DispatchPtrReg);
625  }
626 
627  if (Info.hasQueuePtr()) {
628  Register QueuePtrReg = Info.addQueuePtr(TRI);
629  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
630  CCInfo.AllocateReg(QueuePtrReg);
631  }
632 
633  if (Info.hasKernargSegmentPtr()) {
634  MachineRegisterInfo &MRI = MF.getRegInfo();
635  Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
636  const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
637  Register VReg = MRI.createGenericVirtualRegister(P4);
638  MRI.addLiveIn(InputPtrReg, VReg);
639  B.getMBB().addLiveIn(InputPtrReg);
640  B.buildCopy(VReg, InputPtrReg);
641  CCInfo.AllocateReg(InputPtrReg);
642  }
643 
644  if (Info.hasDispatchID()) {
645  Register DispatchIDReg = Info.addDispatchID(TRI);
646  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
647  CCInfo.AllocateReg(DispatchIDReg);
648  }
649 
650  if (Info.hasFlatScratchInit()) {
651  Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
652  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
653  CCInfo.AllocateReg(FlatScratchInitReg);
654  }
655 
656  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
657  // these from the dispatch pointer.
658 }
659 
660 bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
661  MachineIRBuilder &B, const Function &F,
662  ArrayRef<ArrayRef<Register>> VRegs) const {
663  MachineFunction &MF = B.getMF();
664  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
665  MachineRegisterInfo &MRI = MF.getRegInfo();
666  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
667  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
668  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
669 
670  const DataLayout &DL = F.getParent()->getDataLayout();
671 
672  SmallVector<CCValAssign, 16> ArgLocs;
673  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
674 
675  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);
676 
677  unsigned i = 0;
678  const Align KernArgBaseAlign(16);
679  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
680  uint64_t ExplicitArgOffset = 0;
681 
682  // TODO: Align down to dword alignment and extract bits for extending loads.
683  for (auto &Arg : F.args()) {
684  const bool IsByRef = Arg.hasByRefAttr();
685  Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
686  unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
687  if (AllocSize == 0)
688  continue;
689 
690  MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
691  if (!ABIAlign)
692  ABIAlign = DL.getABITypeAlign(ArgTy);
693 
694  uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
695  ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
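  // ArgOffset is this argument's byte offset within the kernarg segment: the
  // running explicit-argument offset rounded up to the argument's ABI
  // alignment, plus the subtarget's base offset for explicit kernel arguments.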
696 
697  if (Arg.use_empty()) {
698  ++i;
699  continue;
700  }
701 
702  Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
703 
704  if (IsByRef) {
705  unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();
706 
707  assert(VRegs[i].size() == 1 &&
708  "expected only one register for byval pointers");
709  if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
710  lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
711  } else {
712  const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
713  Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
714  lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);
715 
716  B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
717  }
718  } else {
719  ArrayRef<Register> OrigArgRegs = VRegs[i];
720  Register ArgReg =
721  OrigArgRegs.size() == 1
722  ? OrigArgRegs[0]
723  : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
724 
725  lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
726  if (OrigArgRegs.size() > 1)
727  unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
728  }
729 
730  ++i;
731  }
732 
733  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
734  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
735  return true;
736 }
737 
738 /// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
739 static MachineInstrBuilder mergeVectorRegsToResultRegs(MachineIRBuilder &B,
740  ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
741  MachineRegisterInfo &MRI = *B.getMRI();
742  LLT LLTy = MRI.getType(DstRegs[0]);
743  LLT PartLLT = MRI.getType(SrcRegs[0]);
744 
745  // Deal with v3s16 split into v2s16
746  LLT LCMTy = getLCMType(LLTy, PartLLT);
747  if (LCMTy == LLTy) {
748  // Common case where no padding is needed.
749  assert(DstRegs.size() == 1);
750  return B.buildConcatVectors(DstRegs[0], SrcRegs);
751  }
752 
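  // Otherwise the part vectors do not concatenate exactly to the result type
  // (e.g. a v3s16 result arriving as two v2s16 parts): concatenate up to the
  // LCM vector type using undef padding, then unmerge, leaving the extra
  // results as dead defs.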
753  const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
754  Register Undef = B.buildUndef(PartLLT).getReg(0);
755 
756  // Build vector of undefs.
757  SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);
758 
759  // Replace the first sources with the real registers.
760  std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());
761 
762  auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
763  int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();
764 
765  SmallVector<Register, 8> PadDstRegs(NumDst);
766  std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());
767 
768  // Create the excess dead defs for the unmerge.
769  for (int I = DstRegs.size(); I != NumDst; ++I)
770  PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);
771 
772  return B.buildUnmerge(PadDstRegs, Widened);
773 }
774 
775 // TODO: Move this to generic code
776 static void packSplitRegsToOrigType(MachineIRBuilder &B,
777  ArrayRef<Register> OrigRegs,
778  ArrayRef<Register> Regs,
779  LLT LLTy,
780  LLT PartLLT) {
781  MachineRegisterInfo &MRI = *B.getMRI();
782 
783  if (!LLTy.isVector() && !PartLLT.isVector()) {
784  assert(OrigRegs.size() == 1);
785  LLT OrigTy = MRI.getType(OrigRegs[0]);
786 
787  unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
788  if (SrcSize == OrigTy.getSizeInBits())
789  B.buildMerge(OrigRegs[0], Regs);
790  else {
791  auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
792  B.buildTrunc(OrigRegs[0], Widened);
793  }
794 
795  return;
796  }
797 
798  if (LLTy.isVector() && PartLLT.isVector()) {
799  assert(OrigRegs.size() == 1);
800  assert(LLTy.getElementType() == PartLLT.getElementType());
801  mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
802  return;
803  }
804 
805  assert(LLTy.isVector() && !PartLLT.isVector());
806 
807  LLT DstEltTy = LLTy.getElementType();
808 
809  // Pointer information was discarded. We'll need to coerce some register types
810  // to avoid violating type constraints.
811  LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();
812 
813  assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());
814 
815  if (DstEltTy == PartLLT) {
816  // Vector was trivially scalarized.
817 
818  if (RealDstEltTy.isPointer()) {
819  for (Register Reg : Regs)
820  MRI.setType(Reg, RealDstEltTy);
821  }
822 
823  B.buildBuildVector(OrigRegs[0], Regs);
824  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
825  // Deal with vector with 64-bit elements decomposed to 32-bit
826  // registers. Need to create intermediate 64-bit elements.
827  SmallVector<Register, 8> EltMerges;
828  int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();
829 
830  assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);
831 
832  for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
833  auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
834  // Fix the type in case this is really a vector of pointers.
835  MRI.setType(Merge.getReg(0), RealDstEltTy);
836  EltMerges.push_back(Merge.getReg(0));
837  Regs = Regs.drop_front(PartsPerElt);
838  }
839 
840  B.buildBuildVector(OrigRegs[0], EltMerges);
841  } else {
842  // Vector was split, and elements promoted to a wider type.
843  LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
844  auto BV = B.buildBuildVector(BVType, Regs);
845  B.buildTrunc(OrigRegs[0], BV);
846  }
847 }
848 
849 bool AMDGPUCallLowering::lowerFormalArguments(
850  MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
851  FunctionLoweringInfo &FLI) const {
852  CallingConv::ID CC = F.getCallingConv();
853 
854  // The infrastructure for normal calling convention lowering is essentially
855  // useless for kernels. We want to avoid any kind of legalization or argument
856  // splitting.
857  if (CC == CallingConv::AMDGPU_KERNEL)
858  return lowerFormalArgumentsKernel(B, F, VRegs);
859 
860  const bool IsGraphics = AMDGPU::isGraphics(CC);
861  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
862 
863  MachineFunction &MF = B.getMF();
864  MachineBasicBlock &MBB = B.getMBB();
865  MachineRegisterInfo &MRI = MF.getRegInfo();
866  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
867  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
868  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
869  const DataLayout &DL = F.getParent()->getDataLayout();
870 
871 
872  SmallVector<CCValAssign, 16> ArgLocs;
873  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
874 
875  if (!IsEntryFunc) {
876  Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
877  Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
878  &AMDGPU::SGPR_64RegClass);
879  MBB.addLiveIn(ReturnAddrReg);
880  B.buildCopy(LiveInReturn, ReturnAddrReg);
881  }
882 
883  if (Info->hasImplicitBufferPtr()) {
884  Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
885  MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
886  CCInfo.AllocateReg(ImplicitBufferPtrReg);
887  }
888 
889  SmallVector<ArgInfo, 8> SplitArg;
890  SmallVector<ArgInfo, 32> SplitArgs;
891  unsigned Idx = 0;
892  unsigned PSInputNum = 0;
893 
894  // Insert the hidden sret parameter if the return value won't fit in the
895  // return registers.
896  if (!FLI.CanLowerReturn)
897  insertSRetIncomingArgument(F, SplitArgs, FLI.DemoteRegister, MRI, DL);
898 
899  for (auto &Arg : F.args()) {
900  if (DL.getTypeStoreSize(Arg.getType()) == 0)
901  continue;
902 
903  const bool InReg = Arg.hasAttribute(Attribute::InReg);
904 
905  // SGPR arguments to functions not implemented.
906  if (!IsGraphics && InReg)
907  return false;
908 
909  if (Arg.hasAttribute(Attribute::SwiftSelf) ||
910  Arg.hasAttribute(Attribute::SwiftError) ||
911  Arg.hasAttribute(Attribute::Nest))
912  return false;
913 
914  if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
915  const bool ArgUsed = !Arg.use_empty();
916  bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);
917 
918  if (!SkipArg) {
919  Info->markPSInputAllocated(PSInputNum);
920  if (ArgUsed)
921  Info->markPSInputEnabled(PSInputNum);
922  }
923 
924  ++PSInputNum;
925 
926  if (SkipArg) {
927  for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
928  B.buildUndef(VRegs[Idx][I]);
929 
930  ++Idx;
931  continue;
932  }
933  }
934 
935  ArgInfo OrigArg(VRegs[Idx], Arg.getType());
936  const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
937  setArgFlags(OrigArg, OrigArgIdx, DL, F);
938 
939  SplitArg.clear();
940  splitToValueTypes(B, OrigArg, SplitArg, DL, CC);
941 
942  processSplitArgs(B, OrigArg, SplitArg, SplitArgs, DL, CC, false,
943  // FIXME: We should probably be passing multiple registers
944  // to handleAssignments to do this
945  [&](ArrayRef<Register> Regs, Register DstReg, LLT LLTy,
946  LLT PartLLT, int VTSplitIdx) {
947  assert(DstReg == VRegs[Idx][VTSplitIdx]);
948  packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
949  LLTy, PartLLT);
950  });
951 
952  ++Idx;
953  }
954 
955  // At least one interpolation mode must be enabled or else the GPU will
956  // hang.
957  //
958  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
959  // set PSInputAddr, the user wants to enable some bits after the compilation
960  // based on run-time states. Since we can't know what the final PSInputEna
961  // will look like, we shouldn't do anything here and the user should take
962  // responsibility for the correct programming.
963  //
964  // Otherwise, the following restrictions apply:
965  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
966  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
967  // enabled too.
968  if (CC == CallingConv::AMDGPU_PS) {
969  if ((Info->getPSInputAddr() & 0x7F) == 0 ||
970  ((Info->getPSInputAddr() & 0xF) == 0 &&
971  Info->isPSInputAllocated(11))) {
972  CCInfo.AllocateReg(AMDGPU::VGPR0);
973  CCInfo.AllocateReg(AMDGPU::VGPR1);
974  Info->markPSInputAllocated(0);
975  Info->markPSInputEnabled(0);
976  }
977 
978  if (Subtarget.isAmdPalOS()) {
979  // For isAmdPalOS, the user does not enable some bits after compilation
980  // based on run-time states; the register values being generated here are
981  // the final ones set in hardware. Therefore we need to apply the
982  // workaround to PSInputAddr and PSInputEnable together. (The case where
983  // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
984  // set up an input arg for a particular interpolation mode, but nothing
985  // uses that input arg. Really we should have an earlier pass that removes
986  // such an arg.)
987  unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
988  if ((PsInputBits & 0x7F) == 0 ||
989  ((PsInputBits & 0xF) == 0 &&
990  (PsInputBits >> 11 & 1)))
991  Info->markPSInputEnabled(
992  countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
993  }
994  }
995 
996  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
997  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());
998 
999  if (!MBB.empty())
1000  B.setInstr(*MBB.begin());
1001 
1002  if (!IsEntryFunc) {
1003  // For the fixed ABI, pass workitem IDs in the last argument register.
1004  if (AMDGPUTargetMachine::EnableFixedFunctionABI)
1005  TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
1006  }
1007 
1008  FormalArgHandler Handler(B, MRI, AssignFn);
1009  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
1010  return false;
1011 
1012  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
1013  // Special inputs come after user arguments.
1014  TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
1015  }
1016 
1017  // Start adding system SGPRs.
1018  if (IsEntryFunc) {
1019  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
1020  } else {
1021  if (!Subtarget.enableFlatScratch())
1022  CCInfo.AllocateReg(Info->getScratchRSrcReg());
1023  TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
1024  }
1025 
1026  // Move back to the end of the basic block.
1027  B.setMBB(MBB);
1028 
1029  return true;
1030 }
1031 
1032 bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
1033  CCState &CCInfo,
1034  SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
1035  CallLoweringInfo &Info) const {
1036  MachineFunction &MF = MIRBuilder.getMF();
1037 
1038  const AMDGPUFunctionArgInfo *CalleeArgInfo
1039  = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
1040 
1041  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1042  const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();
1043 
1044 
1045  // TODO: Unify with private memory register handling. This is complicated by
1046  // the fact that at least in kernels, the input argument is not necessarily
1047  // in the same location as the input.
1048  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
1049  AMDGPUFunctionArgInfo::DISPATCH_PTR,
1050  AMDGPUFunctionArgInfo::QUEUE_PTR,
1051  AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
1052  AMDGPUFunctionArgInfo::DISPATCH_ID,
1053  AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
1054  AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
1055  AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
1056  };
1057 
1058  MachineRegisterInfo &MRI = MF.getRegInfo();
1059 
1060  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1061  const AMDGPULegalizerInfo *LI
1062  = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());
1063 
1064  for (auto InputID : InputRegs) {
1065  const ArgDescriptor *OutgoingArg;
1066  const TargetRegisterClass *ArgRC;
1067  LLT ArgTy;
1068 
1069  std::tie(OutgoingArg, ArgRC, ArgTy) =
1070  CalleeArgInfo->getPreloadedValue(InputID);
1071  if (!OutgoingArg)
1072  continue;
1073 
1074  const ArgDescriptor *IncomingArg;
1075  const TargetRegisterClass *IncomingArgRC;
1076  std::tie(IncomingArg, IncomingArgRC, ArgTy) =
1077  CallerArgInfo.getPreloadedValue(InputID);
1078  assert(IncomingArgRC == ArgRC);
1079 
1080  Register InputReg = MRI.createGenericVirtualRegister(ArgTy);
1081 
1082  if (IncomingArg) {
1083  LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
1084  } else {
1085  assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1086  LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
1087  }
1088 
1089  if (OutgoingArg->isRegister()) {
1090  ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
1091  if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
1092  report_fatal_error("failed to allocate implicit input argument");
1093  } else {
1094  LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
1095  return false;
1096  }
1097  }
1098 
1099  // Pack workitem IDs into a single register or pass it as is if already
1100  // packed.
1101  const ArgDescriptor *OutgoingArg;
1102  const TargetRegisterClass *ArgRC;
1103  LLT ArgTy;
1104 
1105  std::tie(OutgoingArg, ArgRC, ArgTy) =
1106  CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
1107  if (!OutgoingArg)
1108  std::tie(OutgoingArg, ArgRC, ArgTy) =
1109  CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
1110  if (!OutgoingArg)
1111  std::tie(OutgoingArg, ArgRC, ArgTy) =
1112  CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
1113  if (!OutgoingArg)
1114  return false;
1115 
1116  auto WorkitemIDX =
1117  CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
1118  auto WorkitemIDY =
1119  CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
1120  auto WorkitemIDZ =
1121  CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
1122 
1123  const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
1124  const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
1125  const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
1126  const LLT S32 = LLT::scalar(32);
1127 
1128  // If incoming ids are not packed we need to pack them.
1129  // FIXME: Should consider known workgroup size to eliminate known 0 cases.
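  // The packed layout places the X ID in bits [9:0], Y in bits [19:10] and Z
  // in bits [29:20] of a single 32-bit value, matching the shifts by 10 and
  // 20 in the ORs below.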
1130  Register InputReg;
1131  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
1132  InputReg = MRI.createGenericVirtualRegister(S32);
1133  LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
1134  std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
1135  }
1136 
1137  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
1138  Register Y = MRI.createGenericVirtualRegister(S32);
1139  LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
1140  std::get<2>(WorkitemIDY));
1141 
1142  Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
1143  InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
1144  }
1145 
1146  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
1147  Register Z = MRI.createGenericVirtualRegister(S32);
1148  LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
1149  std::get<2>(WorkitemIDZ));
1150 
1151  Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
1152  InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
1153  }
1154 
1155  if (!InputReg) {
1156  InputReg = MRI.createGenericVirtualRegister(S32);
1157 
1158  // Workitem IDs are already packed; any of the present incoming arguments
1159  // will carry all required fields.
1160  ArgDescriptor IncomingArg = ArgDescriptor::createArg(
1161  IncomingArgX ? *IncomingArgX :
1162  IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
1163  LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
1164  &AMDGPU::VGPR_32RegClass, S32);
1165  }
1166 
1167  if (OutgoingArg->isRegister()) {
1168  ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
1169  if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
1170  report_fatal_error("failed to allocate implicit input argument");
1171  } else {
1172  LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
1173  return false;
1174  }
1175 
1176  return true;
1177 }
1178 
1179 /// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
1180 /// CC.
1181 static std::pair<CCAssignFn *, CCAssignFn *>
1182 getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
1183  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
1184 }
1185 
1186 static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
1187  bool IsTailCall) {
1188  return AMDGPU::SI_CALL;
1189 }
1190 
1191 // Add operands to call instruction to track the callee.
1192 static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
1193  MachineIRBuilder &MIRBuilder,
1194  AMDGPUCallLowering::CallLoweringInfo &Info) {
1195  if (Info.Callee.isReg()) {
1196  CallInst.addReg(Info.Callee.getReg());
1197  CallInst.addImm(0);
1198  } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
1199  // The call lowering lightly assumed we can directly encode a call target in
1200  // the instruction, which is not the case. Materialize the address here.
1201  const GlobalValue *GV = Info.Callee.getGlobal();
1202  auto Ptr = MIRBuilder.buildGlobalValue(
1203  LLT::pointer(GV->getAddressSpace(), 64), GV);
1204  CallInst.addReg(Ptr.getReg(0));
1205  CallInst.add(Info.Callee);
1206  } else
1207  return false;
1208 
1209  return true;
1210 }
1211 
1212 bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
1213  CallLoweringInfo &Info) const {
1214  if (Info.IsVarArg) {
1215  LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
1216  return false;
1217  }
1218 
1219  MachineFunction &MF = MIRBuilder.getMF();
1220  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1221  const SIRegisterInfo *TRI = ST.getRegisterInfo();
1222 
1223  const Function &F = MF.getFunction();
1224  MachineRegisterInfo &MRI = MF.getRegInfo();
1225  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1226  const DataLayout &DL = F.getParent()->getDataLayout();
1227  CallingConv::ID CallConv = F.getCallingConv();
1228 
1229  if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
1230  CallConv != CallingConv::AMDGPU_Gfx) {
1231  LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
1232  return false;
1233  }
1234 
1235  if (AMDGPU::isShader(CallConv)) {
1236  LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
1237  return false;
1238  }
1239 
1240  SmallVector<ArgInfo, 8> OutArgs;
1241 
1242  SmallVector<ArgInfo, 8> SplitArg;
1243  for (auto &OrigArg : Info.OrigArgs) {
1244  splitToValueTypes(MIRBuilder, OrigArg, SplitArg, DL, Info.CallConv);
1245 
1246  processSplitArgs(
1247  MIRBuilder, OrigArg, SplitArg, OutArgs, DL, Info.CallConv, true,
1248  // FIXME: We should probably be passing multiple registers to
1249  // handleAssignments to do this
1250  [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
1251  int VTSplitIdx) {
1252  unpackRegsToOrigType(MIRBuilder, Regs, SrcReg, OrigArg, LLTy, PartLLT);
1253  });
1254 
1255  SplitArg.clear();
1256  }
1257 
1258  // If we can lower as a tail call, do that instead.
1259  bool CanTailCallOpt = false;
1260 
1261  // We must emit a tail call if we have musttail.
1262  if (Info.IsMustTailCall && !CanTailCallOpt) {
1263  LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
1264  return false;
1265  }
1266 
1267  // Find out which ABI gets to decide where things go.
1268  CCAssignFn *AssignFnFixed;
1269  CCAssignFn *AssignFnVarArg;
1270  std::tie(AssignFnFixed, AssignFnVarArg) =
1271  getAssignFnsForCC(Info.CallConv, TLI);
1272 
1273  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
1274  .addImm(0)
1275  .addImm(0);
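  // ADJCALLSTACKUP/ADJCALLSTACKDOWN bracket the call sequence; the matching
  // ADJCALLSTACKDOWN is built below as CallSeqEnd and receives the stack byte
  // counts once they are known.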
1276 
1277  // Create a temporarily-floating call instruction so we can add the implicit
1278  // uses of arg registers.
1279  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);
1280 
1281  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
1282  MIB.addDef(TRI->getReturnAddressReg(MF));
1283 
1284  if (!addCallTargetOperands(MIB, MIRBuilder, Info))
1285  return false;
1286 
1287  // Tell the call which registers are clobbered.
1288  const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
1289  MIB.addRegMask(Mask);
1290 
1291  SmallVector<CCValAssign, 16> ArgLocs;
1292  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1293 
1294  // We could pass MIB and directly add the implicit uses to the call
1295  // now. However, as an aesthetic choice, place implicit argument operands
1296  // after the ordinary user argument registers.
1297  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
1298 
1299  if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
1300  // With a fixed ABI, allocate fixed registers before user arguments.
1301  if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
1302  return false;
1303  }
1304 
1305  // Do the actual argument marshalling.
1306  SmallVector<Register, 8> PhysRegs;
1307  AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
1308  AssignFnVarArg, false);
1309  if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
1310  return false;
1311 
1312  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1313 
1314  if (!ST.enableFlatScratch()) {
1315  // Insert copies for the SRD. In the HSA case, this should be an identity
1316  // copy.
1317  auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
1318  MFI->getScratchRSrcReg());
1319  MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
1320  MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
1321  }
1322 
1323  for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
1324  MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
1325  MIB.addReg(ArgReg.first, RegState::Implicit);
1326  }
1327 
1328  // Get a count of how many bytes are to be pushed on the stack.
1329  unsigned NumBytes = CCInfo.getNextStackOffset();
1330 
1331  // If Callee is a reg, since it is used by a target specific
1332  // instruction, it must have a register class matching the
1333  // constraint of that instruction.
1334 
1335  // FIXME: We should define regbankselectable call instructions to handle
1336  // divergent call targets.
1337  if (MIB->getOperand(1).isReg()) {
1338  MIB->getOperand(1).setReg(constrainOperandRegClass(
1339  MF, *TRI, MRI, *ST.getInstrInfo(),
1340  *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
1341  1));
1342  }
1343 
1344  auto OrigInsertPt = MIRBuilder.getInsertPt();
1345 
1346  // Now we can add the actual call instruction to the correct position.
1347  MIRBuilder.insertInstr(MIB);
1348 
1349  // Insert this now to give us an anchor point for managing the insert point.
1350  MachineInstrBuilder CallSeqEnd =
1351  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN);
1352 
1353  SmallVector<ArgInfo, 8> InArgs;
1354  if (!Info.CanLowerReturn) {
1355  insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
1356  Info.DemoteRegister, Info.DemoteStackIndex);
1357  } else if (!Info.OrigRet.Ty->isVoidTy()) {
1358  SmallVector<ArgInfo, 8> PreSplitRetInfos;
1359 
1360  splitToValueTypes(
1361  MIRBuilder, Info.OrigRet, PreSplitRetInfos/*InArgs*/, DL, Info.CallConv);
1362 
1363  processSplitArgs(MIRBuilder, Info.OrigRet,
1364  PreSplitRetInfos, InArgs/*SplitRetInfos*/, DL, Info.CallConv, false,
1365  [&](ArrayRef<Register> Regs, Register DstReg,
1366  LLT LLTy, LLT PartLLT, int VTSplitIdx) {
1367  assert(DstReg == Info.OrigRet.Regs[VTSplitIdx]);
1368  packSplitRegsToOrigType(MIRBuilder, Info.OrigRet.Regs[VTSplitIdx],
1369  Regs, LLTy, PartLLT);
1370  });
1371  }
1372 
1373  // Make sure the raw argument copies are inserted before the marshalling to
1374  // the original types.
1375  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), CallSeqEnd);
1376 
1377  // Finally we can copy the returned value back into its virtual-register. In
1378  // symmetry with the arguments, the physical register must be an
1379  // implicit-define of the call instruction.
1380  if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
1381  CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
1382  Info.IsVarArg);
1383  CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
1384  if (!handleAssignments(MIRBuilder, InArgs, Handler))
1385  return false;
1386  }
1387 
1388  uint64_t CalleePopBytes = NumBytes;
1389  CallSeqEnd.addImm(0)
1390  .addImm(CalleePopBytes);
1391 
1392  // Restore the insert point to after the call sequence.
1393  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), OrigInsertPt);
1394  return true;
1395 }
virtual MachineInstrBuilder buildConstant(const DstOp &Res, const ConstantInt &Val)
Build and insert Res = G_CONSTANT Val.
bool isGraphics(CallingConv::ID cc)
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:111
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:856
Interface definition for SIRegisterInfo.
static void allocateHSAUserSGPRs(CCState &CCInfo, MachineIRBuilder &B, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:219
AMDGPU specific subclass of TargetSubtarget.
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:140
This class represents lattice values for constants.
Definition: AllocatorList.h:23
bool lowerReturn(MachineIRBuilder &B, const Value *Val, ArrayRef< Register > VRegs, FunctionLoweringInfo &FLI) const override
This hook behaves as the extended lowerReturn function, but for targets that do not support swifterro...
Not emitted register (e.g. carry, or temporary result).
iterator begin() const
Definition: ArrayRef.h:144
Register getReg(unsigned Idx) const
Get the register for the operand index.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
static MachineInstrBuilder mergeVectorRegsToResultRegs(MachineIRBuilder &B, ArrayRef< Register > DstRegs, ArrayRef< Register > SrcRegs)
Pack values SrcRegs to cover the vector type result DstRegs.
Address space for constant memory (VTX2).
Definition: AMDGPU.h:368
This file describes how to lower LLVM calls to machine code calls.
bool handleAssignments(MachineIRBuilder &MIRBuilder, SmallVectorImpl< ArgInfo > &Args, ValueHandler &Handler) const
Invoke Handler::assignArg on each of the given Args and then use Handler to move them to the assigned...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:141
MachineInstrBuilder buildPtrAdd(const DstOp &Res, const SrcOp &Op0, const SrcOp &Op1)
Build and insert Res = G_PTR_ADD Op0, Op1.
This class represents a function call, abstracting a target machine's calling convention.
unsigned Reg
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
MachineInstrBuilder buildOr(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1)
Build and insert Res = G_OR Op0, Op1.
unsigned const TargetRegisterInfo * TRI
F(f)
bool CanLowerReturn
CanLowerReturn - true iff the function's return value can be lowered to registers.
Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, const MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition: Utils.cpp:47
unsigned getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineBasicBlock & MBB
unsigned countTrailingZeros(T Val, ZeroBehavior ZB=ZB_Width)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: MathExtras.h:157
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
Value of the register doesn't matter.
bool isVector() const
static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, bool IsTailCall)
A description of a memory reference used in the backend.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:178
The returned value is undefined.
Definition: MathExtras.h:47
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:43
uint64_t Addr
bool passSpecialInputs(MachineIRBuilder &MIRBuilder, CCState &CCInfo, SmallVectorImpl< std::pair< MCRegister, Register >> &ArgRegs, CallLoweringInfo &Info) const
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
The memory access is dereferenceable (i.e., doesn't trap).
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
static bool addCallTargetOperands(MachineInstrBuilder &CallInst, MachineIRBuilder &MIRBuilder, AMDGPUCallLowering::CallLoweringInfo &Info)
LocInfo getLocInfo() const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Align commonAlignment(Align A, Align B)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:221
Align inferAlignFromPtrInfo(MachineFunction &MF, const MachinePointerInfo &MPO)
Definition: Utils.cpp:492
zlib style complession
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
void setInsertPt(MachineBasicBlock &MBB, MachineBasicBlock::iterator II)
Set the insertion point before the specified position.
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:246
bool checkReturn(CCState &CCInfo, SmallVectorImpl< BaseArgInfo > &Outs, CCAssignFn *Fn) const
bool lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const override
This hook must be implemented to lower the given call instruction, including argument and return valu...
MachineInstrBuilder buildInstrNoInsert(unsigned Opcode)
Build but don't insert <empty> = Opcode <empty>.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
MachineFunction & getMF()
Getter for the function we currently build.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
void setReg(Register Reg)
Change the register this operand corresponds to.
static LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
bool lowerFormalArguments(MachineIRBuilder &B, const Function &F, ArrayRef< ArrayRef< Register >> VRegs, FunctionLoweringInfo &FLI) const override
This hook must be implemented to lower the incoming (formal) arguments, described by VRegs,...
Address space for private memory.
Definition: AMDGPU.h:370
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > *Offsets=nullptr, uint64_t StartingOffset=0)
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:120
Analysis containing CSE Info
Definition: CSEInfo.cpp:25
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
MachineBasicBlock::iterator getInsertPt()
Current insertion point for new instructions.
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstrBuilder buildInstr(unsigned Opcode)
Build and insert <empty> = Opcode <empty>.
Machine Value Type.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:46
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
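A hedged sketch (MF assumed in scope; the register and register class are illustrative AMDGPU choices): record a preloaded SGPR pair as a live-in and obtain the virtual register that carries it.
  Register VReg = MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SGPR_64RegClass);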
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:156
void insertSRetIncomingArgument(const Function &F, SmallVectorImpl< ArgInfo > &SplitArgs, Register &DemoteReg, MachineRegisterInfo &MRI, const DataLayout &DL) const
Insert the hidden sret ArgInfo to the beginning of SplitArgs.
bool isEntryFunctionCC(CallingConv::ID CC)
Helper class to build MachineInstr.
static std::pair< CCAssignFn *, CCAssignFn * > getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI)
Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for CC.
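A hedged sketch of how the returned pair is typically consumed, assuming Info and TLI are in scope:
  std::pair<CCAssignFn *, CCAssignFn *> AssignFns =
      getAssignFnsForCC(Info.CallConv, TLI);
  CCAssignFn *AssignFnFixed  = AssignFns.first;
  CCAssignFn *AssignFnVarArg = AssignFns.second;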
MCRegister getRegister() const
unsigned getAddressSpace() const
Definition: Globals.cpp:112
MachineInstrBuilder buildGlobalValue(const DstOp &Res, const GlobalValue *GV)
Build and insert Res = G_GLOBAL_VALUE GV.
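A hedged sketch, assuming GV points to a GlobalValue whose address space uses 64-bit pointers:
  LLT PtrTy = LLT::pointer(GV->getAddressSpace(), 64);
  Register GVReg = MIRBuilder.buildGlobalValue(PtrTy, GV).getReg(0);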
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
AMDGPUFunctionArgInfo & getArgInfo()
The AMDGPU TargetMachine interface definition for hw codegen targets.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:296
Extended Value Type.
Definition: ValueTypes.h:35
virtual bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg) const
For some targets, an LLVM struct type must be broken down into multiple simple types,...
MachineInstrBuilder buildCopy(const DstOp &Res, const SrcOp &Op)
Build and insert Res = COPY Op.
Argument handling is mostly uniform between the four places that make these decisions: function forma...
Definition: CallLowering.h:132
This class contains a discriminated union of information about pointers in memory operands,...
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
LLT getLLTForType(Type &Ty, const DataLayout &DL)
Construct a low-level type based on an LLVM type.
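A hedged one-liner, assuming Arg is an IR Value and DL is the module's DataLayout:
  LLT ArgTy = getLLTForType(*Arg->getType(), DL);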
The memory access writes data.
void unpackRegs(ArrayRef< Register > DstRegs, Register SrcReg, Type *PackedTy, MachineIRBuilder &MIRBuilder) const
Generate instructions for unpacking SrcReg into the DstRegs corresponding to the aggregate type Packe...
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
LLVM_READNONE LLT getLCMType(LLT OrigTy, LLT TargetTy)
Return the least common multiple type of OrigTy and TargetTy, by changing the number of vector elemen...
Definition: Utils.cpp:617
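For plain scalars the result is simply the scalar whose size is the least common multiple of the two sizes; a hedged example:
  LLT Lcm = getLCMType(LLT::scalar(32), LLT::scalar(48)); // s96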
MCRegister getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const
uint64_t Offset
CCState - This class holds information needed while lowering arguments and return values.
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:350
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:119
unsigned getExplicitKernelArgOffset(const Function &F) const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1116
CCValAssign - Represent assignment of one arg/retval to a location.
iterator end() const
Definition: ArrayRef.h:145
bool lowerFormalArgumentsKernel(MachineIRBuilder &B, const Function &F, ArrayRef< ArrayRef< Register >> VRegs) const
unsigned getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
LLVM_READNONE bool isKernel(CallingConv::ID CC)
Calling convention used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:205
This class provides the information for the target register banks.
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
This file declares the MachineIRBuilder class.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
bool isShader(CallingConv::ID cc)
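A hedged sketch of how the calling-convention predicates in this list fit together, assuming F is the IR Function being lowered:
  CallingConv::ID CC = F.getCallingConv();
  bool IsKernel = AMDGPU::isKernel(CC);          // e.g. amdgpu_kernel
  bool IsShader = AMDGPU::isShader(CC);          // graphics shader conventions
  bool IsEntry  = AMDGPU::isEntryFunctionCC(CC); // kernels plus shader entry points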
This file declares the targeting of the MachineLegalizer class for AMDGPU.
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:678
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:681
bool isPointer() const
static void unpackRegsToOrigType(MachineIRBuilder &B, ArrayRef< Register > DstRegs, Register SrcReg, const CallLowering::ArgInfo &Info, LLT SrcTy, LLT PartTy)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
The memory access reads data.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:158
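A minimal example: rounding a 10-byte size up to an 8-byte alignment.
  uint64_t Padded = alignTo(10, Align(8)); // Padded == 16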
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
unsigned getLocMemOffset() const
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:195
const MachineBasicBlock & getMBB() const
Getter for the basic block we currently build.
void setArgFlags(ArgInfo &Arg, unsigned OpIdx, const DataLayout &DL, const FuncInfoTy &FuncInfo) const
Calling convention for AMDGPU code object kernels.
Definition: CallingConv.h:211
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy, ArrayRef< Register > VRegs, Register DemoteReg, int FI) const
Load the returned value from the stack into virtual registers in VRegs.
The memory access always returns the same value (or traps).
MachineInstrBuilder buildStore(const SrcOp &Val, const SrcOp &Addr, MachineMemOperand &MMO)
Build and insert G_STORE Val, Addr, MMO.
Calling convention used for AMD graphics targets.
Definition: CallingConv.h:245
static void packSplitRegsToOrigType(MachineIRBuilder &B, ArrayRef< Register > OrigRegs, ArrayRef< Register > Regs, LLT LLTy, LLT PartLLT)
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:522
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
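A hedged sketch tying this together with getFixedStack and buildStore above, assuming MF, MIRBuilder, a 4-byte value in ValReg, its stack address in AddrReg, and a frame index FI are in scope:
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, 4, Align(4));
  MIRBuilder.buildStore(ValReg, AddrReg, *MMO);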
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1479
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc)
MachineInstrBuilder insertInstr(MachineInstrBuilder MIB)
Insert an existing instruction at the insertion point.
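A hedged sketch of the build-then-insert pattern used with buildInstrNoInsert; Opcode, DstReg and SrcReg are hypothetical.
  auto MIB = MIRBuilder.buildInstrNoInsert(Opcode);
  MIB.addDef(DstReg).addUse(SrcReg); // hypothetical operands
  MIRBuilder.insertInstr(MIB);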
LLVM Value Representation.
Definition: Value.h:75
uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
MachineInstrBuilder buildShl(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1, Optional< unsigned > Flags=None)
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
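A small illustration of the vector constructor together with the element queries listed above:
  LLT V4S32  = LLT::vector(4, 32);      // <4 x s32>
  LLT Elt    = V4S32.getElementType();  // s32
  unsigned N = V4S32.getNumElements();  // 4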
Conversion operators.
Definition: ISDOpcodes.h:675
#define LLVM_DEBUG(X)
Definition: Debug.h:122
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:485
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1549
LLVM_READNONE LLT getGCDType(LLT OrigTy, LLT TargetTy)
Return a type where the total size is the greatest common divisor of OrigTy and TargetTy.
Definition: Utils.cpp:662
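For plain scalars the result is the scalar whose size is the greatest common divisor of the two sizes; a hedged example:
  LLT Gcd = getGCDType(LLT::scalar(64), LLT::scalar(48)); // s16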
void insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy, ArrayRef< Register > VRegs, Register DemoteReg) const
Store the return value given by VRegs into stack starting at the offset specified in DemoteReg.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:151
const SIRegisterInfo * getRegisterInfo() const override