//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

using namespace llvm;

namespace {

struct OutgoingValueHandler : public CallLowering::ValueHandler {
  OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                       MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : ValueHandler(B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  bool isIncomingArgumentHandler() const override { return false; }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg;
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about it.
      ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    } else
      ExtReg = extendRegister(ValVReg, VA);

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};

74 
75 struct IncomingArgHandler : public CallLowering::ValueHandler {
76  uint64_t StackUsed = 0;
77 
78  IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
79  CCAssignFn *AssignFn)
80  : ValueHandler(B, MRI, AssignFn) {}
81 
82  Register getStackAddress(uint64_t Size, int64_t Offset,
83  MachinePointerInfo &MPO) override {
84  auto &MFI = MIRBuilder.getMF().getFrameInfo();
85  int FI = MFI.CreateFixedObject(Size, Offset, true);
86  MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
87  Register AddrReg = MRI.createGenericVirtualRegister(
89  MIRBuilder.buildFrameIndex(AddrReg, FI);
90  StackUsed = std::max(StackUsed, Size + Offset);
91  return AddrReg;
92  }
93 
94  void assignValueToReg(Register ValVReg, Register PhysReg,
95  CCValAssign &VA) override {
96  markPhysRegUsed(PhysReg);
97 
98  if (VA.getLocVT().getSizeInBits() < 32) {
99  // 16-bit types are reported as legal for 32-bit registers. We need to do
100  // a 32-bit copy, and truncate to avoid the verifier complaining about it.
101  auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
102  MIRBuilder.buildTrunc(ValVReg, Copy);
103  return;
104  }
105 
106  switch (VA.getLocInfo()) {
107  case CCValAssign::LocInfo::SExt:
108  case CCValAssign::LocInfo::ZExt:
109  case CCValAssign::LocInfo::AExt: {
110  auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
111  MIRBuilder.buildTrunc(ValVReg, Copy);
112  break;
113  }
114  default:
115  MIRBuilder.buildCopy(ValVReg, PhysReg);
116  break;
117  }
118  }
119 
120  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
121  MachinePointerInfo &MPO, CCValAssign &VA) override {
122  // FIXME: Get alignment
123  auto MMO = MIRBuilder.getMF().getMachineMemOperand(
125  MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
126  }
127 
128  /// How the physical register gets marked varies between formal
129  /// parameters (it's a basic-block live-in), and a call instruction
130  /// (it's an implicit-def of the BL).
131  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
132 
133  // FIXME: What is the point of this being a callback?
134  bool isIncomingArgumentHandler() const override { return true; }
135 };
136 
137 struct FormalArgHandler : public IncomingArgHandler {
138  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
139  CCAssignFn *AssignFn)
140  : IncomingArgHandler(B, MRI, AssignFn) {}
141 
142  void markPhysRegUsed(unsigned PhysReg) override {
143  MIRBuilder.getMBB().addLiveIn(PhysReg);
144  }
145 };
146 
147 }

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
    : CallLowering(&TLI) {
}

void AMDGPUCallLowering::splitToValueTypes(
    const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
    const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv,
    SplitArgTy PerformArgSplit) const {
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  if (OrigArg.Ty->isVoidTy())
    return;

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);

  assert(OrigArg.Regs.size() == SplitVTs.size());

  int SplitIdx = 0;
  for (EVT VT : SplitVTs) {
    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    Type *Ty = VT.getTypeForEVT(Ctx);

    if (NumParts == 1) {
      // No splitting to do, but we want to replace the original type (e.g.
      // [1 x double] -> double).
      SplitArgs.emplace_back(OrigArg.Regs[SplitIdx], Ty,
                             OrigArg.Flags, OrigArg.IsFixed);

      ++SplitIdx;
      continue;
    }

    LLT LLTy = getLLTForType(*Ty, DL);

    SmallVector<Register, 8> SplitRegs;

    EVT PartVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
    Type *PartTy = PartVT.getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);

    // FIXME: Should we be reporting all of the part registers for a single
    // argument, and let handleAssignments take care of the repacking?
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, LLTy, PartLLT, SplitIdx);

    ++SplitIdx;
  }
}
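
// Illustration of the routine above: for an aggregate such as
// {i32, [1 x double]}, ComputeValueVTs yields one EVT per leaf value. A leaf
// that the calling convention assigns to a single register only has its type
// canonicalized ([1 x double] -> double); a leaf the CC breaks into N part
// registers gets N fresh part vregs pushed onto SplitArgs, and
// PerformArgSplit is invoked to stitch those parts back to the original leaf
// vreg.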

// Get the appropriate type to make \p OrigTy \p Factor times bigger.
static LLT getMultipleType(LLT OrigTy, int Factor) {
  if (OrigTy.isVector()) {
    return LLT::vector(OrigTy.getNumElements() * Factor,
                       OrigTy.getElementType());
  }

  return LLT::scalar(OrigTy.getSizeInBits() * Factor);
}
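
// For example:
//   getMultipleType(LLT::scalar(16), 4)    == LLT::scalar(64)
//   getMultipleType(LLT::vector(2, 32), 3) == LLT::vector(6, 32)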

// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs,
                                 Register SrcReg,
                                 LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  if (SrcSize % PartSize == 0) {
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;

  LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
  auto ImpDef = B.buildUndef(BigTy);

  Register BigReg = MRI.createGenericVirtualRegister(BigTy);
  B.buildInsert(BigReg, ImpDef.getReg(0), SrcReg, 0);

  int64_t Offset = 0;
  for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
    B.buildExtract(DstRegs[i], BigReg, Offset);
}
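
// Two concrete cases of the routine above, for illustration: unpacking an
// s64 source into two s32 parts hits the SrcSize % PartSize == 0 path and
// emits a single G_UNMERGE_VALUES; unpacking a v3s16 source into s32 parts
// hits the scalarized-vector path, emitting a G_UNMERGE_VALUES to three s16
// values followed by a G_ANYEXT of each to s32.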

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val, ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  ArgInfo OrigRetInfo(VRegs, Val->getType());
  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
  SmallVector<ArgInfo, 4> SplitRetInfos;

  splitToValueTypes(
    OrigRetInfo, SplitRetInfos, DL, MRI, CC,
    [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
      unpackRegsToOrigType(B, Regs, VRegs[VTSplitIdx], LLTy, PartLLT);
    });

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());

  OutgoingValueHandler RetHandler(B, MF.getRegInfo(), Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler);
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
                         AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  auto const &ST = B.getMF().getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
                                               Type *ParamTy,
                                               uint64_t Offset) const {

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  LLT PtrType = getLLTForType(*PtrTy, DL);
  Register DstReg = MRI.createGenericVirtualRegister(PtrType);
  Register KernArgSegmentPtr =
    MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
  B.buildConstant(OffsetReg, Offset);

  B.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);

  return DstReg;
}
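
// Roughly, for Offset == 36 the routine above emits (vreg names are
// illustrative; p4 is the constant address space):
//   %off:_(s64) = G_CONSTANT i64 36
//   %ptr:_(p4)  = G_GEP %kernarg_segment_ptr(p4), %off(s64)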

void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B,
                                        Type *ParamTy, uint64_t Offset,
                                        unsigned Align,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
  Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
                                       MachineMemOperand::MODereferenceable |
                                       MachineMemOperand::MOInvariant,
                                       TypeSize, Align);

  B.buildLoad(DstReg, PtrReg, *MMO);
}
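
// Sketch: combined with lowerParameterPtr, loading a 4-byte kernel argument
// at offset 36 produces roughly:
//   %ptr:_(p4)  = G_GEP %kernarg_segment_ptr(p4), %off(s64)
//   %val:_(s32) = G_LOAD %ptr(p4) :: (dereferenceable invariant load 4 ...)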

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    unsigned DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const unsigned KernArgBaseAlign = 16;
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
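
    // Worked example of the two lines above, assuming BaseOffset == 0 and
    // arguments (i32, i64): the i32 lands at ArgOffset 0 and advances
    // ExplicitArgOffset to 4; the i64 is first aligned up to 8, so its
    // ArgOffset is 8 and its load alignment below is MinAlign(16, 8) == 8.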

    ArrayRef<Register> OrigArgRegs = VRegs[i];
    Register ArgReg =
      OrigArgRegs.size() == 1
      ? OrigArgRegs[0]
      : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
    unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
    ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
    lowerParameter(B, ArgTy, ArgOffset, Align, ArgReg);
    if (OrigArgRegs.size() > 1)
      unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                    ArrayRef<Register> OrigRegs,
                                    ArrayRef<Register> Regs,
                                    LLT LLTy,
                                    LLT PartLLT) {
  if (!LLTy.isVector() && !PartLLT.isVector()) {
    B.buildMerge(OrigRegs[0], Regs);
    return;
  }

  if (LLTy.isVector() && PartLLT.isVector()) {
    assert(LLTy.getElementType() == PartLLT.getElementType());

    int DstElts = LLTy.getNumElements();
    int PartElts = PartLLT.getNumElements();
    if (DstElts % PartElts == 0)
      B.buildConcatVectors(OrigRegs[0], Regs);
    else {
      // Deal with v3s16 split into v2s16
      assert(PartElts == 2 && DstElts % 2 != 0);
      int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts);

      LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType());
      auto RoundedConcat = B.buildConcatVectors(RoundedDestTy, Regs);
      B.buildExtract(OrigRegs[0], RoundedConcat, 0);
    }

    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();
  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.
    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with vector with 64-bit elements decomposed to 32-bit
    // registers. Need to create intermediate 64-bit elements.
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(DstEltTy, Regs.take_front(PartsPerElt));
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}
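
// Examples of the cases above: two s32 parts re-forming an s64 use a single
// G_MERGE_VALUES; v2s16 parts re-forming v4s16 use G_CONCAT_VECTORS; s32
// parts re-forming v2s64 are first merged pairwise into s64 elements and
// then combined with G_BUILD_VECTOR.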

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions not implemented.
    if (!IsShader && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);

    splitToValueTypes(
      OrigArg, SplitArgs, DL, MRI, CC,
      // FIXME: We should probably be passing multiple registers to
      // handleAssignments to do this
      [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
        packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
                                LLTy, PartLLT);
      });

    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // sets PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
            countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
    CCInfo.AllocateReg(Info->getFrameOffsetReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}