1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Custom DAG lowering for SI
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIISelLowering.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
21 #include "llvm/ADT/Statistic.h"
24 #include "llvm/BinaryFormat/ELF.h"
25 #include "llvm/CodeGen/Analysis.h"
32 #include "llvm/IR/DiagnosticInfo.h"
33 #include "llvm/IR/IntrinsicInst.h"
34 #include "llvm/IR/IntrinsicsAMDGPU.h"
35 #include "llvm/IR/IntrinsicsR600.h"
37 #include "llvm/Support/KnownBits.h"
38 
39 using namespace llvm;
40 
41 #define DEBUG_TYPE "si-lower"
42 
43 STATISTIC(NumTailCalls, "Number of tail calls");
44 
46  "amdgpu-disable-loop-alignment",
47  cl::desc("Do not align and prefetch loops"),
48  cl::init(false));
49 
51  "amdgpu-use-divergent-register-indexing",
52  cl::Hidden,
53  cl::desc("Use indirect register addressing for divergent indexes"),
54  cl::init(false));
55 
56 static bool hasFP32Denormals(const MachineFunction &MF) {
57  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
58  return Info->getMode().allFP32Denormals();
59 }
60 
61 static bool hasFP64FP16Denormals(const MachineFunction &MF) {
62  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
63  return Info->getMode().allFP64FP16Denormals();
64 }
65 
66 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
67  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
68  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
69  if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
70  return AMDGPU::SGPR0 + Reg;
71  }
72  }
73  llvm_unreachable("Cannot allocate sgpr");
74 }
75 
76 SITargetLowering::SITargetLowering(const TargetMachine &TM,
77  const GCNSubtarget &STI)
78  : AMDGPUTargetLowering(TM, STI),
79  Subtarget(&STI) {
80  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
81  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
82 
83  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
84  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
85 
86  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
87 
88  const SIRegisterInfo *TRI = STI.getRegisterInfo();
89  const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
90 
91  addRegisterClass(MVT::f64, V64RegClass);
92  addRegisterClass(MVT::v2f32, V64RegClass);
93 
94  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
95  addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
96 
97  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
98  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
99 
100  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
101  addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
102 
103  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
104  addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
105 
106  addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
107  addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
108 
109  addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
110  addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
111 
112  addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
113  addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
114 
115  addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
116  addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
117 
118  addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
119  addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
120 
121  addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
122  addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
123 
124  addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
125  addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
126 
127  addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
128  addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
129 
130  if (Subtarget->has16BitInsts()) {
131  addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
132  addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
133 
134  // Unless there are also VOP3P operations, no operations are really legal.
135  addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
136  addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
137  addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
138  addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
139  addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
140  addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
141  addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
142  addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
143  }
144 
145  addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
146  addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
147 
149 
150  // The boolean content concept here is too inflexible. Compares only ever
151  // really produce a 1-bit result. Any copy/extend from these will turn into a
152  // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
153  // it's what most targets use.
156 
157  // We need to custom lower vector stores from local memory
161  MVT::v32i32},
162  Custom);
163 
167  MVT::v32i32},
168  Custom);
169 
186 
194 
196 
201 
204 
208 
212  Expand);
216  Expand);
217 
221  Custom);
222 
226 
228 
230 
232  Expand);
233 
234 #if 0
236 #endif
237 
238  // We only support LOAD/STORE and vector manipulation ops for vectors
239  // with > 4 elements.
240  for (MVT VT :
246  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
247  switch (Op) {
248  case ISD::LOAD:
249  case ISD::STORE:
250  case ISD::BUILD_VECTOR:
251  case ISD::BITCAST:
252  case ISD::UNDEF:
257  break;
259  case ISD::CONCAT_VECTORS:
261  break;
262  default:
264  break;
265  }
266  }
267  }
268 
270 
271  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
272  // is expanded to avoid having two separate loops in case the index is a VGPR.
273 
274  // Most operations are naturally 32-bit vector operations. We only support
275  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
276  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
279 
282 
285 
288  }
289 
290  for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
293 
296 
299 
302  }
303 
304  for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
307 
310 
313 
316  }
317 
318  for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
321 
324 
327 
330  }
331 
332  for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
335 
338 
341 
344  }
345 
348  Expand);
349 
351 
352  // Avoid stack access for these.
353  // TODO: Generalize to more vector types.
357  Custom);
358 
359  // Deal with vec3 vector operations when widened to vec4.
362 
363  // Deal with vec5/6/7 vector operations when widened to vec8.
367  Custom);
368 
369  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
370  // and output demarshalling
372 
373  // We can't return success/failure, only the old value,
374  // so let LLVM add the comparison.
376  Expand);
377 
378  if (Subtarget->hasFlatAddressSpace())
380 
382 
383  // FIXME: This should be narrowed to i32, but that only happens if i64 is
384  // illegal.
385  // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
387 
388  // On SI this is s_memtime; on VI it is s_memrealtime.
391 
392  if (Subtarget->has16BitInsts()) {
395  }
396 
397  if (Subtarget->hasMadMacF32Insts())
399 
400  if (!Subtarget->hasBFI())
401  // fcopysign can be done in a single instruction with BFI.
403 
404  if (!Subtarget->hasBCNT(32))
406 
407  if (!Subtarget->hasBCNT(64))
409 
410  if (Subtarget->hasFFBH())
412 
413  if (Subtarget->hasFFBL())
415 
416  // We only really have 32-bit BFE instructions (and 16-bit on VI).
417  //
418  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
419  // effort to match them now. We want this to be false for i64 cases when the
420  // extraction isn't restricted to the upper or lower half. Ideally we would
421  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
422  // span the midpoint are probably relatively rare, so don't worry about them
423  // for now.
424  if (Subtarget->hasBFE())
425  setHasExtractBitsInsn(true);
426 
427  // Clamp modifier on add/sub
428  if (Subtarget->hasIntClamp())
430 
431  if (Subtarget->hasAddNoCarry())
433  Legal);
434 
436  Custom);
437 
438  // These are really only legal for ieee_mode functions. We should be avoiding
439  // them for functions that don't have ieee_mode enabled, so just say they are
440  // legal.
442  {MVT::f32, MVT::f64}, Legal);
443 
444  if (Subtarget->haveRoundOpsF64())
446  else
448  MVT::f64, Custom);
449 
451 
454 
455  if (Subtarget->has16BitInsts()) {
458  MVT::i16, Legal);
459 
461 
463  MVT::i16, Expand);
464 
468  ISD::CTPOP},
469  MVT::i16, Promote);
470 
472 
474 
479 
481 
482  // F16 - Constant Actions.
484 
485  // F16 - Load/Store Actions.
490 
491  // F16 - VOP1 Actions.
494  MVT::f16, Custom);
495 
497 
500  MVT::f16, Promote);
501 
502  // F16 - VOP2 Actions.
504 
506 
507  // F16 - VOP3 Actions.
509  if (STI.hasMadF16())
511 
514  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
515  switch (Op) {
516  case ISD::LOAD:
517  case ISD::STORE:
518  case ISD::BUILD_VECTOR:
519  case ISD::BITCAST:
520  case ISD::UNDEF:
526  break;
527  case ISD::CONCAT_VECTORS:
529  break;
530  default:
532  break;
533  }
534  }
535  }
536 
537  // v_perm_b32 can handle either of these.
540 
541  // XXX - Do these do anything? Vector constants turn into build_vector.
543 
545 
550 
555 
562 
567 
572 
577 
582 
587 
592 
597 
599  MVT::v2i32, Expand);
601 
603  MVT::v4i32, Expand);
604 
606  MVT::v8i32, Expand);
607 
608  if (!Subtarget->hasVOP3PInsts())
610 
612  // This isn't really legal, but this avoids the legalizer unrolling it (and
613  // allows matching fneg (fabs x) patterns)
615 
618 
621 
624 
625  for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
628  Vec16, Custom);
630  }
631  }
632 
633  if (Subtarget->hasVOP3PInsts()) {
637  MVT::v2i16, Legal);
638 
641  MVT::v2f16, Legal);
642 
644  Custom);
645 
649  Custom);
650 
651  for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
652  // Split vector operations.
656  ISD::SSUBSAT},
657  VT, Custom);
658 
659  for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16})
660  // Split vector operations.
662  VT, Custom);
663 
665  Custom);
666 
669 
670  if (Subtarget->hasPackedFP32Ops()) {
672  MVT::v2f32, Legal);
675  Custom);
676  }
677  }
678 
680 
681  if (Subtarget->has16BitInsts()) {
686  } else {
687  // Legalization hack.
689 
691  }
692 
696  Custom);
697 
699 
700  if (Subtarget->hasMad64_32())
702 
706  Custom);
707 
711  MVT::i16, MVT::i8},
712  Custom);
713 
717  MVT::i8},
718  Custom);
719 
722  ISD::SUB,
724  ISD::FADD,
725  ISD::FSUB,
726  ISD::FMINNUM,
727  ISD::FMAXNUM,
730  ISD::FMA,
731  ISD::SMIN,
732  ISD::SMAX,
733  ISD::UMIN,
734  ISD::UMAX,
735  ISD::SETCC,
736  ISD::AND,
737  ISD::OR,
738  ISD::XOR,
747 
748  // All memory operations. Some folding on the pointer operand is done to help
749  // matching the constant offsets in the addressing modes.
751  ISD::STORE,
770 
771  // FIXME: In other contexts we pretend this is a per-function property.
772  setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
773 
775 }
776 
777 const GCNSubtarget *SITargetLowering::getSubtarget() const {
778  return Subtarget;
779 }
780 
781 //===----------------------------------------------------------------------===//
782 // TargetLowering queries
783 //===----------------------------------------------------------------------===//
784 
785 // v_mad_mix* support a conversion from f16 to f32.
786 //
787 // There is one special case where this would be OK to use even when denormals
788 // are enabled, but we don't currently handle it.
789 bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
790  EVT DestVT, EVT SrcVT) const {
791  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
792  (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
793  DestVT.getScalarType() == MVT::f32 &&
794  SrcVT.getScalarType() == MVT::f16 &&
795  // TODO: This probably only requires no input flushing?
797 }
798 
799 bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
800  LLT DestTy, LLT SrcTy) const {
801  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
802  (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
803  DestTy.getScalarSizeInBits() == 32 &&
804  SrcTy.getScalarSizeInBits() == 16 &&
805  // TODO: This probably only requires no input flushing?
806  !hasFP32Denormals(*MI.getMF());
807 }
808 
809 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
810  // SI has some legal vector types, but no legal vector operations. Say no
811  // shuffles are legal in order to prefer scalarizing some vector operations.
812  return false;
813 }
814 
815 MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
816  CallingConv::ID CC,
817  EVT VT) const {
820 
821  if (VT.isVector()) {
822  EVT ScalarVT = VT.getScalarType();
823  unsigned Size = ScalarVT.getSizeInBits();
824  if (Size == 16) {
825  if (Subtarget->has16BitInsts())
826  return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
827  return VT.isInteger() ? MVT::i32 : MVT::f32;
828  }
829 
830  if (Size < 16)
831  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
832  return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
833  }
834 
835  if (VT.getSizeInBits() > 32)
836  return MVT::i32;
837 
839 }
840 
841 unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
842  CallingConv::ID CC,
843  EVT VT) const {
846 
847  if (VT.isVector()) {
848  unsigned NumElts = VT.getVectorNumElements();
849  EVT ScalarVT = VT.getScalarType();
850  unsigned Size = ScalarVT.getSizeInBits();
851 
852  // FIXME: Should probably promote 8-bit vectors to i16.
853  if (Size == 16 && Subtarget->has16BitInsts())
854  return (NumElts + 1) / 2;
855 
856  if (Size <= 32)
857  return NumElts;
858 
859  if (Size > 32)
860  return NumElts * ((Size + 31) / 32);
861  } else if (VT.getSizeInBits() > 32)
862  return (VT.getSizeInBits() + 31) / 32;
863 
865 }
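// A minimal standalone sketch (invented helper, not the LLVM API) of the
// register-count rules implemented just above, assuming 16-bit instructions
// are available when Has16BitInsts is true. NumElts == 0 denotes a scalar.
constexpr unsigned sketchNumRegisters(unsigned NumElts, unsigned ScalarBits,
                                      bool Has16BitInsts) {
  if (NumElts > 0) {                           // vector case
    if (ScalarBits == 16 && Has16BitInsts)
      return (NumElts + 1) / 2;                // two 16-bit elements per register
    if (ScalarBits <= 32)
      return NumElts;                          // one register per element
    return NumElts * ((ScalarBits + 31) / 32); // split wide elements into dwords
  }
  return (ScalarBits + 31) / 32;               // scalar case: round up to dwords
}
// Examples: v3f16 packs into 2 registers, v2i64 splits into 4, i64 takes 2.
static_assert(sketchNumRegisters(3, 16, true) == 2, "v3f16");
static_assert(sketchNumRegisters(2, 64, true) == 4, "v2i64");
static_assert(sketchNumRegisters(0, 64, true) == 2, "i64");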
866 
867 unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
868  LLVMContext &Context, CallingConv::ID CC,
869  EVT VT, EVT &IntermediateVT,
870  unsigned &NumIntermediates, MVT &RegisterVT) const {
871  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
872  unsigned NumElts = VT.getVectorNumElements();
873  EVT ScalarVT = VT.getScalarType();
874  unsigned Size = ScalarVT.getSizeInBits();
875  // FIXME: We should fix the ABI to be the same on targets without 16-bit
876  // support, but unless we can properly handle 3-vectors, it will still be
877  // inconsistent.
878  if (Size == 16 && Subtarget->has16BitInsts()) {
879  RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
880  IntermediateVT = RegisterVT;
881  NumIntermediates = (NumElts + 1) / 2;
882  return NumIntermediates;
883  }
884 
885  if (Size == 32) {
886  RegisterVT = ScalarVT.getSimpleVT();
887  IntermediateVT = RegisterVT;
888  NumIntermediates = NumElts;
889  return NumIntermediates;
890  }
891 
892  if (Size < 16 && Subtarget->has16BitInsts()) {
893  // FIXME: Should probably form v2i16 pieces
894  RegisterVT = MVT::i16;
895  IntermediateVT = ScalarVT;
896  NumIntermediates = NumElts;
897  return NumIntermediates;
898  }
899 
900 
901  if (Size != 16 && Size <= 32) {
902  RegisterVT = MVT::i32;
903  IntermediateVT = ScalarVT;
904  NumIntermediates = NumElts;
905  return NumIntermediates;
906  }
907 
908  if (Size > 32) {
909  RegisterVT = MVT::i32;
910  IntermediateVT = RegisterVT;
911  NumIntermediates = NumElts * ((Size + 31) / 32);
912  return NumIntermediates;
913  }
914  }
915 
917  Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
918 }
919 
920 static EVT memVTFromImageData(Type *Ty, unsigned DMaskLanes) {
921  assert(DMaskLanes != 0);
922 
923  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
924  unsigned NumElts = std::min(DMaskLanes, VT->getNumElements());
925  return EVT::getVectorVT(Ty->getContext(),
926  EVT::getEVT(VT->getElementType()),
927  NumElts);
928  }
929 
930  return EVT::getEVT(Ty);
931 }
932 
933 // Peek through TFE struct returns to only use the data size.
934 static EVT memVTFromImageReturn(Type *Ty, unsigned DMaskLanes) {
935  auto *ST = dyn_cast<StructType>(Ty);
936  if (!ST)
937  return memVTFromImageData(Ty, DMaskLanes);
938 
939  // Some intrinsics return an aggregate type - special case to work out the
940  // correct memVT.
941  //
942  // Only limited forms of aggregate type currently expected.
943  if (ST->getNumContainedTypes() != 2 ||
944  !ST->getContainedType(1)->isIntegerTy(32))
945  return EVT();
946  return memVTFromImageData(ST->getContainedType(0), DMaskLanes);
947 }
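// A standalone sketch (invented helper, not the LLVM API) of how an image
// intrinsic's dmask maps to the number of lanes used for the memory VT: the
// population count of the dmask, with dmask == 0 still counting as one lane,
// clamped to the number of elements in the IR result type.
constexpr unsigned sketchDMaskLanes(unsigned DMask, unsigned IRNumElts) {
  unsigned Lanes = 0;
  for (unsigned Bits = DMask; Bits != 0; Bits &= Bits - 1)
    ++Lanes;                                   // countPopulation(DMask)
  if (Lanes == 0)
    Lanes = 1;                                 // dmask of 0 still loads one lane
  return Lanes < IRNumElts ? Lanes : IRNumElts;
}
// Example: dmask 0b1011 on a <4 x float> result reads 3 lanes.
static_assert(sketchDMaskLanes(0b1011, 4) == 3, "three enabled channels");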
948 
949 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
950  const CallInst &CI,
951  MachineFunction &MF,
952  unsigned IntrID) const {
954  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
956 
957  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
958  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
960  (Intrinsic::ID)IntrID);
961  if (Attr.hasFnAttr(Attribute::ReadNone))
962  return false;
963 
965 
966  const GCNTargetMachine &TM =
967  static_cast<const GCNTargetMachine &>(getTargetMachine());
968 
969  if (RsrcIntr->IsImage) {
970  Info.ptrVal = MFI->getImagePSV(TM);
971  Info.align.reset();
972  } else {
973  Info.ptrVal = MFI->getBufferPSV(TM);
974  }
975 
977  if (Attr.hasFnAttr(Attribute::ReadOnly)) {
978  unsigned DMaskLanes = 4;
979 
980  if (RsrcIntr->IsImage) {
983  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
984  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
985 
986  if (!BaseOpcode->Gather4) {
987  // If this isn't a gather, we may have excess loaded elements in the
988  // IR type. Check the dmask for the real number of elements loaded.
989  unsigned DMask
990  = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
991  DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
992  }
993 
994  Info.memVT = memVTFromImageReturn(CI.getType(), DMaskLanes);
995  } else
996  Info.memVT = EVT::getEVT(CI.getType());
997 
998  // FIXME: What does alignment mean for an image?
1001  } else if (Attr.hasFnAttr(Attribute::WriteOnly)) {
1002  Info.opc = ISD::INTRINSIC_VOID;
1003 
1004  Type *DataTy = CI.getArgOperand(0)->getType();
1005  if (RsrcIntr->IsImage) {
1006  unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1007  unsigned DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
1008  Info.memVT = memVTFromImageData(DataTy, DMaskLanes);
1009  } else
1010  Info.memVT = EVT::getEVT(DataTy);
1011 
1013  } else {
1014  // Atomic
1015  Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1017  Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1018  Info.flags |= MachineMemOperand::MOLoad |
1021 
1022  // XXX - Should this be volatile without known ordering?
1024 
1025  switch (IntrID) {
1026  default:
1027  break;
1028  case Intrinsic::amdgcn_raw_buffer_load_lds:
1029  case Intrinsic::amdgcn_struct_buffer_load_lds: {
1030  unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1031  Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1032  return true;
1033  }
1034  }
1035  }
1036  return true;
1037  }
1038 
1039  switch (IntrID) {
1040  case Intrinsic::amdgcn_atomic_inc:
1041  case Intrinsic::amdgcn_atomic_dec:
1042  case Intrinsic::amdgcn_ds_ordered_add:
1043  case Intrinsic::amdgcn_ds_ordered_swap:
1044  case Intrinsic::amdgcn_ds_fadd:
1045  case Intrinsic::amdgcn_ds_fmin:
1046  case Intrinsic::amdgcn_ds_fmax: {
1048  Info.memVT = MVT::getVT(CI.getType());
1049  Info.ptrVal = CI.getOperand(0);
1050  Info.align.reset();
1052 
1053  const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1054  if (!Vol->isZero())
1056 
1057  return true;
1058  }
1059  case Intrinsic::amdgcn_buffer_atomic_fadd: {
1061 
1062  const GCNTargetMachine &TM =
1063  static_cast<const GCNTargetMachine &>(getTargetMachine());
1064 
1066  Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1067  Info.ptrVal = MFI->getBufferPSV(TM);
1068  Info.align.reset();
1070 
1071  const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1072  if (!Vol || !Vol->isZero())
1074 
1075  return true;
1076  }
1077  case Intrinsic::amdgcn_ds_append:
1078  case Intrinsic::amdgcn_ds_consume: {
1080  Info.memVT = MVT::getVT(CI.getType());
1081  Info.ptrVal = CI.getOperand(0);
1082  Info.align.reset();
1084 
1085  const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1086  if (!Vol->isZero())
1088 
1089  return true;
1090  }
1091  case Intrinsic::amdgcn_global_atomic_csub: {
1093  Info.memVT = MVT::getVT(CI.getType());
1094  Info.ptrVal = CI.getOperand(0);
1095  Info.align.reset();
1096  Info.flags |= MachineMemOperand::MOLoad |
1099  return true;
1100  }
1101  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1104  Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1105 
1106  const GCNTargetMachine &TM =
1107  static_cast<const GCNTargetMachine &>(getTargetMachine());
1108 
1109  Info.ptrVal = MFI->getImagePSV(TM);
1110  Info.align.reset();
1111  Info.flags |= MachineMemOperand::MOLoad |
1113  return true;
1114  }
1115  case Intrinsic::amdgcn_global_atomic_fadd:
1116  case Intrinsic::amdgcn_global_atomic_fmin:
1117  case Intrinsic::amdgcn_global_atomic_fmax:
1118  case Intrinsic::amdgcn_flat_atomic_fadd:
1119  case Intrinsic::amdgcn_flat_atomic_fmin:
1120  case Intrinsic::amdgcn_flat_atomic_fmax:
1121  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1122  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1124  Info.memVT = MVT::getVT(CI.getType());
1125  Info.ptrVal = CI.getOperand(0);
1126  Info.align.reset();
1127  Info.flags |= MachineMemOperand::MOLoad |
1131  return true;
1132  }
1133  case Intrinsic::amdgcn_ds_gws_init:
1134  case Intrinsic::amdgcn_ds_gws_barrier:
1135  case Intrinsic::amdgcn_ds_gws_sema_v:
1136  case Intrinsic::amdgcn_ds_gws_sema_br:
1137  case Intrinsic::amdgcn_ds_gws_sema_p:
1138  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1139  Info.opc = ISD::INTRINSIC_VOID;
1140 
1141  const GCNTargetMachine &TM =
1142  static_cast<const GCNTargetMachine &>(getTargetMachine());
1143 
1145  Info.ptrVal = MFI->getGWSPSV(TM);
1146 
1147  // This is an abstract access, but we need to specify a type and size.
1148  Info.memVT = MVT::i32;
1149  Info.size = 4;
1150  Info.align = Align(4);
1151 
1152  if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1154  else
1156  return true;
1157  }
1158  case Intrinsic::amdgcn_global_load_lds: {
1159  Info.opc = ISD::INTRINSIC_VOID;
1160  unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1161  Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1164  return true;
1165  }
1166  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1168 
1169  const GCNTargetMachine &TM =
1170  static_cast<const GCNTargetMachine &>(getTargetMachine());
1171 
1173  Info.ptrVal = MFI->getGWSPSV(TM);
1174 
1175  // This is an abstract access, but we need to specify a type and size.
1176  Info.memVT = MVT::i32;
1177  Info.size = 4;
1178  Info.align = Align(4);
1179 
1181  return true;
1182  }
1183  default:
1184  return false;
1185  }
1186 }
1187 
1188 bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
1189  SmallVectorImpl<Value*> &Ops,
1190  Type *&AccessTy) const {
1191  switch (II->getIntrinsicID()) {
1192  case Intrinsic::amdgcn_atomic_inc:
1193  case Intrinsic::amdgcn_atomic_dec:
1194  case Intrinsic::amdgcn_ds_ordered_add:
1195  case Intrinsic::amdgcn_ds_ordered_swap:
1196  case Intrinsic::amdgcn_ds_append:
1197  case Intrinsic::amdgcn_ds_consume:
1198  case Intrinsic::amdgcn_ds_fadd:
1199  case Intrinsic::amdgcn_ds_fmin:
1200  case Intrinsic::amdgcn_ds_fmax:
1201  case Intrinsic::amdgcn_global_atomic_fadd:
1202  case Intrinsic::amdgcn_flat_atomic_fadd:
1203  case Intrinsic::amdgcn_flat_atomic_fmin:
1204  case Intrinsic::amdgcn_flat_atomic_fmax:
1205  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1206  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1207  case Intrinsic::amdgcn_global_atomic_csub: {
1208  Value *Ptr = II->getArgOperand(0);
1209  AccessTy = II->getType();
1210  Ops.push_back(Ptr);
1211  return true;
1212  }
1213  default:
1214  return false;
1215  }
1216 }
1217 
1218 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
1219  if (!Subtarget->hasFlatInstOffsets()) {
1220  // Flat instructions do not have offsets, and only have the register
1221  // address.
1222  return AM.BaseOffs == 0 && AM.Scale == 0;
1223  }
1224 
1225  return AM.Scale == 0 &&
1226  (AM.BaseOffs == 0 ||
1227  Subtarget->getInstrInfo()->isLegalFLATOffset(
1229 }
1230 
1231 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1232  if (Subtarget->hasFlatGlobalInsts())
1233  return AM.Scale == 0 &&
1234  (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1237 
1238  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1239  // Assume that we will use FLAT for all global memory accesses
1240  // on VI.
1241  // FIXME: This assumption is currently wrong. On VI we still use
1242  // MUBUF instructions for the r + i addressing mode. As currently
1243  // implemented, the MUBUF instructions only work on buffer < 4GB.
1244  // It may be possible to support > 4GB buffers with MUBUF instructions,
1245  // by setting the stride value in the resource descriptor which would
1246  // increase the size limit to (stride * 4GB). However, this is risky,
1247  // because it has never been validated.
1248  return isLegalFlatAddressingMode(AM);
1249  }
1250 
1251  return isLegalMUBUFAddressingMode(AM);
1252 }
1253 
1254 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1255  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1256  // additionally can do r + r + i with addr64. 32-bit has more addressing
1257  // mode options. Depending on the resource constant, it can also do
1258  // (i64 r0) + (i32 r1) * (i14 i).
1259  //
1260  // Private arrays end up using a scratch buffer most of the time, so also
1261  // assume those use MUBUF instructions. Scratch loads / stores are currently
1262  // implemented as mubuf instructions with offen bit set, so slightly
1263  // different than the normal addr64.
1264  if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs))
1265  return false;
1266 
1267  // FIXME: Since we can split immediate into soffset and immediate offset,
1268  // would it make sense to allow any immediate?
1269 
1270  switch (AM.Scale) {
1271  case 0: // r + i or just i, depending on HasBaseReg.
1272  return true;
1273  case 1:
1274  return true; // We have r + r or r + i.
1275  case 2:
1276  if (AM.HasBaseReg) {
1277  // Reject 2 * r + r.
1278  return false;
1279  }
1280 
1281  // Allow 2 * r as r + r
1282  // Or 2 * r + i is allowed as r + r + i.
1283  return true;
1284  default: // Don't allow n * r
1285  return false;
1286  }
1287 }
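// A standalone, llvm-free sketch of the MUBUF addressing-mode rules above,
// assuming the 12-bit unsigned immediate byte offset described in the comment.
// The struct and helper names are invented for illustration only.
struct SketchAddrMode {
  long long BaseOffs;
  bool HasBaseReg;
  long long Scale;
};

constexpr bool sketchIsLegalMUBUFMode(const SketchAddrMode &AM) {
  if (AM.BaseOffs < 0 || AM.BaseOffs >= (1 << 12)) // 12-bit unsigned offset
    return false;
  if (AM.Scale == 0 || AM.Scale == 1)              // i, r + i, r + r, r + r + i
    return true;
  if (AM.Scale == 2)
    return !AM.HasBaseReg;                         // 2 * r folds to r + r (+ i)
  return false;                                    // no n * r addressing
}
// Example: base register + index register + offset 16 (Scale 1) is accepted.
static_assert(sketchIsLegalMUBUFMode({16, true, 1}), "r + r + i");
static_assert(!sketchIsLegalMUBUFMode({0, true, 2}), "2 * r + r is rejected");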
1288 
1289 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1290  const AddrMode &AM, Type *Ty,
1291  unsigned AS, Instruction *I) const {
1292  // No global is ever allowed as a base.
1293  if (AM.BaseGV)
1294  return false;
1295 
1296  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1297  return isLegalGlobalAddressingMode(AM);
1298 
1299  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1302  // If the offset isn't a multiple of 4, it probably isn't going to be
1303  // correctly aligned.
1304  // FIXME: Can we get the real alignment here?
1305  if (AM.BaseOffs % 4 != 0)
1306  return isLegalMUBUFAddressingMode(AM);
1307 
1308  // There are no SMRD extloads, so if we have to do a small type access we
1309  // will use a MUBUF load.
1310  // FIXME?: We also need to do this if unaligned, but we don't know the
1311  // alignment here.
1312  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1313  return isLegalGlobalAddressingMode(AM);
1314 
1315  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1316  // SMRD instructions have an 8-bit, dword offset on SI.
1317  if (!isUInt<8>(AM.BaseOffs / 4))
1318  return false;
1319  } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1320  // On CI+, this can also be a 32-bit literal constant offset. If it fits
1321  // in 8-bits, it can use a smaller encoding.
1322  if (!isUInt<32>(AM.BaseOffs / 4))
1323  return false;
1324  } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
1325  // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1326  if (!isUInt<20>(AM.BaseOffs))
1327  return false;
1328  } else
1329  llvm_unreachable("unhandled generation");
1330 
1331  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1332  return true;
1333 
1334  if (AM.Scale == 1 && AM.HasBaseReg)
1335  return true;
1336 
1337  return false;
1338 
1339  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1340  return isLegalMUBUFAddressingMode(AM);
1341  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1342  AS == AMDGPUAS::REGION_ADDRESS) {
1343  // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1344  // field.
1345  // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1346  // an 8-bit dword offset but we don't know the alignment here.
1347  if (!isUInt<16>(AM.BaseOffs))
1348  return false;
1349 
1350  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1351  return true;
1352 
1353  if (AM.Scale == 1 && AM.HasBaseReg)
1354  return true;
1355 
1356  return false;
1357  } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1359  // For an unknown address space, this usually means that this is for some
1360  // reason being used for pure arithmetic, and not based on some addressing
1361  // computation. We don't have instructions that compute pointers with any
1362  // addressing modes, so treat them as having no offset like flat
1363  // instructions.
1364  return isLegalFlatAddressingMode(AM);
1365  }
1366 
1367  // Assume a user alias of global for unknown address spaces.
1368  return isLegalGlobalAddressingMode(AM);
1369 }
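// A standalone sketch (invented names, llvm-free) of the per-generation SMRD
// immediate-offset checks used above for constant-address loads: SI encodes an
// 8-bit dword offset, CI allows a 32-bit dword literal, and VI+ uses a 20-bit
// byte offset.
enum class SketchGen { SI, CI, VIPlus };

constexpr bool sketchSMRDOffsetFits(SketchGen Gen, long long ByteOffset) {
  if (ByteOffset < 0)
    return false;
  switch (Gen) {
  case SketchGen::SI:
    return (ByteOffset / 4) < (1 << 8);      // isUInt<8>(BaseOffs / 4)
  case SketchGen::CI:
    return (ByteOffset / 4) <= 0xffffffffLL; // isUInt<32>(BaseOffs / 4)
  case SketchGen::VIPlus:
    return ByteOffset < (1 << 20);           // isUInt<20>(BaseOffs)
  }
  return false;
}
// Example: a 1024-byte offset (256 dwords) overflows SI's 8-bit dword field,
// but is still encodable on CI and VI.
static_assert(!sketchSMRDOffsetFits(SketchGen::SI, 1024), "SI");
static_assert(sketchSMRDOffsetFits(SketchGen::VIPlus, 1024), "VI");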
1370 
1371 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1372  const MachineFunction &MF) const {
1373  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1374  return (MemVT.getSizeInBits() <= 4 * 32);
1375  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1376  unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1377  return (MemVT.getSizeInBits() <= MaxPrivateBits);
1378  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1379  return (MemVT.getSizeInBits() <= 2 * 32);
1380  }
1381  return true;
1382 }
1383 
1384 bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1385  unsigned Size, unsigned AddrSpace, Align Alignment,
1386  MachineMemOperand::Flags Flags, bool *IsFast) const {
1387  if (IsFast)
1388  *IsFast = false;
1389 
1390  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1391  AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1392  // Check if alignment requirements for ds_read/write instructions are
1393  // disabled.
1394  if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1395  return false;
1396 
1397  Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1398  if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1399  Alignment < RequiredAlignment)
1400  return false;
1401 
1402  // Either the alignment requirements are "enabled", or there is an
1403  // unaligned-LDS-access-related hardware bug even though alignment requirements
1404  // are "disabled". In either case, we need to check for proper alignment
1405  // requirements.
1406  //
1407  switch (Size) {
1408  case 64:
1409  // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1410  // address is negative, then the instruction is incorrectly treated as
1411  // out-of-bounds even if base + offsets is in bounds. Split vectorized
1412  // loads here to avoid emitting ds_read2_b32. We may re-combine the
1413  // load later in the SILoadStoreOptimizer.
1414  if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1415  return false;
1416 
1417  // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1418  // can do a 4-byte aligned, 8-byte access in a single operation using
1419  // ds_read2/write2_b32 with adjacent offsets.
1420  RequiredAlignment = Align(4);
1421 
1422  if (Subtarget->hasUnalignedDSAccessEnabled()) {
1423  // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1424  // ds_write2_b32 depending on the alignment. In either case with either
1425  // alignment there is no faster way of doing this.
1426  if (IsFast)
1427  *IsFast = true;
1428  return true;
1429  }
1430 
1431  break;
1432  case 96:
1433  if (!Subtarget->hasDS96AndDS128())
1434  return false;
1435 
1436  // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1437  // gfx8 and older.
1438 
1439  if (Subtarget->hasUnalignedDSAccessEnabled()) {
1440  // Naturally aligned access is fastest. However, also report it as Fast
1441  // if memory is aligned to less than a dword. A narrow load or store will
1442  // be as slow as a single ds_read_b96/ds_write_b96, but there will
1443  // be more of them, so overall we will pay a smaller penalty by issuing a
1444  // single instruction.
1445  if (IsFast)
1446  *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
1447  return true;
1448  }
1449 
1450  break;
1451  case 128:
1452  if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1453  return false;
1454 
1455  // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1456  // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1457  // single operation using ds_read2/write2_b64.
1458  RequiredAlignment = Align(8);
1459 
1460  if (Subtarget->hasUnalignedDSAccessEnabled()) {
1461  // Naturally aligned access is fastest. However, also report it as Fast
1462  // if memory is aligned to less than a dword. A narrow load or store will
1463  // be as slow as a single ds_read_b128/ds_write_b128, but there
1464  // will be more of them, so overall we will pay a smaller penalty by
1465  // issuing a single instruction.
1466  if (IsFast)
1467  *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
1468  return true;
1469  }
1470 
1471  break;
1472  default:
1473  if (Size > 32)
1474  return false;
1475 
1476  break;
1477  }
1478 
1479  if (IsFast)
1480  *IsFast = Alignment >= RequiredAlignment;
1481 
1482  return Alignment >= RequiredAlignment ||
1483  Subtarget->hasUnalignedDSAccessEnabled();
1484  }
1485 
1486  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1487  bool AlignedBy4 = Alignment >= Align(4);
1488  if (IsFast)
1489  *IsFast = AlignedBy4;
1490 
1491  return AlignedBy4 ||
1492  Subtarget->enableFlatScratch() ||
1493  Subtarget->hasUnalignedScratchAccess();
1494  }
1495 
1496  // FIXME: We have to be conservative here and assume that flat operations
1497  // will access scratch. If we had access to the IR function, then we
1498  // could determine if any private memory was used in the function.
1499  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1500  !Subtarget->hasUnalignedScratchAccess()) {
1501  bool AlignedBy4 = Alignment >= Align(4);
1502  if (IsFast)
1503  *IsFast = AlignedBy4;
1504 
1505  return AlignedBy4;
1506  }
1507 
1508  if (Subtarget->hasUnalignedBufferAccessEnabled()) {
1509  // If we have a uniform constant load, it still requires using a slow
1510  // buffer instruction if unaligned.
1511  if (IsFast) {
1512  // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
1513  // 2-byte alignment is worse than 1 unless doing a 2-byte access.
1514  *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1515  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1516  Alignment >= Align(4) : Alignment != Align(2);
1517  }
1518 
1519  return true;
1520  }
1521 
1522  // Values smaller than a dword must be aligned.
1523  if (Size < 32)
1524  return false;
1525 
1526  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1527  // byte-address are ignored, thus forcing Dword alignment.
1528  // This applies to private, global, and constant memory.
1529  if (IsFast)
1530  *IsFast = true;
1531 
1532  return Size >= 32 && Alignment >= Align(4);
1533 }
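// A standalone sketch (invented helper) of the required LDS alignment per
// access size handled above, assuming unaligned DS access is disabled and
// ignoring the hardware-bug special cases: 8-byte accesses can be split into
// ds_read2/write2_b32 (4-byte alignment), 16-byte accesses into
// ds_read2/write2_b64 (8-byte alignment), and 12-byte accesses keep their
// natural 16-byte alignment.
constexpr unsigned sketchRequiredLDSAlign(unsigned SizeInBits) {
  switch (SizeInBits) {
  case 64:  return 4;  // ds_read2_b32 / ds_write2_b32 with adjacent offsets
  case 96:  return 16; // natural alignment for ds_read_b96 / ds_write_b96
  case 128: return 8;  // ds_read2_b64 / ds_write2_b64
  default:  return 4;  // with unaligned DS access disabled, at least a dword
  }
}
static_assert(sketchRequiredLDSAlign(128) == 8, "b128 via two b64 halves");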
1534 
1535 bool SITargetLowering::allowsMisalignedMemoryAccesses(
1536  EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1537  bool *IsFast) const {
1539  Alignment, Flags, IsFast);
1540 
1541  if (Allow && IsFast && Subtarget->hasUnalignedDSAccessEnabled() &&
1542  (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1543  AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
1544  // Lie that it is fast if +unaligned-access-mode is passed so that DS accesses
1545  // get vectorized. We could use ds_read2_b*/ds_write2_b* instructions on
1546  // misaligned data, which is faster than a pair of ds_read_b*/ds_write_b*
1547  // which would be equally misaligned.
1548  // This is only used by the common passes, selection always calls the
1549  // allowsMisalignedMemoryAccessesImpl version.
1550  *IsFast = true;
1551  }
1552 
1553  return Allow;
1554 }
1555 
1556 EVT SITargetLowering::getOptimalMemOpType(
1557  const MemOp &Op, const AttributeList &FuncAttributes) const {
1558  // FIXME: Should account for address space here.
1559 
1560  // The default fallback uses the private pointer size as a guess for a type to
1561  // use. Make sure we switch these to 64-bit accesses.
1562 
1563  if (Op.size() >= 16 &&
1564  Op.isDstAligned(Align(4))) // XXX: Should only do for global
1565  return MVT::v4i32;
1566 
1567  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1568  return MVT::v2i32;
1569 
1570  // Use the default.
1571  return MVT::Other;
1572 }
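// A standalone sketch of the memcpy/memset type choice above: with at least a
// dword-aligned destination, operations of 16 bytes or more use a v4i32-style
// 16-byte chunk and 8 bytes or more use a v2i32-style 8-byte chunk; anything
// else falls back to the generic default. The enum and helper are invented.
enum class SketchMemOpTy { V4I32, V2I32, Other };

constexpr SketchMemOpTy sketchOptimalMemOpType(unsigned long long Size,
                                               unsigned DstAlign) {
  if (Size >= 16 && DstAlign >= 4)
    return SketchMemOpTy::V4I32; // 16-byte chunks
  if (Size >= 8 && DstAlign >= 4)
    return SketchMemOpTy::V2I32; // 8-byte chunks
  return SketchMemOpTy::Other;   // let generic lowering pick a type
}
static_assert(sketchOptimalMemOpType(32, 4) == SketchMemOpTy::V4I32, "");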
1573 
1575  const MemSDNode *MemNode = cast<MemSDNode>(N);
1576  return MemNode->getMemOperand()->getFlags() & MONoClobber;
1577 }
1578 
1580  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1582 }
1583 
1585  unsigned DestAS) const {
1586  // Flat -> private/local is a simple truncate.
1587  // Flat -> global is a no-op.
1588  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1589  return true;
1590 
1591  const GCNTargetMachine &TM =
1592  static_cast<const GCNTargetMachine &>(getTargetMachine());
1593  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1594 }
1595 
1597  const MemSDNode *MemNode = cast<MemSDNode>(N);
1598 
1599  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1600 }
1601 
1604  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1608 }
1609 
1611  Type *Ty) const {
1612  // FIXME: Could be smarter if called for vector constants.
1613  return true;
1614 }
1615 
1617  unsigned Index) const {
1619  return false;
1620 
1621  // TODO: Add more cases that are cheap.
1622  return Index == 0;
1623 }
1624 
1626  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1627  switch (Op) {
1628  case ISD::LOAD:
1629  case ISD::STORE:
1630 
1631  // These operations are done with 32-bit instructions anyway.
1632  case ISD::AND:
1633  case ISD::OR:
1634  case ISD::XOR:
1635  case ISD::SELECT:
1636  // TODO: Extensions?
1637  return true;
1638  default:
1639  return false;
1640  }
1641  }
1642 
1643  // SimplifySetCC uses this function to determine whether or not it should
1644  // create setcc with i1 operands. We don't have instructions for i1 setcc.
1645  if (VT == MVT::i1 && Op == ISD::SETCC)
1646  return false;
1647 
1649 }
1650 
1651 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1652  const SDLoc &SL,
1653  SDValue Chain,
1654  uint64_t Offset) const {
1655  const DataLayout &DL = DAG.getDataLayout();
1656  MachineFunction &MF = DAG.getMachineFunction();
1658 
1659  const ArgDescriptor *InputPtrReg;
1660  const TargetRegisterClass *RC;
1661  LLT ArgTy;
1663 
1664  std::tie(InputPtrReg, RC, ArgTy) =
1666 
1667  // We may not have the kernarg segment argument if we have no kernel
1668  // arguments.
1669  if (!InputPtrReg)
1670  return DAG.getConstant(0, SL, PtrVT);
1671 
1673  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1674  MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1675 
1676  return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset));
1677 }
1678 
1679 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1680  const SDLoc &SL) const {
1682  FIRST_IMPLICIT);
1683  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1684 }
1685 
1686 SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1687  const SDLoc &SL) const {
1688 
1690  Optional<uint32_t> KnownSize =
1692  if (KnownSize.has_value())
1693  return DAG.getConstant(KnownSize.value(), SL, MVT::i32);
1694  return SDValue();
1695 }
1696 
1697 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1698  const SDLoc &SL, SDValue Val,
1699  bool Signed,
1700  const ISD::InputArg *Arg) const {
1701  // First, if it is a widened vector, narrow it.
1702  if (VT.isVector() &&
1703  VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1704  EVT NarrowedVT =
1706  VT.getVectorNumElements());
1707  Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1708  DAG.getConstant(0, SL, MVT::i32));
1709  }
1710 
1711  // Then convert the vector elements or scalar value.
1712  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1713  VT.bitsLT(MemVT)) {
1714  unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1715  Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1716  }
1717 
1718  if (MemVT.isFloatingPoint())
1719  Val = getFPExtOrFPRound(DAG, Val, SL, VT);
1720  else if (Signed)
1721  Val = DAG.getSExtOrTrunc(Val, SL, VT);
1722  else
1723  Val = DAG.getZExtOrTrunc(Val, SL, VT);
1724 
1725  return Val;
1726 }
1727 
1728 SDValue SITargetLowering::lowerKernargMemParameter(
1729  SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
1730  uint64_t Offset, Align Alignment, bool Signed,
1731  const ISD::InputArg *Arg) const {
1733 
1734  // Try to avoid using an extload by loading earlier than the argument address,
1735  // and extracting the relevant bits. The load should hopefully be merged with
1736  // the previous argument.
1737  if (MemVT.getStoreSize() < 4 && Alignment < 4) {
1738  // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1739  int64_t AlignDownOffset = alignDown(Offset, 4);
1740  int64_t OffsetDiff = Offset - AlignDownOffset;
1741 
1742  EVT IntVT = MemVT.changeTypeToInteger();
1743 
1744  // TODO: If we passed in the base kernel offset we could have a better
1745  // alignment than 4, but we don't really need it.
1746  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1747  SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
1750 
1751  SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1752  SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1753 
1754  SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1755  ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1756  ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1757 
1758 
1759  return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1760  }
1761 
1762  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1763  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
1766 
1767  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1768  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1769 }
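// A standalone sketch of the sub-dword kernarg trick used above: load the
// containing dword from the aligned-down offset, then shift the wanted bytes
// into the low bits. The helper works on a plain 32-bit value and is invented
// for illustration; amdgcn is little-endian, so the byte at the lower address
// ends up in the lower bits of the loaded dword.
constexpr unsigned sketchExtractSubDwordArg(unsigned LoadedDword,
                                            unsigned long long ArgOffset) {
  unsigned long long AlignDownOffset = ArgOffset & ~3ULL; // alignDown(Offset, 4)
  unsigned long long OffsetDiff = ArgOffset - AlignDownOffset;
  return LoadedDword >> (OffsetDiff * 8);                 // SRL by OffsetDiff * 8
}
// Example: a byte argument at kernarg offset 6 is read from the dword at
// offset 4 and shifted right by 16 bits before being truncated to i8.
static_assert(sketchExtractSubDwordArg(0x00AB0000u, 6) == 0xAB, "");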
1770 
1771 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1772  const SDLoc &SL, SDValue Chain,
1773  const ISD::InputArg &Arg) const {
1774  MachineFunction &MF = DAG.getMachineFunction();
1775  MachineFrameInfo &MFI = MF.getFrameInfo();
1776 
1777  if (Arg.Flags.isByVal()) {
1778  unsigned Size = Arg.Flags.getByValSize();
1779  int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1780  return DAG.getFrameIndex(FrameIdx, MVT::i32);
1781  }
1782 
1783  unsigned ArgOffset = VA.getLocMemOffset();
1784  unsigned ArgSize = VA.getValVT().getStoreSize();
1785 
1786  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1787 
1788  // Create load nodes to retrieve arguments from the stack.
1789  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1790  SDValue ArgValue;
1791 
1792  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
1794  MVT MemVT = VA.getValVT();
1795 
1796  switch (VA.getLocInfo()) {
1797  default:
1798  break;
1799  case CCValAssign::BCvt:
1800  MemVT = VA.getLocVT();
1801  break;
1802  case CCValAssign::SExt:
1803  ExtType = ISD::SEXTLOAD;
1804  break;
1805  case CCValAssign::ZExt:
1806  ExtType = ISD::ZEXTLOAD;
1807  break;
1808  case CCValAssign::AExt:
1809  ExtType = ISD::EXTLOAD;
1810  break;
1811  }
1812 
1813  ArgValue = DAG.getExtLoad(
1814  ExtType, SL, VA.getLocVT(), Chain, FIN,
1816  MemVT);
1817  return ArgValue;
1818 }
1819 
1820 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1821  const SIMachineFunctionInfo &MFI,
1822  EVT VT,
1824  const ArgDescriptor *Reg;
1825  const TargetRegisterClass *RC;
1826  LLT Ty;
1827 
1828  std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
1829  if (!Reg) {
1830  if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
1831  // It's possible for a kernarg intrinsic call to appear in a kernel with
1832  // no allocated segment, in which case we do not add the user sgpr
1833  // argument, so just return null.
1834  return DAG.getConstant(0, SDLoc(), VT);
1835  }
1836 
1837  // It's undefined behavior if a function marked with the amdgpu-no-*
1838  // attributes uses the corresponding intrinsic.
1839  return DAG.getUNDEF(VT);
1840  }
1841 
1842  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1843 }
1844 
1845 static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1846  CallingConv::ID CallConv,
1847  ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
1848  FunctionType *FType,
1849  SIMachineFunctionInfo *Info) {
1850  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1851  const ISD::InputArg *Arg = &Ins[I];
1852 
1853  assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1854  "vector type argument should have been split");
1855 
1856  // First check if it's a PS input addr.
1857  if (CallConv == CallingConv::AMDGPU_PS &&
1858  !Arg->Flags.isInReg() && PSInputNum <= 15) {
1859  bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1860 
1861  // Inconveniently only the first part of the split is marked as isSplit,
1862  // so skip to the end. We only want to increment PSInputNum once for the
1863  // entire split argument.
1864  if (Arg->Flags.isSplit()) {
1865  while (!Arg->Flags.isSplitEnd()) {
1866  assert((!Arg->VT.isVector() ||
1867  Arg->VT.getScalarSizeInBits() == 16) &&
1868  "unexpected vector split in ps argument type");
1869  if (!SkipArg)
1870  Splits.push_back(*Arg);
1871  Arg = &Ins[++I];
1872  }
1873  }
1874 
1875  if (SkipArg) {
1876  // We can safely skip PS inputs.
1877  Skipped.set(Arg->getOrigArgIndex());
1878  ++PSInputNum;
1879  continue;
1880  }
1881 
1882  Info->markPSInputAllocated(PSInputNum);
1883  if (Arg->Used)
1884  Info->markPSInputEnabled(PSInputNum);
1885 
1886  ++PSInputNum;
1887  }
1888 
1889  Splits.push_back(*Arg);
1890  }
1891 }
1892 
1893 // Allocate special inputs passed in VGPRs.
1895  MachineFunction &MF,
1896  const SIRegisterInfo &TRI,
1897  SIMachineFunctionInfo &Info) const {
1898  const LLT S32 = LLT::scalar(32);
1900 
1901  if (Info.hasWorkItemIDX()) {
1902  Register Reg = AMDGPU::VGPR0;
1903  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1904 
1905  CCInfo.AllocateReg(Reg);
1906  unsigned Mask = (Subtarget->hasPackedTID() &&
1907  Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
1908  Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
1909  }
1910 
1911  if (Info.hasWorkItemIDY()) {
1912  assert(Info.hasWorkItemIDX());
1913  if (Subtarget->hasPackedTID()) {
1914  Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1915  0x3ff << 10));
1916  } else {
1917  unsigned Reg = AMDGPU::VGPR1;
1918  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1919 
1920  CCInfo.AllocateReg(Reg);
1921  Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1922  }
1923  }
1924 
1925  if (Info.hasWorkItemIDZ()) {
1926  assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
1927  if (Subtarget->hasPackedTID()) {
1928  Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1929  0x3ff << 20));
1930  } else {
1931  unsigned Reg = AMDGPU::VGPR2;
1932  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1933 
1934  CCInfo.AllocateReg(Reg);
1935  Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1936  }
1937  }
1938 }
1939 
1940 // Try to allocate a VGPR at the end of the argument list, or if no argument
1941 // VGPRs are left, allocate a stack slot instead.
1942 // If \p Mask is given, it indicates the bitfield position in the register.
1943 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
1944 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
1946  if (Arg.isSet())
1948 
1949  ArrayRef<MCPhysReg> ArgVGPRs
1950  = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1951  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1952  if (RegIdx == ArgVGPRs.size()) {
1953  // Spill to stack required.
1954  int64_t Offset = CCInfo.AllocateStack(4, Align(4));
1955 
1956  return ArgDescriptor::createStack(Offset, Mask);
1957  }
1958 
1959  unsigned Reg = ArgVGPRs[RegIdx];
1960  Reg = CCInfo.AllocateReg(Reg);
1961  assert(Reg != AMDGPU::NoRegister);
1962 
1963  MachineFunction &MF = CCInfo.getMachineFunction();
1964  Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1965  MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
1967 }
1968 
1970  const TargetRegisterClass *RC,
1971  unsigned NumArgRegs) {
1972  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1973  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1974  if (RegIdx == ArgSGPRs.size())
1975  report_fatal_error("ran out of SGPRs for arguments");
1976 
1977  unsigned Reg = ArgSGPRs[RegIdx];
1978  Reg = CCInfo.AllocateReg(Reg);
1979  assert(Reg != AMDGPU::NoRegister);
1980 
1981  MachineFunction &MF = CCInfo.getMachineFunction();
1982  MF.addLiveIn(Reg, RC);
1984 }
1985 
1986 // If this has a fixed position, we still should allocate the register in the
1987 // CCInfo state. Technically we could get away with this for values passed
1988 // outside of the normal argument range.
1990  const TargetRegisterClass *RC,
1991  MCRegister Reg) {
1992  Reg = CCInfo.AllocateReg(Reg);
1993  assert(Reg != AMDGPU::NoRegister);
1994  MachineFunction &MF = CCInfo.getMachineFunction();
1995  MF.addLiveIn(Reg, RC);
1996 }
1997 
1999  if (Arg) {
2000  allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2001  Arg.getRegister());
2002  } else
2003  Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2004 }
2005 
2007  if (Arg) {
2008  allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2009  Arg.getRegister());
2010  } else
2011  Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2012 }
2013 
2014 /// Allocate implicit function VGPR arguments at the end of allocated user
2015 /// arguments.
2017  CCState &CCInfo, MachineFunction &MF,
2018  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2019  const unsigned Mask = 0x3ff;
2021 
2022  if (Info.hasWorkItemIDX()) {
2023  Arg = allocateVGPR32Input(CCInfo, Mask);
2024  Info.setWorkItemIDX(Arg);
2025  }
2026 
2027  if (Info.hasWorkItemIDY()) {
2028  Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2029  Info.setWorkItemIDY(Arg);
2030  }
2031 
2032  if (Info.hasWorkItemIDZ())
2033  Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2034 }
2035 
2036 /// Allocate implicit function VGPR arguments in fixed registers.
2038  CCState &CCInfo, MachineFunction &MF,
2039  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2040  Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2041  if (!Reg)
2042  report_fatal_error("failed to allocate VGPR for implicit arguments");
2043 
2044  const unsigned Mask = 0x3ff;
2045  Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2046  Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2047  Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2048 }
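// A standalone sketch of the packed workitem-ID layout assumed above when all
// three IDs share one VGPR: bits [9:0] hold X, [19:10] hold Y, and [29:20]
// hold Z, matching the 0x3ff masks and shifts used for the ArgDescriptors.
// The struct and helper names are invented for illustration.
struct SketchWorkItemIDs { unsigned X, Y, Z; };

constexpr SketchWorkItemIDs sketchUnpackWorkItemIDs(unsigned PackedTID) {
  return { PackedTID & 0x3ffu,           // Mask
           (PackedTID >> 10) & 0x3ffu,   // Mask << 10
           (PackedTID >> 20) & 0x3ffu }; // Mask << 20
}
static_assert(sketchUnpackWorkItemIDs((3u << 20) | (2u << 10) | 1u).Y == 2, "");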
2049 
2051  CCState &CCInfo,
2052  MachineFunction &MF,
2053  const SIRegisterInfo &TRI,
2054  SIMachineFunctionInfo &Info) const {
2055  auto &ArgInfo = Info.getArgInfo();
2056 
2057  // TODO: Unify handling with private memory pointers.
2058  if (Info.hasDispatchPtr())
2059  allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2060 
2061  if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5)
2062  allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2063 
2064  // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2065  // constant offset from the kernarg segment.
2066  if (Info.hasImplicitArgPtr())
2067  allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2068 
2069  if (Info.hasDispatchID())
2070  allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2071 
2072  // flat_scratch_init is not applicable for non-kernel functions.
2073 
2074  if (Info.hasWorkGroupIDX())
2075  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2076 
2077  if (Info.hasWorkGroupIDY())
2078  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2079 
2080  if (Info.hasWorkGroupIDZ())
2081  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2082 
2083  if (Info.hasLDSKernelId())
2084  allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2085 }
2086 
2087 // Allocate special inputs passed in user SGPRs.
2089  MachineFunction &MF,
2090  const SIRegisterInfo &TRI,
2091  SIMachineFunctionInfo &Info) const {
2092  if (Info.hasImplicitBufferPtr()) {
2093  Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2094  MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2095  CCInfo.AllocateReg(ImplicitBufferPtrReg);
2096  }
2097 
2098  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2099  if (Info.hasPrivateSegmentBuffer()) {
2100  Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2101  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2102  CCInfo.AllocateReg(PrivateSegmentBufferReg);
2103  }
2104 
2105  if (Info.hasDispatchPtr()) {
2106  Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2107  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2108  CCInfo.AllocateReg(DispatchPtrReg);
2109  }
2110 
2111  if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
2112  Register QueuePtrReg = Info.addQueuePtr(TRI);
2113  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2114  CCInfo.AllocateReg(QueuePtrReg);
2115  }
2116 
2117  if (Info.hasKernargSegmentPtr()) {
2119  Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2120  CCInfo.AllocateReg(InputPtrReg);
2121 
2122  Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2124  }
2125 
2126  if (Info.hasDispatchID()) {
2127  Register DispatchIDReg = Info.addDispatchID(TRI);
2128  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2129  CCInfo.AllocateReg(DispatchIDReg);
2130  }
2131 
2132  if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2133  Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2134  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2135  CCInfo.AllocateReg(FlatScratchInitReg);
2136  }
2137 
2138  if (Info.hasLDSKernelId()) {
2139  Register Reg = Info.addLDSKernelId();
2140  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2141  CCInfo.AllocateReg(Reg);
2142  }
2143 
2144  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2145  // these from the dispatch pointer.
2146 }
2147 
2148 // Allocate special input registers that are initialized per-wave.
2150  MachineFunction &MF,
2152  CallingConv::ID CallConv,
2153  bool IsShader) const {
2154  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2155  // Note: user SGPRs are handled by the front-end for graphics shaders.
2156  // Pad up the used user SGPRs with dead inputs.
2157  unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2158 
2159  // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2160  // rely on it to reach 16 since if we end up having no stack usage, it will
2161  // not really be added.
2162  unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2163  Info.hasWorkGroupIDY() +
2164  Info.hasWorkGroupIDZ() +
2165  Info.hasWorkGroupInfo();
2166  for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2167  Register Reg = Info.addReservedUserSGPR();
2168  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2169  CCInfo.AllocateReg(Reg);
2170  }
2171  }
2172 
2173  if (Info.hasWorkGroupIDX()) {
2174  Register Reg = Info.addWorkGroupIDX();
2175  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2176  CCInfo.AllocateReg(Reg);
2177  }
2178 
2179  if (Info.hasWorkGroupIDY()) {
2180  Register Reg = Info.addWorkGroupIDY();
2181  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2182  CCInfo.AllocateReg(Reg);
2183  }
2184 
2185  if (Info.hasWorkGroupIDZ()) {
2186  Register Reg = Info.addWorkGroupIDZ();
2187  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2188  CCInfo.AllocateReg(Reg);
2189  }
2190 
2191  if (Info.hasWorkGroupInfo()) {
2192  Register Reg = Info.addWorkGroupInfo();
2193  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2194  CCInfo.AllocateReg(Reg);
2195  }
2196 
2197  if (Info.hasPrivateSegmentWaveByteOffset()) {
2198  // Scratch wave offset passed in system SGPR.
2199  unsigned PrivateSegmentWaveByteOffsetReg;
2200 
2201  if (IsShader) {
2202  PrivateSegmentWaveByteOffsetReg =
2203  Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2204 
2205  // This is true if the scratch wave byte offset doesn't have a fixed
2206  // location.
2207  if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2208  PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2209  Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2210  }
2211  } else
2212  PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2213 
2214  MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2215  CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2216  }
2217 
2218  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2219  Info.getNumPreloadedSGPRs() >= 16);
2220 }
2221 
2222 void SITargetLowering::reservePrivateMemoryRegs(const TargetMachine &TM,
2223  MachineFunction &MF,
2224  const SIRegisterInfo &TRI,
2225  SIMachineFunctionInfo &Info) const {
2226  // Now that we've figured out where the scratch register inputs are, see if
2227  // we should reserve the arguments and use them directly.
2228  MachineFrameInfo &MFI = MF.getFrameInfo();
2229  bool HasStackObjects = MFI.hasStackObjects();
2230  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2231 
2232  // Record that we know we have non-spill stack objects so we don't need to
2233  // check all stack objects later.
2234  if (HasStackObjects)
2235  Info.setHasNonSpillStackObjects(true);
2236 
2237  // Everything live out of a block is spilled with fast regalloc, so it's
2238  // almost certain that spilling will be required.
2239  if (TM.getOptLevel() == CodeGenOpt::None)
2240  HasStackObjects = true;
2241 
2242  // For now assume stack access is needed in any callee functions, so we need
2243  // the scratch registers to pass in.
2244  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2245 
2246  if (!ST.enableFlatScratch()) {
2247  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2248  // If we have stack objects, we unquestionably need the private buffer
2249  // resource. For the Code Object V2 ABI, this will be the first 4 user
2250  // SGPR inputs. We can reserve those and use them directly.
2251 
2252  Register PrivateSegmentBufferReg =
2253  Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2254  Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2255  } else {
2256  unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2257  // We tentatively reserve the last registers (skipping the last few, which
2258  // may contain VCC, FLAT_SCR, and XNACK). After register allocation, we'll
2259  // replace these with the ones immediately after those which were really
2260  // allocated. In the prologue, copies will be inserted from the argument to
2261  // these reserved registers.
2262 
2263  // Without HSA, relocations are used for the scratch pointer and the
2264  // buffer resource setup is always inserted in the prologue. Scratch wave
2265  // offset is still in an input SGPR.
2266  Info.setScratchRSrcReg(ReservedBufferReg);
2267  }
2268  }
2269 
2270  MachineRegisterInfo &MRI = MF.getRegInfo();
2271 
2272  // For entry functions we have to set up the stack pointer if we use it,
2273  // whereas non-entry functions get this "for free". This means there is no
2274  // intrinsic advantage to using S32 over S34 in cases where we do not have
2275  // calls but do need a frame pointer (i.e. if we are requested to have one
2276  // because frame pointer elimination is disabled). To keep things simple we
2277  // only ever use S32 as the call ABI stack pointer, and so using it does not
2278  // imply we need a separate frame pointer.
2279  //
2280  // Try to use s32 as the SP, but move it if it would interfere with input
2281  // arguments. This won't work with calls though.
2282  //
2283  // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2284  // registers.
2285  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2286  Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2287  } else {
2289 
2290  if (MFI.hasCalls())
2291  report_fatal_error("call in graphics shader with too many input SGPRs");
2292 
2293  for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2294  if (!MRI.isLiveIn(Reg)) {
2295  Info.setStackPtrOffsetReg(Reg);
2296  break;
2297  }
2298  }
2299 
2300  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2301  report_fatal_error("failed to find register for SP");
2302  }
2303 
2304  // hasFP should be accurate for entry functions even before the frame is
2305  // finalized, because it does not rely on the known stack size, only
2306  // properties like whether variable sized objects are present.
2307  if (ST.getFrameLowering()->hasFP(MF)) {
2308  Info.setFrameOffsetReg(AMDGPU::SGPR33);
2309  }
2310 }
2311 
2312 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
2313  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2314  return !Info->isEntryFunction();
2315 }
2316 
2317 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
2318 
2319 }
2320 
2321 void SITargetLowering::insertCopiesSplitCSR(
2322  MachineBasicBlock *Entry,
2323  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2325 
2326  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2327  if (!IStart)
2328  return;
2329 
2330  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2331  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2332  MachineBasicBlock::iterator MBBI = Entry->begin();
2333  for (const MCPhysReg *I = IStart; *I; ++I) {
2334  const TargetRegisterClass *RC = nullptr;
2335  if (AMDGPU::SReg_64RegClass.contains(*I))
2336  RC = &AMDGPU::SGPR_64RegClass;
2337  else if (AMDGPU::SReg_32RegClass.contains(*I))
2338  RC = &AMDGPU::SGPR_32RegClass;
2339  else
2340  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2341 
2342  Register NewVR = MRI->createVirtualRegister(RC);
2343  // Create copy from CSR to a virtual register.
2344  Entry->addLiveIn(*I);
2345  BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2346  .addReg(*I);
2347 
2348  // Insert the copy-back instructions right before the terminator.
2349  for (auto *Exit : Exits)
2350  BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2351  TII->get(TargetOpcode::COPY), *I)
2352  .addReg(NewVR);
2353  }
2354 }
2355 
2356 SDValue SITargetLowering::LowerFormalArguments(
2357  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2358  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2359  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2361 
2362  MachineFunction &MF = DAG.getMachineFunction();
2363  const Function &Fn = MF.getFunction();
2364  FunctionType *FType = MF.getFunction().getFunctionType();
2365  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2366 
2367  if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2368  DiagnosticInfoUnsupported NoGraphicsHSA(
2369  Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2370  DAG.getContext()->diagnose(NoGraphicsHSA);
2371  return DAG.getEntryNode();
2372  }
2373 
2374  Info->allocateKnownAddressLDSGlobal(Fn);
2375 
2378  BitVector Skipped(Ins.size());
2379  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2380  *DAG.getContext());
2381 
2382  bool IsGraphics = AMDGPU::isGraphics(CallConv);
2383  bool IsKernel = AMDGPU::isKernel(CallConv);
2384  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2385 
2386  if (IsGraphics) {
2387  assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
2388  (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
2389  !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2390  !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
2391  !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2392  !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2393  }
2394 
2395  if (CallConv == CallingConv::AMDGPU_PS) {
2396  processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2397 
2398  // At least one interpolation mode must be enabled or else the GPU will
2399  // hang.
2400  //
2401  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2402  // set PSInputAddr, the user wants to enable some bits after the compilation
2403  // based on run-time states. Since we can't know what the final PSInputEna
2404  // will look like, we shouldn't do anything here and the user should take
2405  // responsibility for the correct programming.
2406  //
2407  // Otherwise, the following restrictions apply:
2408  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2409  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2410  // enabled too.
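// [Editorial illustration, not in the original source] In terms of the masks
// used below: bits 0-3 of PSInputAddr are the PERSP_* enables (0xF), bits 4-6
// are the LINEAR_* enables (0x70), and bit 11 is POS_W_FLOAT. So
// (PSInputAddr & 0x7F) == 0 means "no interpolation mode requested at all",
// and (PSInputAddr & 0xF) == 0 with input 11 allocated means "POS_W_FLOAT
// without any PERSP_*"; both cases fall back to enabling input 0
// (PERSP_SAMPLE) and allocating VGPR0/VGPR1 so the hardware has something.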
2411  if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2412  ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2413  CCInfo.AllocateReg(AMDGPU::VGPR0);
2414  CCInfo.AllocateReg(AMDGPU::VGPR1);
2415  Info->markPSInputAllocated(0);
2416  Info->markPSInputEnabled(0);
2417  }
2418  if (Subtarget->isAmdPalOS()) {
2419  // For isAmdPalOS, the user does not enable some bits after compilation
2420  // based on run-time states; the register values being generated here are
2421  // the final ones set in hardware. Therefore we need to apply the
2422  // workaround to PSInputAddr and PSInputEnable together. (The case where
2423  // a bit is set in PSInputAddr but not PSInputEnable is where the
2424  // frontend set up an input arg for a particular interpolation mode, but
2425  // nothing uses that input arg. Really we should have an earlier pass
2426  // that removes such an arg.)
2427  unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2428  if ((PsInputBits & 0x7F) == 0 ||
2429  ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2430  Info->markPSInputEnabled(
2431  countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
2432  }
2433  } else if (IsKernel) {
2434  assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2435  } else {
2436  Splits.append(Ins.begin(), Ins.end());
2437  }
2438 
2439  if (IsEntryFunc) {
2440  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2441  allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2442  } else if (!IsGraphics) {
2443  // For the fixed ABI, pass workitem IDs in the last argument register.
2444  allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2445  }
2446 
2447  if (IsKernel) {
2448  analyzeFormalArgumentsCompute(CCInfo, Ins);
2449  } else {
2450  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2451  CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2452  }
2453 
2454  SmallVector<SDValue, 16> Chains;
2455 
2456  // FIXME: This is the minimum kernel argument alignment. We should improve
2457  // this to the maximum alignment of the arguments.
2458  //
2459  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2460  // kern arg offset.
2461  const Align KernelArgBaseAlign = Align(16);
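// [Editorial illustration, not in the original source] commonAlignment(Align(16),
// Offset) used below yields the largest power of two dividing both 16 and the
// offset: e.g. Offset 0 or 16 -> Align(16), Offset 8 -> Align(8),
// Offset 4 -> Align(4), Offset 6 -> Align(2).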
2462 
2463  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2464  const ISD::InputArg &Arg = Ins[i];
2465  if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2466  InVals.push_back(DAG.getUNDEF(Arg.VT));
2467  continue;
2468  }
2469 
2470  CCValAssign &VA = ArgLocs[ArgIdx++];
2471  MVT VT = VA.getLocVT();
2472 
2473  if (IsEntryFunc && VA.isMemLoc()) {
2474  VT = Ins[i].VT;
2475  EVT MemVT = VA.getLocVT();
2476 
2477  const uint64_t Offset = VA.getLocMemOffset();
2478  Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2479 
2480  if (Arg.Flags.isByRef()) {
2481  SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2482 
2483  const GCNTargetMachine &TM =
2484  static_cast<const GCNTargetMachine &>(getTargetMachine());
2485  if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2486  Arg.Flags.getPointerAddrSpace())) {
2487  Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
2488  Arg.Flags.getPointerAddrSpace());
2489  }
2490 
2491  InVals.push_back(Ptr);
2492  continue;
2493  }
2494 
2495  SDValue Arg = lowerKernargMemParameter(
2496  DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2497  Chains.push_back(Arg.getValue(1));
2498 
2499  auto *ParamTy =
2500  dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2501  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2502  ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2503  ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2504  // On SI, local pointers are just offsets into LDS, so they always fit in
2505  // 16 bits. On CI and newer they could potentially be real pointers, so we
2506  // can't guarantee their size.
2507  Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2508  DAG.getValueType(MVT::i16));
2509  }
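// [Editorial note, not in the original source] The AssertZext above records
// that the upper 16 bits of such an LDS/region pointer argument are known to
// be zero, which lets later DAG combines fold away redundant zero-extensions
// and masks on SI.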
2510 
2511  InVals.push_back(Arg);
2512  continue;
2513  } else if (!IsEntryFunc && VA.isMemLoc()) {
2514  SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2515  InVals.push_back(Val);
2516  if (!Arg.Flags.isByVal())
2517  Chains.push_back(Val.getValue(1));
2518  continue;
2519  }
2520 
2521  assert(VA.isRegLoc() && "Parameter must be in a register!");
2522 
2523  Register Reg = VA.getLocReg();
2524  const TargetRegisterClass *RC = nullptr;
2525  if (AMDGPU::VGPR_32RegClass.contains(Reg))
2526  RC = &AMDGPU::VGPR_32RegClass;
2527  else if (AMDGPU::SGPR_32RegClass.contains(Reg))
2528  RC = &AMDGPU::SGPR_32RegClass;
2529  else
2530  llvm_unreachable("Unexpected register class in LowerFormalArguments!");
2531  EVT ValVT = VA.getValVT();
2532 
2533  Reg = MF.addLiveIn(Reg, RC);
2534  SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2535 
2536  if (Arg.Flags.isSRet()) {
2537  // The return object should be reasonably addressable.
2538 
2539  // FIXME: This helps when the return is a real sret. If it is an
2540  // automatically inserted sret (i.e. CanLowerReturn returns false), an
2541  // extra copy is inserted in SelectionDAGBuilder which obscures this.
2542  unsigned NumBits
2543  = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
2544  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2545  DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2546  }
2547 
2548  // If this is an 8 or 16-bit value, it is really passed promoted
2549  // to 32 bits. Insert an assert[sz]ext to capture this, then
2550  // truncate to the right size.
2551  switch (VA.getLocInfo()) {
2552  case CCValAssign::Full:
2553  break;
2554  case CCValAssign::BCvt:
2555  Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2556  break;
2557  case CCValAssign::SExt:
2558  Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2559  DAG.getValueType(ValVT));
2560  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2561  break;
2562  case CCValAssign::ZExt:
2563  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2564  DAG.getValueType(ValVT));
2565  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2566  break;
2567  case CCValAssign::AExt:
2568  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2569  break;
2570  default:
2571  llvm_unreachable("Unknown loc info!");
2572  }
2573 
2574  InVals.push_back(Val);
2575  }
2576 
2577  // Start adding system SGPRs.
2578  if (IsEntryFunc) {
2579  allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
2580  } else {
2581  CCInfo.AllocateReg(Info->getScratchRSrcReg());
2582  if (!IsGraphics)
2583  allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2584  }
2585 
2586  auto &ArgUsageInfo =
2587  DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2588  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2589 
2590  unsigned StackArgSize = CCInfo.getNextStackOffset();
2591  Info->setBytesInStackArgArea(StackArgSize);
2592 
2593  return Chains.empty() ? Chain :
2594  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2595 }
2596 
2597 // TODO: If return values can't fit in registers, we should return as many as
2598 // possible in registers before passing on stack.
2599 bool SITargetLowering::CanLowerReturn(
2600  CallingConv::ID CallConv,
2601  MachineFunction &MF, bool IsVarArg,
2602  const SmallVectorImpl<ISD::OutputArg> &Outs,
2603  LLVMContext &Context) const {
2604  // Replacing returns with sret/stack usage doesn't make sense for shaders.
2605  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2606  // for shaders. Vector types should be explicitly handled by CC.
2607  if (AMDGPU::isEntryFunctionCC(CallConv))
2608  return true;
2609 
2610  SmallVector<CCValAssign, 16> RVLocs;
2611  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2612  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2613 }
2614 
2615 SDValue
2616 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2617  bool isVarArg,
2618  const SmallVectorImpl<ISD::OutputArg> &Outs,
2619  const SmallVectorImpl<SDValue> &OutVals,
2620  const SDLoc &DL, SelectionDAG &DAG) const {
2621  MachineFunction &MF = DAG.getMachineFunction();
2622  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2623 
2624  if (AMDGPU::isKernel(CallConv)) {
2625  return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2626  OutVals, DL, DAG);
2627  }
2628 
2629  bool IsShader = AMDGPU::isShader(CallConv);
2630 
2631  Info->setIfReturnsVoid(Outs.empty());
2632  bool IsWaveEnd = Info->returnsVoid() && IsShader;
2633 
2634  // CCValAssign - represent the assignment of the return value to a location.
2635  SmallVector<CCValAssign, 48> RVLocs;
2636  SmallVector<ISD::OutputArg, 48> Splits;
2637 
2638  // CCState - Info about the registers and stack slots.
2639  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2640  *DAG.getContext());
2641 
2642  // Analyze outgoing return values.
2643  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2644 
2645  SDValue Flag;
2646  SmallVector<SDValue, 48> RetOps;
2647  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2648 
2649  // Copy the result values into the output registers.
2650  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2651  ++I, ++RealRVLocIdx) {
2652  CCValAssign &VA = RVLocs[I];
2653  assert(VA.isRegLoc() && "Can only return in registers!");
2654  // TODO: Partially return in registers if return values don't fit.
2655  SDValue Arg = OutVals[RealRVLocIdx];
2656 
2657  // Copied from other backends.
2658  switch (VA.getLocInfo()) {
2659  case CCValAssign::Full:
2660  break;
2661  case CCValAssign::BCvt:
2662  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2663  break;
2664  case CCValAssign::SExt:
2665  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2666  break;
2667  case CCValAssign::ZExt:
2668  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2669  break;
2670  case CCValAssign::AExt:
2671  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2672  break;
2673  default:
2674  llvm_unreachable("Unknown loc info!");
2675  }
2676 
2677  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2678  Flag = Chain.getValue(1);
2679  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2680  }
2681 
2682  // FIXME: Does sret work properly?
2683  if (!Info->isEntryFunction()) {
2684  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2685  const MCPhysReg *I =
2686  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2687  if (I) {
2688  for (; *I; ++I) {
2689  if (AMDGPU::SReg_64RegClass.contains(*I))
2690  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2691  else if (AMDGPU::SReg_32RegClass.contains(*I))
2692  RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2693  else
2694  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2695  }
2696  }
2697  }
2698 
2699  // Update chain and glue.
2700  RetOps[0] = Chain;
2701  if (Flag.getNode())
2702  RetOps.push_back(Flag);
2703 
2704  unsigned Opc = AMDGPUISD::ENDPGM;
2705  if (!IsWaveEnd)
2707  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2708 }
2709 
2710 SDValue SITargetLowering::LowerCallResult(
2711  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2712  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2713  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2714  SDValue ThisVal) const {
2715  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2716 
2717  // Assign locations to each value returned by this call.
2718  SmallVector<CCValAssign, 16> RVLocs;
2719  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2720  *DAG.getContext());
2721  CCInfo.AnalyzeCallResult(Ins, RetCC);
2722 
2723  // Copy all of the result registers out of their specified physreg.
2724  for (unsigned i = 0; i != RVLocs.size(); ++i) {
2725  CCValAssign VA = RVLocs[i];
2726  SDValue Val;
2727 
2728  if (VA.isRegLoc()) {
2729  Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2730  Chain = Val.getValue(1);
2731  InFlag = Val.getValue(2);
2732  } else if (VA.isMemLoc()) {
2733  report_fatal_error("TODO: return values in memory");
2734  } else
2735  llvm_unreachable("unknown argument location type");
2736 
2737  switch (VA.getLocInfo()) {
2738  case CCValAssign::Full:
2739  break;
2740  case CCValAssign::BCvt:
2741  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2742  break;
2743  case CCValAssign::ZExt:
2744  Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2745  DAG.getValueType(VA.getValVT()));
2746  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2747  break;
2748  case CCValAssign::SExt:
2749  Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2750  DAG.getValueType(VA.getValVT()));
2751  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2752  break;
2753  case CCValAssign::AExt:
2754  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2755  break;
2756  default:
2757  llvm_unreachable("Unknown loc info!");
2758  }
2759 
2760  InVals.push_back(Val);
2761  }
2762 
2763  return Chain;
2764 }
2765 
2766 // Add code to pass special inputs required depending on used features separate
2767 // from the explicit user arguments present in the IR.
2768 void SITargetLowering::passSpecialInputs(
2769  CallLoweringInfo &CLI,
2770  CCState &CCInfo,
2771  const SIMachineFunctionInfo &Info,
2772  SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2773  SmallVectorImpl<SDValue> &MemOpChains,
2774  SDValue Chain) const {
2775  // If we don't have a call site, this was a call inserted by
2776  // legalization. These can never use special inputs.
2777  if (!CLI.CB)
2778  return;
2779 
2780  SelectionDAG &DAG = CLI.DAG;
2781  const SDLoc &DL = CLI.DL;
2782  const Function &F = DAG.getMachineFunction().getFunction();
2783 
2784  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2785  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2786 
2787  const AMDGPUFunctionArgInfo *CalleeArgInfo
2788  = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
2789  if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
2790  auto &ArgUsageInfo =
2791  DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2792  CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2793  }
2794 
2795  // TODO: Unify with private memory register handling. This is complicated by
2796  // the fact that at least in kernels, the input argument is not necessarily
2797  // in the same location as the input.
2798  static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
2799  StringLiteral> ImplicitAttrs[] = {
2800  {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
2801  {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
2802  {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
2803  {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
2804  {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
2805  {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
2806  {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
2807  {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
2808  };
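// [Editorial note, not in the original source] Each entry above pairs an ABI
// "preloaded" value with the attribute that proves the callee never reads it.
// For example, if the call site (or callee) carries "amdgpu-no-dispatch-id",
// the loop below skips copying the DISPATCH_ID SGPR into an outgoing argument.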
2809 
2810  for (auto Attr : ImplicitAttrs) {
2811  const ArgDescriptor *OutgoingArg;
2812  const TargetRegisterClass *ArgRC;
2813  LLT ArgTy;
2814 
2815  AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
2816 
2817  // If the callee does not use the attribute value, skip copying the value.
2818  if (CLI.CB->hasFnAttr(Attr.second))
2819  continue;
2820 
2821  std::tie(OutgoingArg, ArgRC, ArgTy) =
2822  CalleeArgInfo->getPreloadedValue(InputID);
2823  if (!OutgoingArg)
2824  continue;
2825 
2826  const ArgDescriptor *IncomingArg;
2827  const TargetRegisterClass *IncomingArgRC;
2828  LLT Ty;
2829  std::tie(IncomingArg, IncomingArgRC, Ty) =
2830  CallerArgInfo.getPreloadedValue(InputID);
2831  assert(IncomingArgRC == ArgRC);
2832 
2833  // All special arguments are ints for now.
2834  EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2835  SDValue InputReg;
2836 
2837  if (IncomingArg) {
2838  InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2839  } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
2840  // The implicit arg ptr is special because it doesn't have a corresponding
2841  // input for kernels, and is computed from the kernarg segment pointer.
2842  InputReg = getImplicitArgPtr(DAG, DL);
2843  } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
2845  if (Id.has_value()) {
2846  InputReg = DAG.getConstant(Id.value(), DL, ArgVT);
2847  } else {
2848  InputReg = DAG.getUNDEF(ArgVT);
2849  }
2850  } else {
2851  // We may have proven the input wasn't needed, although the ABI still
2852  // requires it. We just need to allocate the register appropriately.
2853  InputReg = DAG.getUNDEF(ArgVT);
2854  }
2855 
2856  if (OutgoingArg->isRegister()) {
2857  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2858  if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
2859  report_fatal_error("failed to allocate implicit input argument");
2860  } else {
2861  unsigned SpecialArgOffset =
2862  CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
2863  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2864  SpecialArgOffset);
2865  MemOpChains.push_back(ArgStore);
2866  }
2867  }
2868 
2869  // Pack workitem IDs into a single register, or pass them as-is if they are
2870  // already packed.
2871  const ArgDescriptor *OutgoingArg;
2872  const TargetRegisterClass *ArgRC;
2873  LLT Ty;
2874 
2875  std::tie(OutgoingArg, ArgRC, Ty) =
2876  CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2877  if (!OutgoingArg)
2878  std::tie(OutgoingArg, ArgRC, Ty) =
2879  CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2880  if (!OutgoingArg)
2881  std::tie(OutgoingArg, ArgRC, Ty) =
2882  CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2883  if (!OutgoingArg)
2884  return;
2885 
2886  const ArgDescriptor *IncomingArgX = std::get<0>(
2887  CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
2888  const ArgDescriptor *IncomingArgY = std::get<0>(
2889  CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
2890  const ArgDescriptor *IncomingArgZ = std::get<0>(
2891  CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
2892 
2893  SDValue InputReg;
2894  SDLoc SL;
2895 
2896  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
2897  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
2898  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
2899 
2900  // If incoming ids are not packed we need to pack them.
2901  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
2902  NeedWorkItemIDX) {
2903  if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
2904  InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
2905  } else {
2906  InputReg = DAG.getConstant(0, DL, MVT::i32);
2907  }
2908  }
2909 
2910  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
2911  NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
2912  SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
2913  Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
2914  DAG.getShiftAmountConstant(10, MVT::i32, SL));
2915  InputReg = InputReg.getNode() ?
2916  DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
2917  }
2918 
2919  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
2920  NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
2921  SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
2922  Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
2923  DAG.getShiftAmountConstant(20, MVT::i32, SL));
2924  InputReg = InputReg.getNode() ?
2925  DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
2926  }
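// [Editorial note, not in the original source] After the three blocks above, a
// packed workitem-ID register has X in bits [9:0], Y in bits [19:10] and Z in
// bits [29:20], matching the shifts by 10 and 20 used when OR-ing the
// components together.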
2927 
2928  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
2929  if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
2930  // We're in a situation where the outgoing function requires the workitem
2931  // ID, but the calling function does not have it (e.g. a graphics function
2932  // calling a C calling convention function). This is illegal, but we need
2933  // to produce something.
2934  InputReg = DAG.getUNDEF(MVT::i32);
2935  } else {
2936  // Workitem IDs are already packed; any of the present incoming arguments
2937  // will carry all required fields.
2938  ArgDescriptor IncomingArg = ArgDescriptor::createArg(
2939  IncomingArgX ? *IncomingArgX :
2940  IncomingArgY ? *IncomingArgY :
2941  *IncomingArgZ, ~0u);
2942  InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
2943  }
2944  }
2945 
2946  if (OutgoingArg->isRegister()) {
2947  if (InputReg)
2948  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2949 
2950  CCInfo.AllocateReg(OutgoingArg->getRegister());
2951  } else {
2952  unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
2953  if (InputReg) {
2954  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2955  SpecialArgOffset);
2956  MemOpChains.push_back(ArgStore);
2957  }
2958  }
2959 }
2960 
2961 static bool canGuaranteeTCO(CallingConv::ID CC) {
2962  return CC == CallingConv::Fast;
2963 }
2964 
2965 /// Return true if we might ever do TCO for calls with this calling convention.
2966 static bool mayTailCallThisCC(CallingConv::ID CC) {
2967  switch (CC) {
2968  case CallingConv::C:
2969  case CallingConv::AMDGPU_Gfx:
2970  return true;
2971  default:
2972  return canGuaranteeTCO(CC);
2973  }
2974 }
2975 
2976 bool SITargetLowering::isEligibleForTailCallOptimization(
2977  SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2978  const SmallVectorImpl<ISD::OutputArg> &Outs,
2979  const SmallVectorImpl<SDValue> &OutVals,
2980  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2981  if (!mayTailCallThisCC(CalleeCC))
2982  return false;
2983 
2984  // For a divergent call target, we need to do a waterfall loop over the
2985  // possible callees which precludes us from using a simple jump.
2986  if (Callee->isDivergent())
2987  return false;
2988 
2989  MachineFunction &MF = DAG.getMachineFunction();
2990  const Function &CallerF = MF.getFunction();
2991  CallingConv::ID CallerCC = CallerF.getCallingConv();
2993  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2994 
2995  // Kernels aren't callable, and don't have a live in return address so it
2996  // doesn't make sense to do a tail call with entry functions.
2997  if (!CallerPreserved)
2998  return false;
2999 
3000  bool CCMatch = CallerCC == CalleeCC;
3001 
3002  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3003  if (canGuaranteeTCO(CalleeCC) && CCMatch)
3004  return true;
3005  return false;
3006  }
3007 
3008  // TODO: Can we handle var args?
3009  if (IsVarArg)
3010  return false;
3011 
3012  for (const Argument &Arg : CallerF.args()) {
3013  if (Arg.hasByValAttr())
3014  return false;
3015  }
3016 
3017  LLVMContext &Ctx = *DAG.getContext();
3018 
3019  // Check that the call results are passed in the same way.
3020  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3021  CCAssignFnForCall(CalleeCC, IsVarArg),
3022  CCAssignFnForCall(CallerCC, IsVarArg)))
3023  return false;
3024 
3025  // The callee has to preserve all registers the caller needs to preserve.
3026  if (!CCMatch) {
3027  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3028  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3029  return false;
3030  }
3031 
3032  // Nothing more to check if the callee is taking no arguments.
3033  if (Outs.empty())
3034  return true;
3035 
3036  SmallVector<CCValAssign, 16> ArgLocs;
3037  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3038 
3039  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3040 
3041  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3042  // If the stack arguments for this call do not fit into our own save area then
3043  // the call cannot be made tail.
3044  // TODO: Is this really necessary?
3045  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3046  return false;
3047 
3048  const MachineRegisterInfo &MRI = MF.getRegInfo();
3049  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3050 }
3051 
3052 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3053  if (!CI->isTailCall())
3054  return false;
3055 
3056  const Function *ParentFn = CI->getParent()->getParent();
3057  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3058  return false;
3059  return true;
3060 }
3061 
3062 // The wave scratch offset register is used as the global base pointer.
3063 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3064  SmallVectorImpl<SDValue> &InVals) const {
3065  SelectionDAG &DAG = CLI.DAG;
3066  const SDLoc &DL = CLI.DL;
3067  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3068  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3069  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3070  SDValue Chain = CLI.Chain;
3071  SDValue Callee = CLI.Callee;
3072  bool &IsTailCall = CLI.IsTailCall;
3073  CallingConv::ID CallConv = CLI.CallConv;
3074  bool IsVarArg = CLI.IsVarArg;
3075  bool IsSibCall = false;
3076  bool IsThisReturn = false;
3077  MachineFunction &MF = DAG.getMachineFunction();
3078 
3079  if (Callee.isUndef() || isNullConstant(Callee)) {
3080  if (!CLI.IsTailCall) {
3081  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3082  InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3083  }
3084 
3085  return Chain;
3086  }
3087 
3088  if (IsVarArg) {
3089  return lowerUnhandledCall(CLI, InVals,
3090  "unsupported call to variadic function ");
3091  }
3092 
3093  if (!CLI.CB)
3094  report_fatal_error("unsupported libcall legalization");
3095 
3096  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3097  return lowerUnhandledCall(CLI, InVals,
3098  "unsupported required tail call to function ");
3099  }
3100 
3101  if (AMDGPU::isShader(CallConv)) {
3102  // Note the issue is with the CC of the called function, not of the call
3103  // itself.
3104  return lowerUnhandledCall(CLI, InVals,
3105  "unsupported call to a shader function ");
3106  }
3107 
3108  if (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
3109  CallConv != CallingConv::AMDGPU_Gfx) {
3110  // Only allow calls with specific calling conventions.
3111  return lowerUnhandledCall(CLI, InVals,
3112  "unsupported calling convention for call from "
3113  "graphics shader of function ");
3114  }
3115 
3116  if (IsTailCall) {
3117  IsTailCall = isEligibleForTailCallOptimization(
3118  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3119  if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
3120  report_fatal_error("failed to perform tail call elimination on a call "
3121  "site marked musttail");
3122  }
3123 
3124  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3125 
3126  // A sibling call is one where we're under the usual C ABI and not planning
3127  // to change that but can still do a tail call:
3128  if (!TailCallOpt && IsTailCall)
3129  IsSibCall = true;
3130 
3131  if (IsTailCall)
3132  ++NumTailCalls;
3133  }
3134 
3137  SmallVector<SDValue, 8> MemOpChains;
3138 
3139  // Analyze operands of the call, assigning locations to each operand.
3140  SmallVector<CCValAssign, 16> ArgLocs;
3141  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3142  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3143 
3144  if (CallConv != CallingConv::AMDGPU_Gfx) {
3145  // With a fixed ABI, allocate fixed registers before user arguments.
3146  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3147  }
3148 
3149  CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3150 
3151  // Get a count of how many bytes are to be pushed on the stack.
3152  unsigned NumBytes = CCInfo.getNextStackOffset();
3153 
3154  if (IsSibCall) {
3155  // Since we're not changing the ABI to make this a tail call, the memory
3156  // operands are already available in the caller's incoming argument space.
3157  NumBytes = 0;
3158  }
3159 
3160  // FPDiff is the byte offset of the call's argument area from the callee's.
3161  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3162  // by this amount for a tail call. In a sibling call it must be 0 because the
3163  // caller will deallocate the entire stack and the callee still expects its
3164  // arguments to begin at SP+0. Completely unused for non-tail calls.
3165  int32_t FPDiff = 0;
3166  MachineFrameInfo &MFI = MF.getFrameInfo();
3167 
3168  // Adjust the stack pointer for the new arguments...
3169  // These operations are automatically eliminated by the prolog/epilog pass
3170  if (!IsSibCall) {
3171  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3172 
3173  if (!Subtarget->enableFlatScratch()) {
3174  SmallVector<SDValue, 4> CopyFromChains;
3175 
3176  // In the HSA case, this should be an identity copy.
3177  SDValue ScratchRSrcReg
3178  = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3179  RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
3180  CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3181  Chain = DAG.getTokenFactor(DL, CopyFromChains);
3182  }
3183  }
3184 
3185  MVT PtrVT = MVT::i32;
3186 
3187  // Walk the register/memloc assignments, inserting copies/loads.
3188  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3189  CCValAssign &VA = ArgLocs[i];
3190  SDValue Arg = OutVals[i];
3191 
3192  // Promote the value if needed.
3193  switch (VA.getLocInfo()) {
3194  case CCValAssign::Full:
3195  break;
3196  case CCValAssign::BCvt:
3197  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3198  break;
3199  case CCValAssign::ZExt:
3200  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3201  break;
3202  case CCValAssign::SExt:
3203  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3204  break;
3205  case CCValAssign::AExt:
3206  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3207  break;
3208  case CCValAssign::FPExt:
3209  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3210  break;
3211  default:
3212  llvm_unreachable("Unknown loc info!");
3213  }
3214 
3215  if (VA.isRegLoc()) {
3216  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3217  } else {
3218  assert(VA.isMemLoc());
3219 
3220  SDValue DstAddr;
3221  MachinePointerInfo DstInfo;
3222 
3223  unsigned LocMemOffset = VA.getLocMemOffset();
3224  int32_t Offset = LocMemOffset;
3225 
3226  SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3227  MaybeAlign Alignment;
3228 
3229  if (IsTailCall) {
3230  ISD::ArgFlagsTy Flags = Outs[i].Flags;
3231  unsigned OpSize = Flags.isByVal() ?
3232  Flags.getByValSize() : VA.getValVT().getStoreSize();
3233 
3234  // FIXME: We can have better than the minimum byval required alignment.
3235  Alignment =
3236  Flags.isByVal()
3237  ? Flags.getNonZeroByValAlign()
3238  : commonAlignment(Subtarget->getStackAlignment(), Offset);
3239 
3240  Offset = Offset + FPDiff;
3241  int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3242 
3243  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3244  DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3245 
3246  // Make sure any stack arguments overlapping with where we're storing
3247  // are loaded before this eventual operation. Otherwise they'll be
3248  // clobbered.
3249 
3250  // FIXME: Why is this really necessary? This seems to just result in a
3251  // lot of code to copy the stack arguments and write them back to the same
3252  // locations, which are supposed to be immutable?
3253  Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3254  } else {
3255  // Stores to the argument stack area are relative to the stack pointer.
3256  SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3257  MVT::i32);
3258  DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3259  DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3260  Alignment =
3261  commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3262  }
3263 
3264  if (Outs[i].Flags.isByVal()) {
3265  SDValue SizeNode =
3266  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3267  SDValue Cpy =
3268  DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3269  Outs[i].Flags.getNonZeroByValAlign(),
3270  /*isVol = */ false, /*AlwaysInline = */ true,
3271  /*isTailCall = */ false, DstInfo,
3273 
3274  MemOpChains.push_back(Cpy);
3275  } else {
3276  SDValue Store =
3277  DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3278  MemOpChains.push_back(Store);
3279  }
3280  }
3281  }
3282 
3283  if (!MemOpChains.empty())
3284  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3285 
3286  // Build a sequence of copy-to-reg nodes chained together with token chain
3287  // and flag operands which copy the outgoing args into the appropriate regs.
3288  SDValue InFlag;
3289  for (auto &RegToPass : RegsToPass) {
3290  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3291  RegToPass.second, InFlag);
3292  InFlag = Chain.getValue(1);
3293  }
3294 
3295 
3296  // We don't usually want to end the call-sequence here because we would tidy
3297  // the frame up *after* the call. However, in the ABI-changing tail-call case
3298  // we've carefully laid out the parameters so that when SP is reset they'll
3299  // be in the correct location.
3300  if (IsTailCall && !IsSibCall) {
3301  Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InFlag, DL);
3302  InFlag = Chain.getValue(1);
3303  }
3304 
3305  std::vector<SDValue> Ops;
3306  Ops.push_back(Chain);
3307  Ops.push_back(Callee);
3308  // Add a redundant copy of the callee global which will not be legalized, as
3309  // we need direct access to the callee later.
3310  if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3311  const GlobalValue *GV = GSD->getGlobal();
3312  Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3313  } else {
3314  Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3315  }
3316 
3317  if (IsTailCall) {
3318  // Each tail call may have to adjust the stack by a different amount, so
3319  // this information must travel along with the operation for eventual
3320  // consumption by emitEpilogue.
3321  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3322  }
3323 
3324  // Add argument registers to the end of the list so that they are known live
3325  // into the call.
3326  for (auto &RegToPass : RegsToPass) {
3327  Ops.push_back(DAG.getRegister(RegToPass.first,
3328  RegToPass.second.getValueType()));
3329  }
3330 
3331  // Add a register mask operand representing the call-preserved registers.
3332 
3333  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
3334  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3335  assert(Mask && "Missing call preserved mask for calling convention");
3336  Ops.push_back(DAG.getRegisterMask(Mask));
3337 
3338  if (InFlag.getNode())
3339  Ops.push_back(InFlag);
3340 
3341  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3342 
3343  // If we're doing a tail call, use a TC_RETURN here rather than an
3344  // actual call instruction.
3345  if (IsTailCall) {
3346  MFI.setHasTailCall();
3347  return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
3348  }
3349 
3350  // Returns a chain and a flag for retval copy to use.
3351  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3352  Chain = Call.getValue(0);
3353  InFlag = Call.getValue(1);
3354 
3355  uint64_t CalleePopBytes = NumBytes;
3356  Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InFlag, DL);
3357  if (!Ins.empty())
3358  InFlag = Chain.getValue(1);
3359 
3360  // Handle result values, copying them out of physregs into vregs that we
3361  // return.
3362  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3363  InVals, IsThisReturn,
3364  IsThisReturn ? OutVals[0] : SDValue());
3365 }
3366 
3367 // This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3368 // except for applying the wave size scale to the increment amount.
3369 SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
3370  SDValue Op, SelectionDAG &DAG) const {
3371  const MachineFunction &MF = DAG.getMachineFunction();
3372  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3373 
3374  SDLoc dl(Op);
3375  EVT VT = Op.getValueType();
3376  SDValue Tmp1 = Op;
3377  SDValue Tmp2 = Op.getValue(1);
3378  SDValue Tmp3 = Op.getOperand(2);
3379  SDValue Chain = Tmp1.getOperand(0);
3380 
3381  Register SPReg = Info->getStackPtrOffsetReg();
3382 
3383  // Chain the dynamic stack allocation so that it doesn't modify the stack
3384  // pointer when other instructions are using the stack.
3385  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3386 
3387  SDValue Size = Tmp2.getOperand(1);
3388  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3389  Chain = SP.getValue(1);
3390  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3391  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3392  const TargetFrameLowering *TFL = ST.getFrameLowering();
3393  unsigned Opc =
3394  TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3395  ISD::ADD : ISD::SUB;
3396 
3397  SDValue ScaledSize = DAG.getNode(
3398  ISD::SHL, dl, VT, Size,
3399  DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
3400 
3401  Align StackAlign = TFL->getStackAlign();
3402  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3403  if (Alignment && *Alignment > StackAlign) {
3404  Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3405  DAG.getConstant(-(uint64_t)Alignment->value()
3406  << ST.getWavefrontSizeLog2(),
3407  dl, VT));
3408  }
3409 
3410  Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3411  Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
3412 
3413  return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3414 }
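// [Editorial illustration, not in the original source] Because the stack
// pointer is shared by all lanes of a wave, the per-lane allocation size is
// scaled by the wave size above: e.g. on a wave64 subtarget, an alloca of 16
// bytes per lane bumps SP by 16 << 6 = 1024 bytes, and any extra alignment is
// likewise applied in wave-scaled units.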
3415 
3416 SDValue SITargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
3417  SelectionDAG &DAG) const {
3418  // We only handle constant sizes here to allow non-entry block, static sized
3419  // allocas. A truly dynamic value is more difficult to support because we
3420  // don't know if the size value is uniform or not. If the size isn't uniform,
3421  // we would need to do a wave reduction to get the maximum size to know how
3422  // much to increment the uniform stack pointer.
3423  SDValue Size = Op.getOperand(1);
3424  if (isa<ConstantSDNode>(Size))
3425  return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3426 
3427  return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
3428 }
3429 
3430 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
3431  const MachineFunction &MF) const {
3432  Register Reg = StringSwitch<Register>(RegName)
3433  .Case("m0", AMDGPU::M0)
3434  .Case("exec", AMDGPU::EXEC)
3435  .Case("exec_lo", AMDGPU::EXEC_LO)
3436  .Case("exec_hi", AMDGPU::EXEC_HI)
3437  .Case("flat_scratch", AMDGPU::FLAT_SCR)
3438  .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
3439  .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
3440  .Default(Register());
3441 
3442  if (Reg == AMDGPU::NoRegister) {
3443  report_fatal_error(Twine("invalid register name \""
3444  + StringRef(RegName) + "\"."));
3445 
3446  }
3447 
3448  if (!Subtarget->hasFlatScrRegister() &&
3449  Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
3450  report_fatal_error(Twine("invalid register \""
3451  + StringRef(RegName) + "\" for subtarget."));
3452  }
3453 
3454  switch (Reg) {
3455  case AMDGPU::M0:
3456  case AMDGPU::EXEC_LO:
3457  case AMDGPU::EXEC_HI:
3458  case AMDGPU::FLAT_SCR_LO:
3459  case AMDGPU::FLAT_SCR_HI:
3460  if (VT.getSizeInBits() == 32)
3461  return Reg;
3462  break;
3463  case AMDGPU::EXEC:
3464  case AMDGPU::FLAT_SCR:
3465  if (VT.getSizeInBits() == 64)
3466  return Reg;
3467  break;
3468  default:
3469  llvm_unreachable("missing register type checking");
3470  }
3471 
3472  report_fatal_error(Twine("invalid type for register \""
3473  + StringRef(RegName) + "\"."));
3474 }
3475 
3476 // If kill is not the last instruction, split the block so kill is always a
3477 // proper terminator.
3478 MachineBasicBlock *
3479 SITargetLowering::splitKillBlock(MachineInstr &MI,
3480  MachineBasicBlock *BB) const {
3481  MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
3482  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3483  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
3484  return SplitBB;
3485 }
3486 
3487 // Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
3488 // \p MI will be the only instruction in the loop body block. Otherwise, it will
3489 // be the first instruction in the remainder block.
3490 //
3491 /// \returns { LoopBody, Remainder }
3492 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
3493 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
3494  MachineFunction *MF = MBB.getParent();
3495  MachineBasicBlock::iterator I(&MI);
3496 
3497  // To insert the loop we need to split the block. Move everything after this
3498  // point to a new block, and insert a new empty block between the two.
3499  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
3500  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
3501  MachineFunction::iterator MBBI(MBB);
3502  ++MBBI;
3503 
3504  MF->insert(MBBI, LoopBB);
3505  MF->insert(MBBI, RemainderBB);
3506 
3507  LoopBB->addSuccessor(LoopBB);
3508  LoopBB->addSuccessor(RemainderBB);
3509 
3510  // Move the rest of the block into a new block.
3511  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3512 
3513  if (InstInLoop) {
3514  auto Next = std::next(I);
3515 
3516  // Move instruction to loop body.
3517  LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
3518 
3519  // Move the rest of the block.
3520  RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
3521  } else {
3522  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3523  }
3524 
3525  MBB.addSuccessor(LoopBB);
3526 
3527  return std::make_pair(LoopBB, RemainderBB);
3528 }
3529 
3530 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
3531 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
3532  MachineBasicBlock *MBB = MI.getParent();
3533  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3534  auto I = MI.getIterator();
3535  auto E = std::next(I);
3536 
3537  BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
3538  .addImm(0);
3539 
3540  MIBundleBuilder Bundler(*MBB, I, E);
3541  finalizeBundle(*MBB, Bundler.begin());
3542 }
3543 
3544 MachineBasicBlock *
3545 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
3546  MachineBasicBlock *BB) const {
3547  const DebugLoc &DL = MI.getDebugLoc();
3548 
3549  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3550 
3551  MachineBasicBlock *LoopBB;
3552  MachineBasicBlock *RemainderBB;
3553  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3554 
3555  // Apparently kill flags are only valid if the def is in the same block?
3556  if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
3557  Src->setIsKill(false);
3558 
3559  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
3560 
3561  MachineBasicBlock::iterator I = LoopBB->end();
3562 
3563  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
3564  AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
3565 
3566  // Clear TRAP_STS.MEM_VIOL
3567  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
3568  .addImm(0)
3569  .addImm(EncodedReg);
3570 
3571  bundleInstWithWaitcnt(MI);
3572 
3573  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3574 
3575  // Load and check TRAP_STS.MEM_VIOL
3576  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
3577  .addImm(EncodedReg);
3578 
3579  // FIXME: Do we need to use an isel pseudo that may clobber scc?
3580  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
3581  .addReg(Reg, RegState::Kill)
3582  .addImm(0);
3583  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3584  .addMBB(LoopBB);
3585 
3586  return RemainderBB;
3587 }
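// [Editorial note, not in the original source] The loop built above first
// clears TRAP_STS.MEM_VIOL, runs the bundled GWS operation (with its
// S_WAITCNT), then re-reads the field with S_GETREG_B32, compares it against
// zero and branches back with S_CBRANCH_SCC1 while it is still set, i.e. the
// operation is retried until no memory violation is reported.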
3588 
3589 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
3590 // wavefront. If the value is uniform and just happens to be in a VGPR, this
3591 // will only do one iteration. In the worst case, this will loop 64 times.
3592 //
3593 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
3594 static MachineBasicBlock::iterator
3595 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
3596  MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
3597  const DebugLoc &DL, const MachineOperand &Idx,
3598  unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
3599  unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
3600  Register &SGPRIdxReg) {
3601 
3602  MachineFunction *MF = OrigBB.getParent();
3603  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3604  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3605  MachineBasicBlock::iterator I = LoopBB.begin();
3606 
3607  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3608  Register PhiExec = MRI.createVirtualRegister(BoolRC);
3609  Register NewExec = MRI.createVirtualRegister(BoolRC);
3610  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3611  Register CondReg = MRI.createVirtualRegister(BoolRC);
3612 
3613  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
3614  .addReg(InitReg)
3615  .addMBB(&OrigBB)
3616  .addReg(ResultReg)
3617  .addMBB(&LoopBB);
3618 
3619  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
3620  .addReg(InitSaveExecReg)
3621  .addMBB(&OrigBB)
3622  .addReg(NewExec)
3623  .addMBB(&LoopBB);
3624 
3625  // Read the next variant <- also loop target.
3626  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
3627  .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
3628 
3629  // Compare the just read M0 value to all possible Idx values.
3630  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
3631  .addReg(CurrentIdxReg)
3632  .addReg(Idx.getReg(), 0, Idx.getSubReg());
3633 
3634  // Update EXEC, save the original EXEC value to VCC.
3635  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
3636  : AMDGPU::S_AND_SAVEEXEC_B64),
3637  NewExec)
3638  .addReg(CondReg, RegState::Kill);
3639 
3640  MRI.setSimpleHint(NewExec, CondReg);
3641 
3642  if (UseGPRIdxMode) {
3643  if (Offset == 0) {
3644  SGPRIdxReg = CurrentIdxReg;
3645  } else {
3646  SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3647  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
3648  .addReg(CurrentIdxReg, RegState::Kill)
3649  .addImm(Offset);
3650  }
3651  } else {
3652  // Move index from VCC into M0
3653  if (Offset == 0) {
3654  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3655  .addReg(CurrentIdxReg, RegState::Kill);
3656  } else {
3657  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3658  .addReg(CurrentIdxReg, RegState::Kill)
3659  .addImm(Offset);
3660  }
3661  }
3662 
3663  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3664  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3665  MachineInstr *InsertPt =
3666  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
3667  : AMDGPU::S_XOR_B64_term), Exec)
3668  .addReg(Exec)
3669  .addReg(NewExec);
3670 
3671  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
3672  // s_cbranch_scc0?
3673 
3674  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
3675  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
3676  .addMBB(&LoopBB);
3677 
3678  return InsertPt->getIterator();
3679 }
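// [Editorial sketch, not in the original source] The block above implements
// the usual AMDGPU "waterfall" idiom, roughly:
//
//   loop:
//     cur  = v_readfirstlane_b32(idx)   ; pick one lane's index value
//     mask = v_cmp_eq_u32(cur, idx)     ; all lanes holding that same index
//     exec = s_and_saveexec(mask)       ; run only those lanes this trip
//     m0   = cur (+ offset)             ; or the GPR-index register
//     ...indexed access inserted here by the caller...
//     exec ^= mask                      ; retire the lanes just handled
//     s_cbranch_execnz loop
//
// so each distinct index value present in the wave costs one iteration.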
3680 
3681 // This has slightly sub-optimal regalloc when the source vector is killed by
3682 // the read. The register allocator does not understand that the kill is
3683 // per-workitem, so the vector is kept alive for the whole loop and we end up
3684 // not re-using a subregister from it, using one more VGPR than necessary.
3685 // That extra VGPR was saved when this was expanded after register allocation.
3686 static MachineBasicBlock::iterator
3687 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
3688  unsigned InitResultReg, unsigned PhiReg, int Offset,
3689  bool UseGPRIdxMode, Register &SGPRIdxReg) {
3690  MachineFunction *MF = MBB.getParent();
3691  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3692  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3693  MachineRegisterInfo &MRI = MF->getRegInfo();
3694  const DebugLoc &DL = MI.getDebugLoc();
3695  MachineBasicBlock::iterator I(&MI);
3696 
3697  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
3698  Register DstReg = MI.getOperand(0).getReg();
3699  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
3700  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
3701  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3702  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
3703 
3704  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3705 
3706  // Save the EXEC mask
3707  BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
3708  .addReg(Exec);
3709 
3710  MachineBasicBlock *LoopBB;
3711  MachineBasicBlock *RemainderBB;
3712  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
3713 
3714  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3715 
3716  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
3717  InitResultReg, DstReg, PhiReg, TmpExec,
3718  Offset, UseGPRIdxMode, SGPRIdxReg);
3719 
3720  MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
3721  MachineFunction::iterator MBBI(LoopBB);
3722  ++MBBI;
3723  MF->insert(MBBI, LandingPad);
3724  LoopBB->removeSuccessor(RemainderBB);
3725  LandingPad->addSuccessor(RemainderBB);
3726  LoopBB->addSuccessor(LandingPad);
3727  MachineBasicBlock::iterator First = LandingPad->begin();
3728  BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
3729  .addReg(SaveExec);
3730 
3731  return InsPt;
3732 }
3733 
3734 // Returns subreg index, offset
3735 static std::pair<unsigned, int>
3736 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
3737  const TargetRegisterClass *SuperRC,
3738  unsigned VecReg,
3739  int Offset) {
3740  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
3741 
3742  // Skip out of bounds offsets, or else we would end up using an undefined
3743  // register.
3744  if (Offset >= NumElts || Offset < 0)
3745  return std::make_pair(AMDGPU::sub0, Offset);
3746 
3747  return std::make_pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
3748 }
3749 
3750 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
3751  MachineRegisterInfo &MRI, MachineInstr &MI,
3752  int Offset) {
3753  MachineBasicBlock *MBB = MI.getParent();
3754  const DebugLoc &DL = MI.getDebugLoc();
3755  MachineBasicBlock::iterator I(&MI);
3756 
3757  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3758 
3759  assert(Idx->getReg() != AMDGPU::NoRegister);
3760 
3761  if (Offset == 0) {
3762  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
3763  } else {
3764  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3765  .add(*Idx)
3766  .addImm(Offset);
3767  }
3768 }
3769 
3770 static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
3771  MachineRegisterInfo &MRI, MachineInstr &MI,
3772  int Offset) {
3773  MachineBasicBlock *MBB = MI.getParent();
3774  const DebugLoc &DL = MI.getDebugLoc();
3775  MachineBasicBlock::iterator I(&MI);
3776 
3777  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3778 
3779  if (Offset == 0)
3780  return Idx->getReg();
3781 
3782  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3783  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
3784  .add(*Idx)
3785  .addImm(Offset);
3786  return Tmp;
3787 }
3788 
3789 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
3790  MachineBasicBlock &MBB,
3791  const GCNSubtarget &ST) {
3792  const SIInstrInfo *TII = ST.getInstrInfo();
3793  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3794  MachineFunction *MF = MBB.getParent();
3795  MachineRegisterInfo &MRI = MF->getRegInfo();
3796 
3797  Register Dst = MI.getOperand(0).getReg();
3798  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3799  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3800  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3801 
3802  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3803  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3804 
3805  unsigned SubReg;
3806  std::tie(SubReg, Offset)
3807  = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3808 
3809  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3810 
3811  // Check for an SGPR index.
3812  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3813  MachineBasicBlock::iterator I(&MI);
3814  const DebugLoc &DL = MI.getDebugLoc();
3815 
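  // A uniform (SGPR) index needs no loop: either hand the index to the
  // GPR-index pseudo directly, or place it in M0 and read the element with
  // V_MOVRELS_B32, which indexes VGPRs relative to SrcReg.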
3816  if (UseGPRIdxMode) {
3817  // TODO: Look at the uses to avoid the copy. This may require rescheduling
3818  // to avoid interfering with other uses, so probably requires a new
3819  // optimization pass.
3820  Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
3821 
3822  const MCInstrDesc &GPRIDXDesc =
3823  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3824  BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3825  .addReg(SrcReg)
3826  .addReg(Idx)
3827  .addImm(SubReg);
3828  } else {
3829  setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
3830 
3831  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3832  .addReg(SrcReg, 0, SubReg)
3833  .addReg(SrcReg, RegState::Implicit);
3834  }
3835 
3836  MI.eraseFromParent();
3837 
3838  return &MBB;
3839  }
3840 
3841  // Control flow needs to be inserted if indexing with a VGPR.
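  // loadM0FromVGPR serializes execution over the distinct index values; the
  // indexed read below is emitted inside the resulting loop block.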
3842  const DebugLoc &DL = MI.getDebugLoc();
3843  MachineBasicBlock::iterator I(&MI);
3844 
3845  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3846  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3847 
3848  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3849 
3850  Register SGPRIdxReg;
3851  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
3852  UseGPRIdxMode, SGPRIdxReg);
3853 
3854  MachineBasicBlock *LoopBB = InsPt->getParent();
3855 
3856  if (UseGPRIdxMode) {
3857  const MCInstrDesc &GPRIDXDesc =
3858  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3859 
3860  BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3861  .addReg(SrcReg)
3862  .addReg(SGPRIdxReg)
3863  .addImm(SubReg);
3864  } else {
3865  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3866  .addReg(SrcReg, 0, SubReg)
3867  .addReg(SrcReg, RegState::Implicit);
3868  }
3869 
3870  MI.eraseFromParent();
3871 
3872  return LoopBB;
3873 }
3874 
3875 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3876  MachineBasicBlock &MBB,
3877  const GCNSubtarget &ST) {
3878  const SIInstrInfo *TII = ST.getInstrInfo();
3879  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3880  MachineFunction *MF = MBB.getParent();
3881  MachineRegisterInfo &MRI = MF->getRegInfo();
3882 
3883  Register Dst = MI.getOperand(0).getReg();
3884  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3885  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3886  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3887  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3888  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3889  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3890 
3891  // This can be an immediate, but will be folded later.
3892  assert(Val->getReg());
3893 
3894  unsigned SubReg;
3895  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3896  SrcVec->getReg(),
3897  Offset);
3898  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3899 
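  // No index register at all: the element position is a compile-time constant
  // (already folded into SubReg above), so a plain INSERT_SUBREG suffices.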
3900  if (Idx->getReg() == AMDGPU::NoRegister) {
3901  MachineBasicBlock::iterator I(&MI);
3902  const DebugLoc &DL = MI.getDebugLoc();
3903 
3904  assert(Offset == 0);
3905 
3906  BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3907  .add(*SrcVec)
3908  .add(*Val)
3909  .addImm(SubReg);
3910 
3911  MI.eraseFromParent();
3912  return &MBB;
3913  }
3914 
3915  // Check for an SGPR index.
3916  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3917  MachineBasicBlock::iterator I(&MI);
3918  const DebugLoc &DL = MI.getDebugLoc();
3919 
3920  if (UseGPRIdxMode) {
3921  Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
3922 
3923  const MCInstrDesc &GPRIDXDesc =
3924  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3925  BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3926  .addReg(SrcVec->getReg())
3927  .add(*Val)
3928  .addReg(Idx)
3929  .addImm(SubReg);
3930  } else {
3931  setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
3932 
3933  const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3934  TRI.getRegSizeInBits(*VecRC), 32, false);
3935  BuildMI(MBB, I, DL, MovRelDesc, Dst)
3936  .addReg(SrcVec->getReg())
3937  .add(*Val)
3938  .addImm(SubReg);
3939  }
3940  MI.eraseFromParent();
3941  return &MBB;
3942  }
3943 
3944  // Control flow needs to be inserted if indexing with a VGPR.
3945  if (Val->isReg())
3946  MRI.clearKillFlags(Val->getReg());
3947 
3948  const DebugLoc &DL = MI.getDebugLoc();
3949 
3950  Register PhiReg = MRI.createVirtualRegister(VecRC);
3951 
3952  Register SGPRIdxReg;
3953  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
3954  UseGPRIdxMode, SGPRIdxReg);
3955  MachineBasicBlock *LoopBB = InsPt->getParent();
3956 
3957  if (UseGPRIdxMode) {
3958  const MCInstrDesc &GPRIDXDesc =
3959  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3960 
3961  BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3962  .addReg(PhiReg)
3963  .add(*Val)
3964  .addReg(SGPRIdxReg)
3965  .addImm(AMDGPU::sub0);
3966  } else {
3967  const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3968  TRI.getRegSizeInBits(*VecRC), 32, false);
3969  BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
3970  .addReg(PhiReg)
3971  .add(*Val)
3972  .addImm(AMDGPU::sub0);
3973  }
3974 
3975  MI.eraseFromParent();
3976  return LoopBB;
3977 }
3978 
3979 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3980  MachineInstr &MI, MachineBasicBlock *BB) const {
3981 
3982  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3983  MachineFunction *MF = BB->getParent();
3985 
3986  switch (MI.getOpcode()) {
3987  case AMDGPU::S_UADDO_PSEUDO:
3988  case AMDGPU::S_USUBO_PSEUDO: {
3989  const DebugLoc &DL = MI.getDebugLoc();
3990  MachineOperand &Dest0 = MI.getOperand(0);
3991  MachineOperand &Dest1 = MI.getOperand(1);
3992  MachineOperand &Src0 = MI.getOperand(2);
3993  MachineOperand &Src1 = MI.getOperand(3);
3994 
3995  unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
3996  ? AMDGPU::S_ADD_I32
3997  : AMDGPU::S_SUB_I32;
3998  BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
3999 
4000  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4001  .addImm(1)
4002  .addImm(0);
4003 
4004  MI.eraseFromParent();
4005  return BB;
4006  }
4007  case AMDGPU::S_ADD_U64_PSEUDO:
4008  case AMDGPU::S_SUB_U64_PSEUDO: {
4009  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4010  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4011  const SIRegisterInfo *TRI = ST.getRegisterInfo();
4012  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4013  const DebugLoc &DL = MI.getDebugLoc();
4014 
4015  MachineOperand &Dest = MI.getOperand(0);
4016  MachineOperand &Src0 = MI.getOperand(1);
4017  MachineOperand &Src1 = MI.getOperand(2);
4018 
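  // Expand into 32-bit halves: S_ADD_U32 / S_SUB_U32 combine the low halves and
  // set SCC to the carry/borrow, S_ADDC_U32 / S_SUBB_U32 consume it for the high
  // halves, and REG_SEQUENCE reassembles the 64-bit result.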
4019  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4020  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4021 
4022  MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
4023  MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4024  MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
4025  MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4026 
4027  MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
4028  MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4029  MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
4030  MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4031 
4032  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4033 
4034  unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4035  unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4036  BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
4037  BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
4038  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4039  .addReg(DestSub0)
4040  .addImm(AMDGPU::sub0)
4041  .addReg(DestSub1)
4042  .addImm(AMDGPU::sub1);
4043  MI.eraseFromParent();
4044  return BB;
4045  }
4046  case AMDGPU::V_ADD_U64_PSEUDO:
4047  case AMDGPU::V_SUB_U64_PSEUDO: {
4048  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4049  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4050  const SIRegisterInfo *TRI = ST.getRegisterInfo();
4051  const DebugLoc &DL = MI.getDebugLoc();
4052 
4053  bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
4054 
4055  MachineOperand &Dest = MI.getOperand(0);
4056  MachineOperand &Src0 = MI.getOperand(1);
4057  MachineOperand &Src1 = MI.getOperand(2);
4058 
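  // On subtargets with V_LSHL_ADD_U64, a 64-bit VALU add can be emitted directly
  // as (Src0 << 0) + Src1 in a single instruction.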
4059  if (IsAdd && ST.hasLshlAddB64()) {
4060  auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
4061  Dest.getReg())
4062  .add(Src0)
4063  .addImm(0)
4064  .add(Src1);
4065  TII->legalizeOperands(*Add);
4066  MI.eraseFromParent();
4067  return BB;
4068  }
4069 
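  // Otherwise expand into 32-bit halves: V_ADD_CO_U32 / V_SUB_CO_U32 produce the
  // low half and write the per-lane carry into a lane-mask register, which
  // V_ADDC_U32 / V_SUBB_U32 then consume when producing the high half.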
4070  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4071 
4072  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4073  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4074 
4075  Register CarryReg = MRI.createVirtualRegister(CarryRC);
4076  Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
4077 
4078  const TargetRegisterClass *Src0RC = Src0.isReg()
4079  ? MRI.getRegClass(Src0.getReg())
4080  : &AMDGPU::VReg_64RegClass;
4081  const TargetRegisterClass *Src1RC = Src1.isReg()
4082  ? MRI.getRegClass(Src1.getReg())
4083  : &AMDGPU::VReg_64RegClass;
4084 
4085  const TargetRegisterClass *Src0SubRC =
4086  TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
4087  const TargetRegisterClass *Src1SubRC =
4088  TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
4089 
4090  MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
4091  MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
4092  MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
4093  MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
4094 
4095  MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
4096  MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
4097  MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
4098  MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
4099 
4100  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
4101  MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4102  .addReg(CarryReg, RegState::Define)
4103  .add(SrcReg0Sub0)
4104  .add(SrcReg1Sub0)
4105  .addImm(0); // clamp bit
4106 
4107  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4108  MachineInstr *HiHalf =
4109  BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4110  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4111  .add(SrcReg0Sub1)
4112  .add(SrcReg1Sub1)
4113  .addReg(CarryReg, RegState::Kill)
4114  .addImm(0); // clamp bit
4115 
4116  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4117  .addReg(DestSub0)
4118  .addImm(AMDGPU::sub0)
4119  .addReg(DestSub1)
4120  .addImm(AMDGPU::sub1);
4121  TII->legalizeOperands(*LoHalf);
4122  TII->legalizeOperands(*HiHalf);
4123  MI.eraseFromParent();
4124  return BB;
4125  }
4126  case AMDGPU::S_ADD_CO_PSEUDO:
4127  case AMDGPU::S_SUB_CO_PSEUDO: {
4128  // This pseudo can only be selected from a uniform add/subcarry node,
4129  // so all of its VGPR operands are assumed to be splat (i.e. uniform)
4130  // values.
4131  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4132  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4133  const SIRegisterInfo *TRI = ST.getRegisterInfo();
4134  MachineBasicBlock::iterator MII = MI;
4135  const DebugLoc &DL = MI.getDebugLoc();
4136  MachineOperand &Dest = MI.getOperand(0);
4137  MachineOperand &CarryDest = MI.getOperand(1);
4138  MachineOperand &Src0 = MI.getOperand(2);
4139  MachineOperand &Src1 = MI.getOperand(3);
4140  MachineOperand &Src2 = MI.getOperand(4);
4141  unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
4142  ? AMDGPU::S_ADDC_U32
4143  : AMDGPU::S_SUBB_U32;
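  // Any operands still in VGPRs are uniform here (see the comment above), so
  // read lane 0 back into an SGPR with V_READFIRSTLANE_B32.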
4144  if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
4145  Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4146  BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
4147  .addReg(Src0.getReg());
4148  Src0.setReg(RegOp0);
4149  }
4150  if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
4151  Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4152  BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
4153  .addReg(Src1.getReg());
4154  Src1.setReg(RegOp1);
4155  }
4156  Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4157  if (TRI->isVectorRegister(MRI, Src2.getReg())) {
4158  BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
4159  .addReg(Src2.getReg());
4160  Src2.setReg(RegOp2);
4161  }
4162 
4163  const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
4164  unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
4165  assert(WaveSize == 64 || WaveSize == 32);
4166 
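  // The incoming carry is a lane mask: 64 bits wide in wave64 and 32 bits in
  // wave32. Compare it against zero so SCC holds the carry/borrow consumed by
  // S_ADDC_U32 / S_SUBB_U32; without 64-bit scalar compares, OR the two halves
  // together and compare the 32-bit result instead.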
4167  if (WaveSize == 64) {
4168  if (ST.hasScalarCompareEq64()) {
4169  BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
4170  .addReg(Src2.getReg())
4171  .addImm(0);
4172  } else {
4173  const TargetRegisterClass *SubRC =
4174  TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
4175  MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
4176  MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
4177  MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
4178  MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
4179  Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4180 
4181  BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
4182  .add(Src2Sub0)
4183  .add(Src2Sub1);
4184 
4185  BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4186  .addReg(Src2_32, RegState::Kill)
4187  .addImm(0);
4188  }
4189  } else {
4190  BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMPK_LG_U32))
4191  .addReg(Src2.getReg())
4192  .addImm(0);
4193  }
4194 
4195  BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
4196 
4197