1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Custom DAG lowering for SI
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIISelLowering.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
21 #include "llvm/ADT/Statistic.h"
24 #include "llvm/BinaryFormat/ELF.h"
25 #include "llvm/CodeGen/Analysis.h"
32 #include "llvm/IR/DiagnosticInfo.h"
33 #include "llvm/IR/IntrinsicInst.h"
34 #include "llvm/IR/IntrinsicsAMDGPU.h"
35 #include "llvm/IR/IntrinsicsR600.h"
37 #include "llvm/Support/KnownBits.h"
38 
39 using namespace llvm;
40 
41 #define DEBUG_TYPE "si-lower"
42 
43 STATISTIC(NumTailCalls, "Number of tail calls");
44 
46  "amdgpu-disable-loop-alignment",
47  cl::desc("Do not align and prefetch loops"),
48  cl::init(false));
49 
51  "amdgpu-use-divergent-register-indexing",
52  cl::Hidden,
53  cl::desc("Use indirect register addressing for divergent indexes"),
54  cl::init(false));
55 
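// The two helpers below read the denormal mode that SIMachineFunctionInfo
// records for the current function; isFPExtFoldable() later uses the FP32
// variant to decide whether an f16 -> f32 extension may be folded into a
// mad-mix / fma-mix instruction.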
56 static bool hasFP32Denormals(const MachineFunction &MF) {
57  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
58  return Info->getMode().allFP32Denormals();
59 }
60 
61 static bool hasFP64FP16Denormals(const MachineFunction &MF) {
62  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
63  return Info->getMode().allFP64FP16Denormals();
64 }
65 
66 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
67  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
68  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
69  if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
70  return AMDGPU::SGPR0 + Reg;
71  }
72  }
73  llvm_unreachable("Cannot allocate sgpr");
74 }
75 
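// The constructor below registers the register class used for each legal MVT
// (SGPR classes for integer types, VGPR classes for floating-point types) and
// then configures per-operation legality and custom lowering for SI.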
76 SITargetLowering::SITargetLowering(const TargetMachine &TM,
77  const GCNSubtarget &STI)
78  : AMDGPUTargetLowering(TM, STI),
79  Subtarget(&STI) {
80  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
81  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
82 
83  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
84  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
85 
86  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
87 
88  const SIRegisterInfo *TRI = STI.getRegisterInfo();
89  const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
90 
91  addRegisterClass(MVT::f64, V64RegClass);
92  addRegisterClass(MVT::v2f32, V64RegClass);
93 
94  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
95  addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
96 
97  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
98  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
99 
100  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
101  addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
102 
103  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
104  addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
105 
106  addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
107  addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
108 
109  addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
110  addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
111 
112  addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
113  addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
114 
115  addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
116  addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
117 
118  addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
119  addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
120 
121  addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
122  addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
123 
124  addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
125  addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
126 
127  addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
128  addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
129 
130  if (Subtarget->has16BitInsts()) {
131  addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
132  addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
133 
134  // Unless there are also VOP3P operations, no operations are really legal.
135  addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
136  addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
137  addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
138  addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
139  addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
140  addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
141  }
142 
143  addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
144  addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
145 
147 
148  // The boolean content concept here is too inflexible. Compares only ever
149  // really produce a 1-bit result. Any copy/extend from these will turn into a
150  // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
151  // it's what most targets use.
154 
155  // We need to custom lower vector stores from local memory
159  MVT::v32i32},
160  Custom);
161 
165  MVT::v32i32},
166  Custom);
167 
184 
192 
194 
199 
202 
206 
210  Expand);
214  Expand);
215 
219  Custom);
220 
224 
226 
228 
230  Expand);
231 
232 #if 0
234 #endif
235 
236  // We only support LOAD/STORE and vector manipulation ops for vectors
237  // with > 4 elements.
244  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
245  switch (Op) {
246  case ISD::LOAD:
247  case ISD::STORE:
248  case ISD::BUILD_VECTOR:
249  case ISD::BITCAST:
254  break;
256  case ISD::CONCAT_VECTORS:
258  break;
259  default:
261  break;
262  }
263  }
264  }
265 
267 
268  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
269  // is expanded to avoid having two separate loops in case the index is a VGPR.
270 
271  // Most operations are naturally 32-bit vector operations. We only support
272  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
273  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
276 
279 
282 
285  }
286 
287  for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
290 
293 
296 
299  }
300 
301  for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
304 
307 
310 
313  }
314 
315  for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
318 
321 
324 
327  }
328 
329  for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
332 
335 
338 
341  }
342 
345  Expand);
346 
348 
349  // Avoid stack access for these.
350  // TODO: Generalize to more vector types.
354  Custom);
355 
356  // Deal with vec3 vector operations when widened to vec4.
359 
360  // Deal with vec5/6/7 vector operations when widened to vec8.
364  Custom);
365 
366  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
367  // and output demarshalling
369 
370  // We can't return success/failure, only the old value,
371  // let LLVM add the comparison
373  Expand);
374 
375  if (Subtarget->hasFlatAddressSpace())
377 
379 
380  // FIXME: This should be narrowed to i32, but that only happens if i64 is
381  // illegal.
382  // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
384 
385  // This is s_memtime on SI and s_memrealtime on VI.
388 
389  if (Subtarget->has16BitInsts()) {
392  }
393 
394  if (Subtarget->hasMadMacF32Insts())
396 
397  if (!Subtarget->hasBFI())
398  // fcopysign can be done in a single instruction with BFI.
400 
401  if (!Subtarget->hasBCNT(32))
403 
404  if (!Subtarget->hasBCNT(64))
406 
407  if (Subtarget->hasFFBH())
409 
410  if (Subtarget->hasFFBL())
412 
413  // We only really have 32-bit BFE instructions (and 16-bit on VI).
414  //
415  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
416  // effort to match them now. We want this to be false for i64 cases when the
417  // extraction isn't restricted to the upper or lower half. Ideally we would
418  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
419  // span the midpoint are probably relatively rare, so don't worry about them
420  // for now.
421  if (Subtarget->hasBFE())
422  setHasExtractBitsInsn(true);
423 
424  // Clamp modifier on add/sub
425  if (Subtarget->hasIntClamp())
427 
428  if (Subtarget->hasAddNoCarry())
430  Legal);
431 
433  Custom);
434 
435  // These are really only legal for ieee_mode functions. We should be avoiding
436  // them for functions that don't have ieee_mode enabled, so just say they are
437  // legal.
439  {MVT::f32, MVT::f64}, Legal);
440 
441  if (Subtarget->haveRoundOpsF64())
443  else
445  MVT::f64, Custom);
446 
448 
451 
452  if (Subtarget->has16BitInsts()) {
455  MVT::i16, Legal);
456 
458 
460  MVT::i16, Expand);
461 
465  ISD::CTPOP},
466  MVT::i16, Promote);
467 
469 
471 
476 
478 
479  // F16 - Constant Actions.
481 
482  // F16 - Load/Store Actions.
487 
488  // F16 - VOP1 Actions.
491  MVT::f16, Custom);
492 
494 
497  MVT::f16, Promote);
498 
499  // F16 - VOP2 Actions.
501 
503 
504  // F16 - VOP3 Actions.
506  if (STI.hasMadF16())
508 
510  MVT::v8f16}) {
511  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
512  switch (Op) {
513  case ISD::LOAD:
514  case ISD::STORE:
515  case ISD::BUILD_VECTOR:
516  case ISD::BITCAST:
522  break;
523  case ISD::CONCAT_VECTORS:
525  break;
526  default:
528  break;
529  }
530  }
531  }
532 
533  // v_perm_b32 can handle either of these.
536 
537  // XXX - Do these do anything? Vector constants turn into build_vector.
539 
541 
546 
551 
558 
563 
568 
573 
578 
583 
585  MVT::v2i32, Expand);
587 
589  MVT::v4i32, Expand);
590 
592  MVT::v8i32, Expand);
593 
594  if (!Subtarget->hasVOP3PInsts())
596 
598  // This isn't really legal, but this avoids the legalizer unrolling it (and
599  // allows matching fneg (fabs x) patterns)
601 
604 
607 
609  Expand);
610 
611  for (MVT Vec16 : { MVT::v8i16, MVT::v8f16 }) {
614  Vec16, Custom);
616  }
617  }
618 
619  if (Subtarget->hasVOP3PInsts()) {
623  MVT::v2i16, Legal);
624 
627  MVT::v2f16, Legal);
628 
630  Custom);
631 
634  Custom);
635 
636  for (MVT VT : {MVT::v4i16, MVT::v8i16})
637  // Split vector operations.
641  ISD::SSUBSAT},
642  VT, Custom);
643 
644  for (MVT VT : {MVT::v4f16, MVT::v8f16})
645  // Split vector operations.
647  VT, Custom);
648 
650  Custom);
651 
654 
655  if (Subtarget->hasPackedFP32Ops()) {
657  MVT::v2f32, Legal);
660  Custom);
661  }
662  }
663 
665 
666  if (Subtarget->has16BitInsts()) {
671  } else {
672  // Legalization hack.
674 
676  }
677 
681  Custom);
682 
684 
685  if (Subtarget->hasMad64_32())
687 
691  Custom);
692 
696  MVT::i16, MVT::i8},
697  Custom);
698 
702  MVT::i8},
703  Custom);
704 
707  ISD::SUB,
709  ISD::FADD,
710  ISD::FSUB,
711  ISD::FMINNUM,
712  ISD::FMAXNUM,
715  ISD::FMA,
716  ISD::SMIN,
717  ISD::SMAX,
718  ISD::UMIN,
719  ISD::UMAX,
720  ISD::SETCC,
721  ISD::AND,
722  ISD::OR,
723  ISD::XOR,
732 
733  // All memory operations. Some folding on the pointer operand is done to help
734  // matching the constant offsets in the addressing modes.
736  ISD::STORE,
755 
756  // FIXME: In other contexts we pretend this is a per-function property.
757  setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
758 
760 }
761 
763  return Subtarget;
764 }
765 
766 //===----------------------------------------------------------------------===//
767 // TargetLowering queries
768 //===----------------------------------------------------------------------===//
769 
770 // v_mad_mix* support a conversion from f16 to f32.
771 //
772 // There is only one special case where this is OK to use when denormals are
773 // enabled, and we don't currently handle it.
774 bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
775  EVT DestVT, EVT SrcVT) const {
776  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
777  (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
778  DestVT.getScalarType() == MVT::f32 &&
779  SrcVT.getScalarType() == MVT::f16 &&
780  // TODO: This probably only requires no input flushing?
782 }
783 
784 bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
785  LLT DestTy, LLT SrcTy) const {
786  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
787  (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
788  DestTy.getScalarSizeInBits() == 32 &&
789  SrcTy.getScalarSizeInBits() == 16 &&
790  // TODO: This probably only requires no input flushing?
791  !hasFP32Denormals(*MI.getMF());
792 }
793 
795  // SI has some legal vector types, but no legal vector operations. Say no
796  // shuffles are legal in order to prefer scalarizing some vector operations.
797  return false;
798 }
799 
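// For non-kernel calling conventions, 16-bit vector elements are packed two
// per 32-bit register when the subtarget has 16-bit instructions, e.g. a v4f16
// argument is passed in two v2f16 registers; without 16-bit instructions each
// element is widened to a 32-bit register instead.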
801  CallingConv::ID CC,
802  EVT VT) const {
803  if (CC == CallingConv::AMDGPU_KERNEL)
805 
806  if (VT.isVector()) {
807  EVT ScalarVT = VT.getScalarType();
808  unsigned Size = ScalarVT.getSizeInBits();
809  if (Size == 16) {
810  if (Subtarget->has16BitInsts())
811  return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
812  return VT.isInteger() ? MVT::i32 : MVT::f32;
813  }
814 
815  if (Size < 16)
816  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
817  return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
818  }
819 
820  if (VT.getSizeInBits() > 32)
821  return MVT::i32;
822 
824 }
825 
827  CallingConv::ID CC,
828  EVT VT) const {
829  if (CC == CallingConv::AMDGPU_KERNEL)
831 
832  if (VT.isVector()) {
833  unsigned NumElts = VT.getVectorNumElements();
834  EVT ScalarVT = VT.getScalarType();
835  unsigned Size = ScalarVT.getSizeInBits();
836 
837  // FIXME: Should probably promote 8-bit vectors to i16.
838  if (Size == 16 && Subtarget->has16BitInsts())
839  return (NumElts + 1) / 2;
840 
841  if (Size <= 32)
842  return NumElts;
843 
844  if (Size > 32)
845  return NumElts * ((Size + 31) / 32);
846  } else if (VT.getSizeInBits() > 32)
847  return (VT.getSizeInBits() + 31) / 32;
848 
850 }
851 
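// Example of the breakdown computed below: a v3f32 argument becomes three f32
// intermediates, while a v5i16 argument on a subtarget with 16-bit
// instructions becomes (5 + 1) / 2 = 3 v2i16 intermediates.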
854  EVT VT, EVT &IntermediateVT,
855  unsigned &NumIntermediates, MVT &RegisterVT) const {
856  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
857  unsigned NumElts = VT.getVectorNumElements();
858  EVT ScalarVT = VT.getScalarType();
859  unsigned Size = ScalarVT.getSizeInBits();
860  // FIXME: We should fix the ABI to be the same on targets without 16-bit
861  // support, but unless we can properly handle 3-vectors, it will still be
862  // inconsistent.
863  if (Size == 16 && Subtarget->has16BitInsts()) {
864  RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
865  IntermediateVT = RegisterVT;
866  NumIntermediates = (NumElts + 1) / 2;
867  return NumIntermediates;
868  }
869 
870  if (Size == 32) {
871  RegisterVT = ScalarVT.getSimpleVT();
872  IntermediateVT = RegisterVT;
873  NumIntermediates = NumElts;
874  return NumIntermediates;
875  }
876 
877  if (Size < 16 && Subtarget->has16BitInsts()) {
878  // FIXME: Should probably form v2i16 pieces
879  RegisterVT = MVT::i16;
880  IntermediateVT = ScalarVT;
881  NumIntermediates = NumElts;
882  return NumIntermediates;
883  }
884 
885 
886  if (Size != 16 && Size <= 32) {
887  RegisterVT = MVT::i32;
888  IntermediateVT = ScalarVT;
889  NumIntermediates = NumElts;
890  return NumIntermediates;
891  }
892 
893  if (Size > 32) {
894  RegisterVT = MVT::i32;
895  IntermediateVT = RegisterVT;
896  NumIntermediates = NumElts * ((Size + 31) / 32);
897  return NumIntermediates;
898  }
899  }
900 
902  Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
903 }
904 
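// The image dmask operand selects which channels are actually read or written,
// so the in-memory type can be narrower than the IR type. Schematically:
//   %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 7, ...)
// has dmask = 7 (0b0111), so only three dwords are loaded and the memVT
// computed below is v3f32 rather than v4f32.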
905 static EVT memVTFromImageData(Type *Ty, unsigned DMaskLanes) {
906  assert(DMaskLanes != 0);
907 
908  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
909  unsigned NumElts = std::min(DMaskLanes, VT->getNumElements());
910  return EVT::getVectorVT(Ty->getContext(),
911  EVT::getEVT(VT->getElementType()),
912  NumElts);
913  }
914 
915  return EVT::getEVT(Ty);
916 }
917 
918 // Peek through TFE struct returns to only use the data size.
919 static EVT memVTFromImageReturn(Type *Ty, unsigned DMaskLanes) {
920  auto *ST = dyn_cast<StructType>(Ty);
921  if (!ST)
922  return memVTFromImageData(Ty, DMaskLanes);
923 
924  // Some intrinsics return an aggregate type - special case to work out the
925  // correct memVT.
926  //
927  // Only limited forms of aggregate type currently expected.
928  if (ST->getNumContainedTypes() != 2 ||
929  !ST->getContainedType(1)->isIntegerTy(32))
930  return EVT();
931  return memVTFromImageData(ST->getContainedType(0), DMaskLanes);
932 }
933 
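// getTgtMemIntrinsic() describes the memory behavior of target intrinsics so
// that selection can attach an accurate MachineMemOperand: buffer and image
// intrinsics use pseudo source values from SIMachineFunctionInfo, while DS and
// global/flat atomic intrinsics point directly at their IR pointer operand.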
935  const CallInst &CI,
936  MachineFunction &MF,
937  unsigned IntrID) const {
939  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
941 
942  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
943  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
945  (Intrinsic::ID)IntrID);
946  if (Attr.hasFnAttr(Attribute::ReadNone))
947  return false;
948 
950 
951  const GCNTargetMachine &TM =
952  static_cast<const GCNTargetMachine &>(getTargetMachine());
953 
954  if (RsrcIntr->IsImage) {
955  Info.ptrVal = MFI->getImagePSV(TM);
956  Info.align.reset();
957  } else {
958  Info.ptrVal = MFI->getBufferPSV(TM);
959  }
960 
962  if (Attr.hasFnAttr(Attribute::ReadOnly)) {
963  unsigned DMaskLanes = 4;
964 
965  if (RsrcIntr->IsImage) {
968  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
969  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
970 
971  if (!BaseOpcode->Gather4) {
972  // If this isn't a gather, we may have excess loaded elements in the
973  // IR type. Check the dmask for the real number of elements loaded.
974  unsigned DMask
975  = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
976  DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
977  }
978 
979  Info.memVT = memVTFromImageReturn(CI.getType(), DMaskLanes);
980  } else
981  Info.memVT = EVT::getEVT(CI.getType());
982 
983  // FIXME: What does alignment mean for an image?
986  } else if (Attr.hasFnAttr(Attribute::WriteOnly)) {
988 
989  Type *DataTy = CI.getArgOperand(0)->getType();
990  if (RsrcIntr->IsImage) {
991  unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
992  unsigned DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
993  Info.memVT = memVTFromImageData(DataTy, DMaskLanes);
994  } else
995  Info.memVT = EVT::getEVT(DataTy);
996 
998  } else {
999  // Atomic
1000  Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1002  Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1003  Info.flags |= MachineMemOperand::MOLoad |
1006 
1007  // XXX - Should this be volatile without known ordering?
1009 
1010  switch (IntrID) {
1011  default:
1012  break;
1013  case Intrinsic::amdgcn_raw_buffer_load_lds:
1014  case Intrinsic::amdgcn_struct_buffer_load_lds: {
1015  unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1016  Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1017  return true;
1018  }
1019  }
1020  }
1021  return true;
1022  }
1023 
1024  switch (IntrID) {
1025  case Intrinsic::amdgcn_atomic_inc:
1026  case Intrinsic::amdgcn_atomic_dec:
1027  case Intrinsic::amdgcn_ds_ordered_add:
1028  case Intrinsic::amdgcn_ds_ordered_swap:
1029  case Intrinsic::amdgcn_ds_fadd:
1030  case Intrinsic::amdgcn_ds_fmin:
1031  case Intrinsic::amdgcn_ds_fmax: {
1033  Info.memVT = MVT::getVT(CI.getType());
1034  Info.ptrVal = CI.getOperand(0);
1035  Info.align.reset();
1037 
1038  const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1039  if (!Vol->isZero())
1041 
1042  return true;
1043  }
1044  case Intrinsic::amdgcn_buffer_atomic_fadd: {
1046 
1047  const GCNTargetMachine &TM =
1048  static_cast<const GCNTargetMachine &>(getTargetMachine());
1049 
1051  Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1052  Info.ptrVal = MFI->getBufferPSV(TM);
1053  Info.align.reset();
1055 
1056  const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1057  if (!Vol || !Vol->isZero())
1059 
1060  return true;
1061  }
1062  case Intrinsic::amdgcn_ds_append:
1063  case Intrinsic::amdgcn_ds_consume: {
1065  Info.memVT = MVT::getVT(CI.getType());
1066  Info.ptrVal = CI.getOperand(0);
1067  Info.align.reset();
1069 
1070  const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1071  if (!Vol->isZero())
1073 
1074  return true;
1075  }
1076  case Intrinsic::amdgcn_global_atomic_csub: {
1078  Info.memVT = MVT::getVT(CI.getType());
1079  Info.ptrVal = CI.getOperand(0);
1080  Info.align.reset();
1081  Info.flags |= MachineMemOperand::MOLoad |
1084  return true;
1085  }
1086  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1089  Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1090 
1091  const GCNTargetMachine &TM =
1092  static_cast<const GCNTargetMachine &>(getTargetMachine());
1093 
1094  Info.ptrVal = MFI->getImagePSV(TM);
1095  Info.align.reset();
1096  Info.flags |= MachineMemOperand::MOLoad |
1098  return true;
1099  }
1100  case Intrinsic::amdgcn_global_atomic_fadd:
1101  case Intrinsic::amdgcn_global_atomic_fmin:
1102  case Intrinsic::amdgcn_global_atomic_fmax:
1103  case Intrinsic::amdgcn_flat_atomic_fadd:
1104  case Intrinsic::amdgcn_flat_atomic_fmin:
1105  case Intrinsic::amdgcn_flat_atomic_fmax:
1106  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1107  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1109  Info.memVT = MVT::getVT(CI.getType());
1110  Info.ptrVal = CI.getOperand(0);
1111  Info.align.reset();
1112  Info.flags |= MachineMemOperand::MOLoad |
1116  return true;
1117  }
1118  case Intrinsic::amdgcn_ds_gws_init:
1119  case Intrinsic::amdgcn_ds_gws_barrier:
1120  case Intrinsic::amdgcn_ds_gws_sema_v:
1121  case Intrinsic::amdgcn_ds_gws_sema_br:
1122  case Intrinsic::amdgcn_ds_gws_sema_p:
1123  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1124  Info.opc = ISD::INTRINSIC_VOID;
1125 
1126  const GCNTargetMachine &TM =
1127  static_cast<const GCNTargetMachine &>(getTargetMachine());
1128 
1130  Info.ptrVal = MFI->getGWSPSV(TM);
1131 
1132  // This is an abstract access, but we need to specify a type and size.
1133  Info.memVT = MVT::i32;
1134  Info.size = 4;
1135  Info.align = Align(4);
1136 
1137  if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1139  else
1141  return true;
1142  }
1143  case Intrinsic::amdgcn_global_load_lds: {
1144  Info.opc = ISD::INTRINSIC_VOID;
1145  unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1146  Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1149  return true;
1150  }
1151  default:
1152  return false;
1153  }
1154 }
1155 
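// getAddrModeArguments() reports which operand of these memory intrinsics is
// the pointer being accessed, so that IR-level address-mode optimization
// (e.g. CodeGenPrepare) can sink address computations next to them.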
1158  Type *&AccessTy) const {
1159  switch (II->getIntrinsicID()) {
1160  case Intrinsic::amdgcn_atomic_inc:
1161  case Intrinsic::amdgcn_atomic_dec:
1162  case Intrinsic::amdgcn_ds_ordered_add:
1163  case Intrinsic::amdgcn_ds_ordered_swap:
1164  case Intrinsic::amdgcn_ds_append:
1165  case Intrinsic::amdgcn_ds_consume:
1166  case Intrinsic::amdgcn_ds_fadd:
1167  case Intrinsic::amdgcn_ds_fmin:
1168  case Intrinsic::amdgcn_ds_fmax:
1169  case Intrinsic::amdgcn_global_atomic_fadd:
1170  case Intrinsic::amdgcn_flat_atomic_fadd:
1171  case Intrinsic::amdgcn_flat_atomic_fmin:
1172  case Intrinsic::amdgcn_flat_atomic_fmax:
1173  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1174  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1175  case Intrinsic::amdgcn_global_atomic_csub: {
1176  Value *Ptr = II->getArgOperand(0);
1177  AccessTy = II->getType();
1178  Ops.push_back(Ptr);
1179  return true;
1180  }
1181  default:
1182  return false;
1183  }
1184 }
1185 
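// Addressing-mode legality checks. Flat instructions originally accepted only
// a plain register address; subtargets with flat instruction offsets also
// accept a small immediate, which isLegalFLATOffset() validates against the
// offset range of the relevant address space.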
1186 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
1187  if (!Subtarget->hasFlatInstOffsets()) {
1188  // Flat instructions do not have offsets, and only have the register
1189  // address.
1190  return AM.BaseOffs == 0 && AM.Scale == 0;
1191  }
1192 
1193  return AM.Scale == 0 &&
1194  (AM.BaseOffs == 0 ||
1195  Subtarget->getInstrInfo()->isLegalFLATOffset(
1197 }
1198 
1200  if (Subtarget->hasFlatGlobalInsts())
1201  return AM.Scale == 0 &&
1202  (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1205 
1206  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1207  // Assume we will use FLAT for all global memory accesses
1208  // on VI.
1209  // FIXME: This assumption is currently wrong. On VI we still use
1210  // MUBUF instructions for the r + i addressing mode. As currently
1211  // implemented, the MUBUF instructions only work on buffers < 4GB.
1212  // It may be possible to support > 4GB buffers with MUBUF instructions,
1213  // by setting the stride value in the resource descriptor which would
1214  // increase the size limit to (stride * 4GB). However, this is risky,
1215  // because it has never been validated.
1216  return isLegalFlatAddressingMode(AM);
1217  }
1218 
1219  return isLegalMUBUFAddressingMode(AM);
1220 }
1221 
1222 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1223  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1224  // additionally can do r + r + i with addr64. 32-bit has more addressing
1225  // mode options. Depending on the resource constant, it can also do
1226  // (i64 r0) + (i32 r1) * (i14 i).
1227  //
1228  // Private arrays end up using a scratch buffer most of the time, so also
1229  // assume those use MUBUF instructions. Scratch loads / stores are currently
1230  // implemented as mubuf instructions with offen bit set, so slightly
1231  // different than the normal addr64.
1232  if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs))
1233  return false;
1234 
1235  // FIXME: Since we can split immediate into soffset and immediate offset,
1236  // would it make sense to allow any immediate?
1237 
1238  switch (AM.Scale) {
1239  case 0: // r + i or just i, depending on HasBaseReg.
1240  return true;
1241  case 1:
1242  return true; // We have r + r or r + i.
1243  case 2:
1244  if (AM.HasBaseReg) {
1245  // Reject 2 * r + r.
1246  return false;
1247  }
1248 
1249  // Allow 2 * r as r + r
1250  // Or 2 * r + i is allowed as r + r + i.
1251  return true;
1252  default: // Don't allow n * r
1253  return false;
1254  }
1255 }
1256 
1258  const AddrMode &AM, Type *Ty,
1259  unsigned AS, Instruction *I) const {
1260  // No global is ever allowed as a base.
1261  if (AM.BaseGV)
1262  return false;
1263 
1264  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1265  return isLegalGlobalAddressingMode(AM);
1266 
1267  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1270  // If the offset isn't a multiple of 4, it probably isn't going to be
1271  // correctly aligned.
1272  // FIXME: Can we get the real alignment here?
1273  if (AM.BaseOffs % 4 != 0)
1274  return isLegalMUBUFAddressingMode(AM);
1275 
1276  // There are no SMRD extloads, so if we have to do a small type access we
1277  // will use a MUBUF load.
1278  // FIXME?: We also need to do this if unaligned, but we don't know the
1279  // alignment here.
1280  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1281  return isLegalGlobalAddressingMode(AM);
1282 
1283  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1284  // SMRD instructions have an 8-bit, dword offset on SI.
1285  if (!isUInt<8>(AM.BaseOffs / 4))
1286  return false;
1287  } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1288  // On CI+, this can also be a 32-bit literal constant offset. If it fits
1289  // in 8-bits, it can use a smaller encoding.
1290  if (!isUInt<32>(AM.BaseOffs / 4))
1291  return false;
1292  } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
1293  // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1294  if (!isUInt<20>(AM.BaseOffs))
1295  return false;
1296  } else
1297  llvm_unreachable("unhandled generation");
1298 
1299  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1300  return true;
1301 
1302  if (AM.Scale == 1 && AM.HasBaseReg)
1303  return true;
1304 
1305  return false;
1306 
1307  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1308  return isLegalMUBUFAddressingMode(AM);
1309  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1310  AS == AMDGPUAS::REGION_ADDRESS) {
1311  // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1312  // field.
1313  // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1314  // an 8-bit dword offset but we don't know the alignment here.
1315  if (!isUInt<16>(AM.BaseOffs))
1316  return false;
1317 
1318  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1319  return true;
1320 
1321  if (AM.Scale == 1 && AM.HasBaseReg)
1322  return true;
1323 
1324  return false;
1325  } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1327  // For an unknown address space, this usually means that this is for some
1328  // reason being used for pure arithmetic, and not based on some addressing
1329  // computation. We don't have instructions that compute pointers with any
1330  // addressing modes, so treat them as having no offset like flat
1331  // instructions.
1332  return isLegalFlatAddressingMode(AM);
1333  }
1334 
1335  // Assume a user alias of global for unknown address spaces.
1336  return isLegalGlobalAddressingMode(AM);
1337 }
1338 
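// Store-merging width limits: up to 128 bits for global/flat address spaces,
// up to 64 bits for LDS/GDS, and up to the subtarget's maximum private element
// size (in bits) for scratch.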
1339 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1340  const MachineFunction &MF) const {
1341  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1342  return (MemVT.getSizeInBits() <= 4 * 32);
1343  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1344  unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1345  return (MemVT.getSizeInBits() <= MaxPrivateBits);
1346  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1347  return (MemVT.getSizeInBits() <= 2 * 32);
1348  }
1349  return true;
1350 }
1351 
1353  unsigned Size, unsigned AddrSpace, Align Alignment,
1354  MachineMemOperand::Flags Flags, bool *IsFast) const {
1355  if (IsFast)
1356  *IsFast = false;
1357 
1358  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1359  AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1360  // Check if alignment requirements for ds_read/write instructions are
1361  // disabled.
1362  if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1363  return false;
1364 
1365  Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1366  if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1367  Alignment < RequiredAlignment)
1368  return false;
1369 
1370  // Either the alignment requirements are "enabled", or there is an
1371  // unaligned LDS access related hardware bug even though alignment
1372  // requirements are "disabled". In either case, we need to check for proper
1373  // alignment requirements.
1374  //
1375  switch (Size) {
1376  case 64:
1377  // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1378  // address is negative, then the instruction is incorrectly treated as
1379  // out-of-bounds even if base + offsets is in bounds. Split vectorized
1380  // loads here to avoid emitting ds_read2_b32. We may re-combine the
1381  // load later in the SILoadStoreOptimizer.
1382  if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1383  return false;
1384 
1385  // 8 byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1386  // can do a 4 byte aligned, 8 byte access in a single operation using
1387  // ds_read2/write2_b32 with adjacent offsets.
1388  RequiredAlignment = Align(4);
1389 
1390  if (Subtarget->hasUnalignedDSAccessEnabled()) {
1391  // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1392  // ds_write2_b32 depending on the alignment. In either case with either
1393  // alignment there is no faster way of doing this.
1394  if (IsFast)
1395  *IsFast = true;
1396  return true;
1397  }
1398 
1399  break;
1400  case 96:
1401  if (!Subtarget->hasDS96AndDS128())
1402  return false;
1403 
1404  // 12 byte accesses via ds_read/write_b96 require 16-byte alignment on
1405  // gfx8 and older.
1406 
1407  if (Subtarget->hasUnalignedDSAccessEnabled()) {
1408  // Naturally aligned access is fastest. However, also report it is Fast
1409  // if memory is aligned less than DWORD. A narrow load or store will be
1410  // equally slow as a single ds_read_b96/ds_write_b96, but there will
1411  // be more of them, so overall we will pay less penalty issuing a single
1412  // instruction.
1413  if (IsFast)
1414  *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
1415  return true;
1416  }
1417 
1418  break;
1419  case 128:
1420  if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1421  return false;
1422 
1423  // 16 byte accesses via ds_read/write_b128 require 16-byte alignment on
1424  // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
1425  // single operation using ds_read2/write2_b64.
1426  RequiredAlignment = Align(8);
1427 
1428  if (Subtarget->hasUnalignedDSAccessEnabled()) {
1429  // Naturally aligned access is fastest. However, also report it is Fast
1430  // if memory is aligned less than DWORD. A narrow load or store will be
1431  // equally slow as a single ds_read_b128/ds_write_b128, but there
1432  // will be more of them, so overall we will pay less penalty issuing a
1433  // single instruction.
1434  if (IsFast)
1435  *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
1436  return true;
1437  }
1438 
1439  break;
1440  default:
1441  if (Size > 32)
1442  return false;
1443 
1444  break;
1445  }
1446 
1447  if (IsFast)
1448  *IsFast = Alignment >= RequiredAlignment;
1449 
1450  return Alignment >= RequiredAlignment ||
1451  Subtarget->hasUnalignedDSAccessEnabled();
1452  }
1453 
1454  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1455  bool AlignedBy4 = Alignment >= Align(4);
1456  if (IsFast)
1457  *IsFast = AlignedBy4;
1458 
1459  return AlignedBy4 ||
1460  Subtarget->enableFlatScratch() ||
1461  Subtarget->hasUnalignedScratchAccess();
1462  }
1463 
1464  // FIXME: We have to be conservative here and assume that flat operations
1465  // will access scratch. If we had access to the IR function, then we
1466  // could determine if any private memory was used in the function.
1467  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1468  !Subtarget->hasUnalignedScratchAccess()) {
1469  bool AlignedBy4 = Alignment >= Align(4);
1470  if (IsFast)
1471  *IsFast = AlignedBy4;
1472 
1473  return AlignedBy4;
1474  }
1475 
1476  if (Subtarget->hasUnalignedBufferAccessEnabled()) {
1477  // If we have a uniform constant load, it still requires using a slow
1478  // buffer instruction if unaligned.
1479  if (IsFast) {
1480  // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
1481  // 2-byte alignment is worse than 1 unless doing a 2-byte access.
1482  *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1483  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1484  Alignment >= Align(4) : Alignment != Align(2);
1485  }
1486 
1487  return true;
1488  }
1489 
1491  // Values smaller than a dword must be aligned.
1491  if (Size < 32)
1492  return false;
1493 
1494  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1495  // byte-address are ignored, thus forcing Dword alignment.
1496  // This applies to private, global, and constant memory.
1497  if (IsFast)
1498  *IsFast = true;
1499 
1500  return Size >= 32 && Alignment >= Align(4);
1501 }
1502 
1504  EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1505  bool *IsFast) const {
1507  Alignment, Flags, IsFast);
1508 
1509  if (Allow && IsFast && Subtarget->hasUnalignedDSAccessEnabled() &&
1510  (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1511  AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
1512  // Pretend it is fast if +unaligned-access-mode is passed so that DS accesses
1513  // get vectorized. We could use ds_read2_b*/ds_write2_b* instructions on a
1514  // misaligned data which is faster than a pair of ds_read_b*/ds_write_b*
1515  // which would be equally misaligned.
1516  // This is only used by the common passes, selection always calls the
1517  // allowsMisalignedMemoryAccessesImpl version.
1518  *IsFast = true;
1519  }
1520 
1521  return Allow;
1522 }
1523 
1525  const MemOp &Op, const AttributeList &FuncAttributes) const {
1526  // FIXME: Should account for address space here.
1527 
1528  // The default fallback uses the private pointer size as a guess for a type to
1529  // use. Make sure we switch these to 64-bit accesses.
1530 
1531  if (Op.size() >= 16 &&
1532  Op.isDstAligned(Align(4))) // XXX: Should only do for global
1533  return MVT::v4i32;
1534 
1535  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1536  return MVT::v2i32;
1537 
1538  // Use the default.
1539  return MVT::Other;
1540 }
1541 
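// MONoClobber is a target-specific MachineMemOperand flag (declared alongside
// the SI instruction info) marking a load whose memory is known not to be
// clobbered on any path from the function entry; such a load can then be
// treated like a uniform, constant load during selection.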
1543  const MemSDNode *MemNode = cast<MemSDNode>(N);
1544  return MemNode->getMemOperand()->getFlags() & MONoClobber;
1545 }
1546 
1548  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1550 }
1551 
1553  unsigned DestAS) const {
1554  // Flat -> private/local is a simple truncate.
1555  // Flat -> global is no-op
1556  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1557  return true;
1558 
1559  const GCNTargetMachine &TM =
1560  static_cast<const GCNTargetMachine &>(getTargetMachine());
1561  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1562 }
1563 
1565  const MemSDNode *MemNode = cast<MemSDNode>(N);
1566 
1567  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1568 }
1569 
1572  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1576 }
1577 
1579  Type *Ty) const {
1580  // FIXME: Could be smarter if called for vector constants.
1581  return true;
1582 }
1583 
1585  unsigned Index) const {
1587  return false;
1588 
1589  // TODO: Add more cases that are cheap.
1590  return Index == 0;
1591 }
1592 
1594  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1595  switch (Op) {
1596  case ISD::LOAD:
1597  case ISD::STORE:
1598 
1599  // These operations are done with 32-bit instructions anyway.
1600  case ISD::AND:
1601  case ISD::OR:
1602  case ISD::XOR:
1603  case ISD::SELECT:
1604  // TODO: Extensions?
1605  return true;
1606  default:
1607  return false;
1608  }
1609  }
1610 
1611  // SimplifySetCC uses this function to determine whether or not it should
1612  // create setcc with i1 operands. We don't have instructions for i1 setcc.
1613  if (VT == MVT::i1 && Op == ISD::SETCC)
1614  return false;
1615 
1617 }
1618 
1619 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1620  const SDLoc &SL,
1621  SDValue Chain,
1622  uint64_t Offset) const {
1623  const DataLayout &DL = DAG.getDataLayout();
1624  MachineFunction &MF = DAG.getMachineFunction();
1626 
1627  const ArgDescriptor *InputPtrReg;
1628  const TargetRegisterClass *RC;
1629  LLT ArgTy;
1631 
1632  std::tie(InputPtrReg, RC, ArgTy) =
1634 
1635  // We may not have the kernarg segment argument if we have no kernel
1636  // arguments.
1637  if (!InputPtrReg)
1638  return DAG.getConstant(0, SL, PtrVT);
1639 
1641  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1642  MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1643 
1644  return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset));
1645 }
1646 
1647 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1648  const SDLoc &SL) const {
1650  FIRST_IMPLICIT);
1651  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1652 }
1653 
1654 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1655  const SDLoc &SL, SDValue Val,
1656  bool Signed,
1657  const ISD::InputArg *Arg) const {
1658  // First, if it is a widened vector, narrow it.
1659  if (VT.isVector() &&
1660  VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1661  EVT NarrowedVT =
1663  VT.getVectorNumElements());
1664  Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1665  DAG.getConstant(0, SL, MVT::i32));
1666  }
1667 
1668  // Then convert the vector elements or scalar value.
1669  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1670  VT.bitsLT(MemVT)) {
1671  unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1672  Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1673  }
1674 
1675  if (MemVT.isFloatingPoint())
1676  Val = getFPExtOrFPRound(DAG, Val, SL, VT);
1677  else if (Signed)
1678  Val = DAG.getSExtOrTrunc(Val, SL, VT);
1679  else
1680  Val = DAG.getZExtOrTrunc(Val, SL, VT);
1681 
1682  return Val;
1683 }
1684 
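// Kernel arguments live in the constant kernarg segment and are addressed
// relative to the kernarg segment pointer. lowerKernargMemParameter() below
// loads from that segment, and for sub-dword, under-aligned arguments it
// instead performs an aligned 4-byte load followed by a shift and truncate so
// that neighbouring argument loads can be merged.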
1685 SDValue SITargetLowering::lowerKernargMemParameter(
1686  SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
1687  uint64_t Offset, Align Alignment, bool Signed,
1688  const ISD::InputArg *Arg) const {
1690 
1691  // Try to avoid using an extload by loading earlier than the argument address,
1692  // and extracting the relevant bits. The load should hopefully be merged with
1693  // the previous argument.
1694  if (MemVT.getStoreSize() < 4 && Alignment < 4) {
1695  // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1696  int64_t AlignDownOffset = alignDown(Offset, 4);
1697  int64_t OffsetDiff = Offset - AlignDownOffset;
1698 
1699  EVT IntVT = MemVT.changeTypeToInteger();
1700 
1701  // TODO: If we passed in the base kernel offset we could have a better
1702  // alignment than 4, but we don't really need it.
1703  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1704  SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
1707 
1708  SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1709  SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1710 
1711  SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1712  ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1713  ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1714 
1715 
1716  return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1717  }
1718 
1719  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1720  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
1723 
1724  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1725  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1726 }
1727 
1728 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1729  const SDLoc &SL, SDValue Chain,
1730  const ISD::InputArg &Arg) const {
1731  MachineFunction &MF = DAG.getMachineFunction();
1732  MachineFrameInfo &MFI = MF.getFrameInfo();
1733 
1734  if (Arg.Flags.isByVal()) {
1735  unsigned Size = Arg.Flags.getByValSize();
1736  int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1737  return DAG.getFrameIndex(FrameIdx, MVT::i32);
1738  }
1739 
1740  unsigned ArgOffset = VA.getLocMemOffset();
1741  unsigned ArgSize = VA.getValVT().getStoreSize();
1742 
1743  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1744 
1745  // Create load nodes to retrieve arguments from the stack.
1746  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1747  SDValue ArgValue;
1748 
1749  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
1751  MVT MemVT = VA.getValVT();
1752 
1753  switch (VA.getLocInfo()) {
1754  default:
1755  break;
1756  case CCValAssign::BCvt:
1757  MemVT = VA.getLocVT();
1758  break;
1759  case CCValAssign::SExt:
1760  ExtType = ISD::SEXTLOAD;
1761  break;
1762  case CCValAssign::ZExt:
1763  ExtType = ISD::ZEXTLOAD;
1764  break;
1765  case CCValAssign::AExt:
1766  ExtType = ISD::EXTLOAD;
1767  break;
1768  }
1769 
1770  ArgValue = DAG.getExtLoad(
1771  ExtType, SL, VA.getLocVT(), Chain, FIN,
1773  MemVT);
1774  return ArgValue;
1775 }
1776 
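// getPreloadedValue() materializes one of the special ABI inputs (dispatch
// pointer, queue pointer, work-group IDs, ...) as a live-in register. If the
// value was not preloaded, it returns a null pointer for the kernarg segment
// pointer and undef otherwise, matching the amdgpu-no-* attribute contract
// mentioned in the comments below.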
1777 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1778  const SIMachineFunctionInfo &MFI,
1779  EVT VT,
1781  const ArgDescriptor *Reg;
1782  const TargetRegisterClass *RC;
1783  LLT Ty;
1784 
1785  std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
1786  if (!Reg) {
1787  if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
1788  // It's possible for a kernarg intrinsic call to appear in a kernel with
1789  // no allocated segment, in which case we do not add the user sgpr
1790  // argument, so just return null.
1791  return DAG.getConstant(0, SDLoc(), VT);
1792  }
1793 
1794  // It's undefined behavior if a function marked with the amdgpu-no-*
1795  // attributes uses the corresponding intrinsic.
1796  return DAG.getUNDEF(VT);
1797  }
1798 
1799  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1800 }
1801 
1803  CallingConv::ID CallConv,
1805  FunctionType *FType,
1807  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1808  const ISD::InputArg *Arg = &Ins[I];
1809 
1810  assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1811  "vector type argument should have been split");
1812 
1813  // First check if it's a PS input addr.
1814  if (CallConv == CallingConv::AMDGPU_PS &&
1815  !Arg->Flags.isInReg() && PSInputNum <= 15) {
1816  bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1817 
1818  // Inconveniently only the first part of the split is marked as isSplit,
1819  // so skip to the end. We only want to increment PSInputNum once for the
1820  // entire split argument.
1821  if (Arg->Flags.isSplit()) {
1822  while (!Arg->Flags.isSplitEnd()) {
1823  assert((!Arg->VT.isVector() ||
1824  Arg->VT.getScalarSizeInBits() == 16) &&
1825  "unexpected vector split in ps argument type");
1826  if (!SkipArg)
1827  Splits.push_back(*Arg);
1828  Arg = &Ins[++I];
1829  }
1830  }
1831 
1832  if (SkipArg) {
1833  // We can safely skip PS inputs.
1834  Skipped.set(Arg->getOrigArgIndex());
1835  ++PSInputNum;
1836  continue;
1837  }
1838 
1839  Info->markPSInputAllocated(PSInputNum);
1840  if (Arg->Used)
1841  Info->markPSInputEnabled(PSInputNum);
1842 
1843  ++PSInputNum;
1844  }
1845 
1846  Splits.push_back(*Arg);
1847  }
1848 }
1849 
1850 // Allocate special inputs passed in VGPRs.
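// On subtargets with packed work-item IDs, X, Y and Z share VGPR0 as the
// bitfields [9:0], [19:10] and [29:20] (hence the 0x3ff masks below);
// otherwise they are passed in VGPR0, VGPR1 and VGPR2 respectively.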
1852  MachineFunction &MF,
1853  const SIRegisterInfo &TRI,
1854  SIMachineFunctionInfo &Info) const {
1855  const LLT S32 = LLT::scalar(32);
1857 
1858  if (Info.hasWorkItemIDX()) {
1859  Register Reg = AMDGPU::VGPR0;
1860  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1861 
1862  CCInfo.AllocateReg(Reg);
1863  unsigned Mask = (Subtarget->hasPackedTID() &&
1864  Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
1865  Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
1866  }
1867 
1868  if (Info.hasWorkItemIDY()) {
1869  assert(Info.hasWorkItemIDX());
1870  if (Subtarget->hasPackedTID()) {
1871  Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1872  0x3ff << 10));
1873  } else {
1874  unsigned Reg = AMDGPU::VGPR1;
1875  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1876 
1877  CCInfo.AllocateReg(Reg);
1878  Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1879  }
1880  }
1881 
1882  if (Info.hasWorkItemIDZ()) {
1883  assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
1884  if (Subtarget->hasPackedTID()) {
1885  Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1886  0x3ff << 20));
1887  } else {
1888  unsigned Reg = AMDGPU::VGPR2;
1889  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1890 
1891  CCInfo.AllocateReg(Reg);
1892  Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1893  }
1894  }
1895 }
1896 
1897 // Try to allocate a VGPR at the end of the argument list, or, if no argument
1898 // VGPRs are left, allocate a stack slot instead.
1899 // If \p Mask is given it indicates the bitfield position in the register.
1900 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
1901 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
1903  if (Arg.isSet())
1905 
1906  ArrayRef<MCPhysReg> ArgVGPRs
1907  = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1908  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1909  if (RegIdx == ArgVGPRs.size()) {
1910  // Spill to stack required.
1911  int64_t Offset = CCInfo.AllocateStack(4, Align(4));
1912 
1913  return ArgDescriptor::createStack(Offset, Mask);
1914  }
1915 
1916  unsigned Reg = ArgVGPRs[RegIdx];
1917  Reg = CCInfo.AllocateReg(Reg);
1918  assert(Reg != AMDGPU::NoRegister);
1919 
1920  MachineFunction &MF = CCInfo.getMachineFunction();
1921  Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1922  MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
1924 }
1925 
1927  const TargetRegisterClass *RC,
1928  unsigned NumArgRegs) {
1929  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1930  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1931  if (RegIdx == ArgSGPRs.size())
1932  report_fatal_error("ran out of SGPRs for arguments");
1933 
1934  unsigned Reg = ArgSGPRs[RegIdx];
1935  Reg = CCInfo.AllocateReg(Reg);
1936  assert(Reg != AMDGPU::NoRegister);
1937 
1938  MachineFunction &MF = CCInfo.getMachineFunction();
1939  MF.addLiveIn(Reg, RC);
1941 }
1942 
1943 // If this has a fixed position, we still should allocate the register in the
1944 // CCInfo state. Technically we could get away with this for values passed
1945 // outside of the normal argument range.
1947  const TargetRegisterClass *RC,
1948  MCRegister Reg) {
1949  Reg = CCInfo.AllocateReg(Reg);
1950  assert(Reg != AMDGPU::NoRegister);
1951  MachineFunction &MF = CCInfo.getMachineFunction();
1952  MF.addLiveIn(Reg, RC);
1953 }
1954 
1956  if (Arg) {
1957  allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
1958  Arg.getRegister());
1959  } else
1960  Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1961 }
1962 
1964  if (Arg) {
1965  allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
1966  Arg.getRegister());
1967  } else
1968  Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1969 }
1970 
1971 /// Allocate implicit function VGPR arguments at the end of allocated user
1972 /// arguments.
1974  CCState &CCInfo, MachineFunction &MF,
1975  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
1976  const unsigned Mask = 0x3ff;
1978 
1979  if (Info.hasWorkItemIDX()) {
1980  Arg = allocateVGPR32Input(CCInfo, Mask);
1981  Info.setWorkItemIDX(Arg);
1982  }
1983 
1984  if (Info.hasWorkItemIDY()) {
1985  Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
1986  Info.setWorkItemIDY(Arg);
1987  }
1988 
1989  if (Info.hasWorkItemIDZ())
1990  Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
1991 }
1992 
1993 /// Allocate implicit function VGPR arguments in fixed registers.
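// Here all three work-item IDs share a single fixed register (VGPR31), using
// the same 10-bit fields as the packed layout above.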
1995  CCState &CCInfo, MachineFunction &MF,
1996  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
1997  Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
1998  if (!Reg)
1999  report_fatal_error("failed to allocated VGPR for implicit arguments");
2000 
2001  const unsigned Mask = 0x3ff;
2002  Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2003  Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2004  Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2005 }
2006 
2008  CCState &CCInfo,
2009  MachineFunction &MF,
2010  const SIRegisterInfo &TRI,
2011  SIMachineFunctionInfo &Info) const {
2012  auto &ArgInfo = Info.getArgInfo();
2013 
2014  // TODO: Unify handling with private memory pointers.
2015  if (Info.hasDispatchPtr())
2016  allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2017 
2018  if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5)
2019  allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2020 
2021  // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2022  // constant offset from the kernarg segment.
2023  if (Info.hasImplicitArgPtr())
2024  allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2025 
2026  if (Info.hasDispatchID())
2027  allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2028 
2029  // flat_scratch_init is not applicable for non-kernel functions.
2030 
2031  if (Info.hasWorkGroupIDX())
2032  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2033 
2034  if (Info.hasWorkGroupIDY())
2035  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2036 
2037  if (Info.hasWorkGroupIDZ())
2038  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2039 }
2040 
2041 // Allocate special inputs passed in user SGPRs.
2043  MachineFunction &MF,
2044  const SIRegisterInfo &TRI,
2045  SIMachineFunctionInfo &Info) const {
2046  if (Info.hasImplicitBufferPtr()) {
2047  Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2048  MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2049  CCInfo.AllocateReg(ImplicitBufferPtrReg);
2050  }
2051 
2052  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2053  if (Info.hasPrivateSegmentBuffer()) {
2054  Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2055  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2056  CCInfo.AllocateReg(PrivateSegmentBufferReg);
2057  }
2058 
2059  if (Info.hasDispatchPtr()) {
2060  Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2061  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2062  CCInfo.AllocateReg(DispatchPtrReg);
2063  }
2064 
2065  if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
2066  Register QueuePtrReg = Info.addQueuePtr(TRI);
2067  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2068  CCInfo.AllocateReg(QueuePtrReg);
2069  }
2070 
2071  if (Info.hasKernargSegmentPtr()) {
2073  Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2074  CCInfo.AllocateReg(InputPtrReg);
2075 
2076  Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2078  }
2079 
2080  if (Info.hasDispatchID()) {
2081  Register DispatchIDReg = Info.addDispatchID(TRI);
2082  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2083  CCInfo.AllocateReg(DispatchIDReg);
2084  }
2085 
2086  if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2087  Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2088  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2089  CCInfo.AllocateReg(FlatScratchInitReg);
2090  }
2091 
2092  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2093  // these from the dispatch pointer.
2094 }
2095 
2096 // Allocate special input registers that are initialized per-wave.
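// These system SGPRs (work-group IDs, the work-group info word, and the
// private segment wave byte offset) are allocated after all user SGPRs, in the
// order the hardware sets them up for the wave.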
2098  MachineFunction &MF,
2100  CallingConv::ID CallConv,
2101  bool IsShader) const {
2102  if (Info.hasWorkGroupIDX()) {
2103  Register Reg = Info.addWorkGroupIDX();
2104  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2105  CCInfo.AllocateReg(Reg);
2106  }
2107 
2108  if (Info.hasWorkGroupIDY()) {
2109  Register Reg = Info.addWorkGroupIDY();
2110  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2111  CCInfo.AllocateReg(Reg);
2112  }
2113 
2114  if (Info.hasWorkGroupIDZ()) {
2115  Register Reg = Info.addWorkGroupIDZ();
2116  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2117  CCInfo.AllocateReg(Reg);
2118  }
2119 
2120  if (Info.hasWorkGroupInfo()) {
2121  Register Reg = Info.addWorkGroupInfo();
2122  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2123  CCInfo.AllocateReg(Reg);
2124  }
2125 
2126  if (Info.hasPrivateSegmentWaveByteOffset()) {
2127  // Scratch wave offset passed in system SGPR.
2128  unsigned PrivateSegmentWaveByteOffsetReg;
2129 
2130  if (IsShader) {
2131  PrivateSegmentWaveByteOffsetReg =
2132  Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2133 
2134  // This is true if the scratch wave byte offset doesn't have a fixed
2135  // location.
2136  if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2137  PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2138  Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2139  }
2140  } else
2141  PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2142 
2143  MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2144  CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2145  }
2146 }
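// The routine below reserves the scratch resource descriptor register and
// chooses the stack pointer (SGPR32) and, when a frame pointer is needed, the
// frame pointer (SGPR33) for entry functions, once it is known whether any
// stack access is actually required.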
2147 
2149  MachineFunction &MF,
2150  const SIRegisterInfo &TRI,
2152  // Now that we've figured out where the scratch register inputs are, see if
2153  // we should reserve the arguments and use them directly.
2154  MachineFrameInfo &MFI = MF.getFrameInfo();
2155  bool HasStackObjects = MFI.hasStackObjects();
2156  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2157 
2158  // Record that we know we have non-spill stack objects so we don't need to
2159  // check all stack objects later.
2160  if (HasStackObjects)
2161  Info.setHasNonSpillStackObjects(true);
2162 
2163  // Everything live out of a block is spilled with fast regalloc, so it's
2164  // almost certain that spilling will be required.
2165  if (TM.getOptLevel() == CodeGenOpt::None)
2166  HasStackObjects = true;
2167 
2168  // For now assume stack access is needed in any callee functions, so we need
2169  // the scratch registers to pass in.
2170  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2171 
2172  if (!ST.enableFlatScratch()) {
2173  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2174  // If we have stack objects, we unquestionably need the private buffer
2175  // resource. For the Code Object V2 ABI, this will be the first 4 user
2176  // SGPR inputs. We can reserve those and use them directly.
2177 
2178  Register PrivateSegmentBufferReg =
2179  Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2180  Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2181  } else {
2182  unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2183  // We tentatively reserve the last registers (skipping the last registers
2184  // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2185  // we'll replace these with the ones immediately after those which were
2186  // really allocated. In the prologue copies will be inserted from the
2187  // argument to these reserved registers.
2188 
2189  // Without HSA, relocations are used for the scratch pointer and the
2190  // buffer resource setup is always inserted in the prologue. Scratch wave
2191  // offset is still in an input SGPR.
2192  Info.setScratchRSrcReg(ReservedBufferReg);
2193  }
2194  }
2195 
2196  MachineRegisterInfo &MRI = MF.getRegInfo();
2197 
2198  // For entry functions we have to set up the stack pointer if we use it,
2199  // whereas non-entry functions get this "for free". This means there is no
2200  // intrinsic advantage to using S32 over S34 in cases where we do not have
2201  // calls but do need a frame pointer (i.e. if we are requested to have one
2202  // because frame pointer elimination is disabled). To keep things simple we
2203  // only ever use S32 as the call ABI stack pointer, and so using it does not
2204  // imply we need a separate frame pointer.
2205  //
2206  // Try to use s32 as the SP, but move it if it would interfere with input
2207  // arguments. This won't work with calls though.
2208  //
2209  // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2210  // registers.
2211  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2212  Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2213  } else {
2214  assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
2215 
2216  if (MFI.hasCalls())
2217  report_fatal_error("call in graphics shader with too many input SGPRs");
2218 
2219  for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2220  if (!MRI.isLiveIn(Reg)) {
2221  Info.setStackPtrOffsetReg(Reg);
2222  break;
2223  }
2224  }
2225 
2226  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2227  report_fatal_error("failed to find register for SP");
2228  }
2229 
2230  // hasFP should be accurate for entry functions even before the frame is
2231  // finalized, because it does not rely on the known stack size, only
2232  // properties like whether variable sized objects are present.
2233  if (ST.getFrameLowering()->hasFP(MF)) {
2234  Info.setFrameOffsetReg(AMDGPU::SGPR33);
2235  }
2236 }
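// [Editorial aside -- illustrative sketch, not part of this file.] Summary of
// the register roles this function settles on for an entry function; the
// struct below is ours, purely for illustration:
struct EntryFunctionScratchRegs {
  unsigned ScratchRSrc;  // 128-bit private buffer resource, only when flat
                         // scratch is not used (first 4 user SGPRs on HSA)
  unsigned StackPtr;     // s32 when it is not already a live-in argument,
                         // otherwise the first SGPR that is not live in
  unsigned FrameOffset;  // s33, but only when hasFP() requires a frame pointer
};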
2237 
2238 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
2239  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2240  return !Info->isEntryFunction();
2241 }
2242 
2243 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
2244 
2245 }
2246 
2247 void SITargetLowering::insertCopiesSplitCSR(
2248  MachineBasicBlock *Entry,
2249  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2250  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2251 
2252  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2253  if (!IStart)
2254  return;
2255 
2256  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2257  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2258  MachineBasicBlock::iterator MBBI = Entry->begin();
2259  for (const MCPhysReg *I = IStart; *I; ++I) {
2260  const TargetRegisterClass *RC = nullptr;
2261  if (AMDGPU::SReg_64RegClass.contains(*I))
2262  RC = &AMDGPU::SGPR_64RegClass;
2263  else if (AMDGPU::SReg_32RegClass.contains(*I))
2264  RC = &AMDGPU::SGPR_32RegClass;
2265  else
2266  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2267 
2268  Register NewVR = MRI->createVirtualRegister(RC);
2269  // Create copy from CSR to a virtual register.
2270  Entry->addLiveIn(*I);
2271  BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2272  .addReg(*I);
2273 
2274  // Insert the copy-back instructions right before the terminator.
2275  for (auto *Exit : Exits)
2276  BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2277  TII->get(TargetOpcode::COPY), *I)
2278  .addReg(NewVR);
2279  }
2280 }
2281 
2282 SDValue SITargetLowering::LowerFormalArguments(
2283  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2284  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2285  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2286  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2287 
2288  MachineFunction &MF = DAG.getMachineFunction();
2289  const Function &Fn = MF.getFunction();
2290  FunctionType *FType = MF.getFunction().getFunctionType();
2291  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2292 
2293  if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2294  DiagnosticInfoUnsupported NoGraphicsHSA(
2295  Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2296  DAG.getContext()->diagnose(NoGraphicsHSA);
2297  return DAG.getEntryNode();
2298  }
2299 
2300  Info->allocateModuleLDSGlobal(Fn);
2301 
2302  SmallVector<ISD::InputArg, 16> Splits;
2303  SmallVector<CCValAssign, 16> ArgLocs;
2304  BitVector Skipped(Ins.size());
2305  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2306  *DAG.getContext());
2307 
2308  bool IsGraphics = AMDGPU::isGraphics(CallConv);
2309  bool IsKernel = AMDGPU::isKernel(CallConv);
2310  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2311 
2312  if (IsGraphics) {
2313  assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
2314  (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
2315  !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2316  !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
2317  !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
2318  !Info->hasWorkItemIDZ());
2319  }
2320 
2321  if (CallConv == CallingConv::AMDGPU_PS) {
2322  processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2323 
2324  // At least one interpolation mode must be enabled or else the GPU will
2325  // hang.
2326  //
2327  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2328  // set PSInputAddr, the user wants to enable some bits after the compilation
2329  // based on run-time states. Since we can't know what the final PSInputEna
2330  // will look like, we shouldn't do anything here; the user should take
2331  // responsibility for the correct programming.
2332  //
2333  // Otherwise, the following restrictions apply:
2334  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2335  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2336  // enabled too.
2337  if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2338  ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2339  CCInfo.AllocateReg(AMDGPU::VGPR0);
2340  CCInfo.AllocateReg(AMDGPU::VGPR1);
2341  Info->markPSInputAllocated(0);
2342  Info->markPSInputEnabled(0);
2343  }
2344  if (Subtarget->isAmdPalOS()) {
2345  // For isAmdPalOS, the user does not enable some bits after compilation
2346  // based on run-time states; the register values being generated here are
2347  // the final ones set in hardware. Therefore we need to apply the
2348  // workaround to PSInputAddr and PSInputEnable together. (The case where
2349  // a bit is set in PSInputAddr but not PSInputEnable is where the
2350  // frontend set up an input arg for a particular interpolation mode, but
2351  // nothing uses that input arg. Really we should have an earlier pass
2352  // that removes such an arg.)
2353  unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2354  if ((PsInputBits & 0x7F) == 0 ||
2355  ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2356  Info->markPSInputEnabled(
2357  countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
2358  }
2359  } else if (IsKernel) {
2360  assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2361  } else {
2362  Splits.append(Ins.begin(), Ins.end());
2363  }
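// [Editorial aside -- illustrative sketch, not part of this file.] A
// standalone check equivalent to the PSInputAddr workaround above: at least
// one PERSP_* (bits 0-3) or LINEAR_* (bits 4-6) mode must be enabled, and if
// only POS_W_FLOAT (bit 11) is set, PERSP_* must be enabled as well;
// otherwise the hardware-visible enable mask would hang the GPU. The helper
// name is ours, not LLVM API:
static bool needsForcedInterpMode(unsigned PSInputBits) {
  bool AnyInterp = (PSInputBits & 0x7F) != 0;        // any PERSP_* or LINEAR_*
  bool OnlyPosW  = (PSInputBits & 0xF) == 0 &&       // no PERSP_* enabled
                   ((PSInputBits >> 11) & 1) != 0;   // but POS_W_FLOAT is
  return !AnyInterp || OnlyPosW;
}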
2364 
2365  if (IsEntryFunc) {
2366  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2367  allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2368  } else if (!IsGraphics) {
2369  // For the fixed ABI, pass workitem IDs in the last argument register.
2370  allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2371  }
2372 
2373  if (IsKernel) {
2374  analyzeFormalArgumentsCompute(CCInfo, Ins);
2375  } else {
2376  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2377  CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2378  }
2379 
2380  SmallVector<SDValue, 16> Chains;
2381 
2382  // FIXME: This is the minimum kernel argument alignment. We should improve
2383  // this to the maximum alignment of the arguments.
2384  //
2385  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2386  // kern arg offset.
2387  const Align KernelArgBaseAlign = Align(16);
2388 
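// [Editorial aside -- worked example, not part of this file.] What
// commonAlignment(KernelArgBaseAlign, Offset) evaluates to for a few offsets,
// i.e. the offset's own alignment capped at the 16-byte base (offset 0 is
// treated as fully aligned):
//   Offset  0 -> Align(16)
//   Offset  4 -> Align(4)
//   Offset  6 -> Align(2)
//   Offset  8 -> Align(8)
//   Offset 24 -> Align(8)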
2389  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2390  const ISD::InputArg &Arg = Ins[i];
2391  if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2392  InVals.push_back(DAG.getUNDEF(Arg.VT));
2393  continue;
2394  }
2395 
2396  CCValAssign &VA = ArgLocs[ArgIdx++];
2397  MVT VT = VA.getLocVT();
2398 
2399  if (IsEntryFunc && VA.isMemLoc()) {
2400  VT = Ins[i].VT;
2401  EVT MemVT = VA.getLocVT();
2402 
2403  const uint64_t Offset = VA.getLocMemOffset();
2404  Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2405 
2406  if (Arg.Flags.isByRef()) {
2407  SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2408 
2409  const GCNTargetMachine &TM =
2410  static_cast<const GCNTargetMachine &>(getTargetMachine());
2411  if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2412  Arg.Flags.getPointerAddrSpace())) {
2413  Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
2414  Arg.Flags.getPointerAddrSpace());
2415  }
2416 
2417  InVals.push_back(Ptr);
2418  continue;
2419  }
2420 
2421  SDValue Arg = lowerKernargMemParameter(
2422  DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2423  Chains.push_back(Arg.getValue(1));
2424 
2425  auto *ParamTy =
2426  dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2427  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2428  ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2429  ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2430  // On SI local pointers are just offsets into LDS, so they are always
2431  // less than 16-bits. On CI and newer they could potentially be
2432  // real pointers, so we can't guarantee their size.
2433  Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2434  DAG.getValueType(MVT::i16));
2435  }
2436 
2437  InVals.push_back(Arg);
2438  continue;
2439  } else if (!IsEntryFunc && VA.isMemLoc()) {
2440  SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2441  InVals.push_back(Val);
2442  if (!Arg.Flags.isByVal())
2443  Chains.push_back(Val.getValue(1));
2444  continue;
2445  }
2446 
2447  assert(VA.isRegLoc() && "Parameter must be in a register!");
2448 
2449  Register Reg = VA.getLocReg();
2450  const TargetRegisterClass *RC = nullptr;
2451  if (AMDGPU::VGPR_32RegClass.contains(Reg))
2452  RC = &AMDGPU::VGPR_32RegClass;
2453  else if (AMDGPU::SGPR_32RegClass.contains(Reg))
2454  RC = &AMDGPU::SGPR_32RegClass;
2455  else
2456  llvm_unreachable("Unexpected register class in LowerFormalArguments!");
2457  EVT ValVT = VA.getValVT();
2458 
2459  Reg = MF.addLiveIn(Reg, RC);
2460  SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2461 
2462  if (Arg.Flags.isSRet()) {
2463  // The return object should be reasonably addressable.
2464 
2465  // FIXME: This helps when the return is a real sret. If it is an
2466  // automatically inserted sret (i.e. CanLowerReturn returns false), an
2467  // extra copy is inserted in SelectionDAGBuilder which obscures this.
2468  unsigned NumBits
2469  = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
2470  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2471  DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2472  }
2473 
2474  // If this is an 8 or 16-bit value, it is really passed promoted
2475  // to 32 bits. Insert an assert[sz]ext to capture this, then
2476  // truncate to the right size.
2477  switch (VA.getLocInfo()) {
2478  case CCValAssign::Full:
2479  break;
2480  case CCValAssign::BCvt:
2481  Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2482  break;
2483  case CCValAssign::SExt:
2484  Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2485  DAG.getValueType(ValVT));
2486  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2487  break;
2488  case CCValAssign::ZExt:
2489  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2490  DAG.getValueType(ValVT));
2491  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2492  break;
2493  case CCValAssign::AExt:
2494  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2495  break;
2496  default:
2497  llvm_unreachable("Unknown loc info!");
2498  }
2499 
2500  InVals.push_back(Val);
2501  }
2502 
2503  // Start adding system SGPRs.
2504  if (IsEntryFunc) {
2505  allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
2506  } else {
2507  CCInfo.AllocateReg(Info->getScratchRSrcReg());
2508  if (!IsGraphics)
2509  allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2510  }
2511 
2512  auto &ArgUsageInfo =
2513  DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2514  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2515 
2516  unsigned StackArgSize = CCInfo.getNextStackOffset();
2517  Info->setBytesInStackArgArea(StackArgSize);
2518 
2519  return Chains.empty() ? Chain :
2520  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2521 }
2522 
2523 // TODO: If return values can't fit in registers, we should return as many as
2524 // possible in registers before passing on stack.
2525 bool SITargetLowering::CanLowerReturn(
2526  CallingConv::ID CallConv,
2527  MachineFunction &MF, bool IsVarArg,
2528  const SmallVectorImpl<ISD::OutputArg> &Outs,
2529  LLVMContext &Context) const {
2530  // Replacing returns with sret/stack usage doesn't make sense for shaders.
2531  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2532  // for shaders. Vector types should be explicitly handled by CC.
2533  if (AMDGPU::isEntryFunctionCC(CallConv))
2534  return true;
2535 
2536  SmallVector<CCValAssign, 16> RVLocs;
2537  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2538  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2539 }
2540 
2541 SDValue
2542 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2543  bool isVarArg,
2544  const SmallVectorImpl<ISD::OutputArg> &Outs,
2545  const SmallVectorImpl<SDValue> &OutVals,
2546  const SDLoc &DL, SelectionDAG &DAG) const {
2547  MachineFunction &MF = DAG.getMachineFunction();
2548  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2549 
2550  if (AMDGPU::isKernel(CallConv)) {
2551  return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2552  OutVals, DL, DAG);
2553  }
2554 
2555  bool IsShader = AMDGPU::isShader(CallConv);
2556 
2557  Info->setIfReturnsVoid(Outs.empty());
2558  bool IsWaveEnd = Info->returnsVoid() && IsShader;
2559 
2560  // CCValAssign - represent the assignment of the return value to a location.
2561  SmallVector<CCValAssign, 48> RVLocs;
2562  SmallVector<ISD::OutputArg, 48> Splits;
2563 
2564  // CCState - Info about the registers and stack slots.
2565  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2566  *DAG.getContext());
2567 
2568  // Analyze outgoing return values.
2569  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2570 
2571  SDValue Flag;
2572  SmallVector<SDValue, 48> RetOps;
2573  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2574 
2575  // Copy the result values into the output registers.
2576  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2577  ++I, ++RealRVLocIdx) {
2578  CCValAssign &VA = RVLocs[I];
2579  assert(VA.isRegLoc() && "Can only return in registers!");
2580  // TODO: Partially return in registers if return values don't fit.
2581  SDValue Arg = OutVals[RealRVLocIdx];
2582 
2583  // Copied from other backends.
2584  switch (VA.getLocInfo()) {
2585  case CCValAssign::Full:
2586  break;
2587  case CCValAssign::BCvt:
2588  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2589  break;
2590  case CCValAssign::SExt:
2591  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2592  break;
2593  case CCValAssign::ZExt:
2594  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2595  break;
2596  case CCValAssign::AExt:
2597  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2598  break;
2599  default:
2600  llvm_unreachable("Unknown loc info!");
2601  }
2602 
2603  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2604  Flag = Chain.getValue(1);
2605  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2606  }
2607 
2608  // FIXME: Does sret work properly?
2609  if (!Info->isEntryFunction()) {
2610  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2611  const MCPhysReg *I =
2612  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2613  if (I) {
2614  for (; *I; ++I) {
2615  if (AMDGPU::SReg_64RegClass.contains(*I))
2616  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2617  else if (AMDGPU::SReg_32RegClass.contains(*I))
2618  RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2619  else
2620  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2621  }
2622  }
2623  }
2624 
2625  // Update chain and glue.
2626  RetOps[0] = Chain;
2627  if (Flag.getNode())
2628  RetOps.push_back(Flag);
2629 
2630  unsigned Opc = AMDGPUISD::ENDPGM;
2631  if (!IsWaveEnd)
2632  Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
2633  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2634 }
2635 
2636 SDValue SITargetLowering::LowerCallResult(
2637  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2638  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2639  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2640  SDValue ThisVal) const {
2641  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2642 
2643  // Assign locations to each value returned by this call.
2644  SmallVector<CCValAssign, 16> RVLocs;
2645  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2646  *DAG.getContext());
2647  CCInfo.AnalyzeCallResult(Ins, RetCC);
2648 
2649  // Copy all of the result registers out of their specified physreg.
2650  for (unsigned i = 0; i != RVLocs.size(); ++i) {
2651  CCValAssign VA = RVLocs[i];
2652  SDValue Val;
2653 
2654  if (VA.isRegLoc()) {
2655  Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2656  Chain = Val.getValue(1);
2657  InFlag = Val.getValue(2);
2658  } else if (VA.isMemLoc()) {
2659  report_fatal_error("TODO: return values in memory");
2660  } else
2661  llvm_unreachable("unknown argument location type");
2662 
2663  switch (VA.getLocInfo()) {
2664  case CCValAssign::Full:
2665  break;
2666  case CCValAssign::BCvt:
2667  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2668  break;
2669  case CCValAssign::ZExt:
2670  Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2671  DAG.getValueType(VA.getValVT()));
2672  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2673  break;
2674  case CCValAssign::SExt:
2675  Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2676  DAG.getValueType(VA.getValVT()));
2677  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2678  break;
2679  case CCValAssign::AExt:
2680  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2681  break;
2682  default:
2683  llvm_unreachable("Unknown loc info!");
2684  }
2685 
2686  InVals.push_back(Val);
2687  }
2688 
2689  return Chain;
2690 }
2691 
2692 // Add code to pass the special inputs required by the features in use,
2693 // separate from the explicit user arguments present in the IR.
2694 void SITargetLowering::passSpecialInputs(
2695  CallLoweringInfo &CLI,
2696  CCState &CCInfo,
2697  const SIMachineFunctionInfo &Info,
2698  SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2699  SmallVectorImpl<SDValue> &MemOpChains,
2700  SDValue Chain) const {
2701  // If we don't have a call site, this was a call inserted by
2702  // legalization. These can never use special inputs.
2703  if (!CLI.CB)
2704  return;
2705 
2706  SelectionDAG &DAG = CLI.DAG;
2707  const SDLoc &DL = CLI.DL;
2708  const Function &F = DAG.getMachineFunction().getFunction();
2709 
2710  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2711  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2712 
2713  const AMDGPUFunctionArgInfo *CalleeArgInfo
2714  = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
2715  if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
2716  auto &ArgUsageInfo =
2717  DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2718  CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2720 
2721  // TODO: Unify with private memory register handling. This is complicated by
2722  // the fact that at least in kernels, the input argument is not necessarily
2723  // in the same location as the input.
2724  static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
2725  StringLiteral> ImplicitAttrs[] = {
2726  {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
2727  {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
2728  {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
2729  {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
2730  {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
2731  {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
2732  {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"}
2733  };
2734 
2735  for (auto Attr : ImplicitAttrs) {
2736  const ArgDescriptor *OutgoingArg;
2737  const TargetRegisterClass *ArgRC;
2738  LLT ArgTy;
2739 
2740  AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
2741 
2742  // If the callee does not use the attribute value, skip copying the value.
2743  if (CLI.CB->hasFnAttr(Attr.second))
2744  continue;
2745 
2746  std::tie(OutgoingArg, ArgRC, ArgTy) =
2747  CalleeArgInfo->getPreloadedValue(InputID);
2748  if (!OutgoingArg)
2749  continue;
2750 
2751  const ArgDescriptor *IncomingArg;
2752  const TargetRegisterClass *IncomingArgRC;
2753  LLT Ty;
2754  std::tie(IncomingArg, IncomingArgRC, Ty) =
2755  CallerArgInfo.getPreloadedValue(InputID);
2756  assert(IncomingArgRC == ArgRC);
2757 
2758  // All special arguments are ints for now.
2759  EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2760  SDValue InputReg;
2761 
2762  if (IncomingArg) {
2763  InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2764  } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
2765  // The implicit arg ptr is special because it doesn't have a corresponding
2766  // input for kernels, and is computed from the kernarg segment pointer.
2767  InputReg = getImplicitArgPtr(DAG, DL);
2768  } else {
2769  // We may have proven the input wasn't needed, although the ABI is
2770  // requiring it. We just need to allocate the register appropriately.
2771  InputReg = DAG.getUNDEF(ArgVT);
2772  }
2773 
2774  if (OutgoingArg->isRegister()) {
2775  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2776  if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
2777  report_fatal_error("failed to allocate implicit input argument");
2778  } else {
2779  unsigned SpecialArgOffset =
2780  CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
2781  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2782  SpecialArgOffset);
2783  MemOpChains.push_back(ArgStore);
2784  }
2785  }
2786 
2787  // Pack workitem IDs into a single register, or pass them as is if already
2788  // packed.
2789  const ArgDescriptor *OutgoingArg;
2790  const TargetRegisterClass *ArgRC;
2791  LLT Ty;
2792 
2793  std::tie(OutgoingArg, ArgRC, Ty) =
2794  CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2795  if (!OutgoingArg)
2796  std::tie(OutgoingArg, ArgRC, Ty) =
2797  CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2798  if (!OutgoingArg)
2799  std::tie(OutgoingArg, ArgRC, Ty) =
2800  CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2801  if (!OutgoingArg)
2802  return;
2803 
2804  const ArgDescriptor *IncomingArgX = std::get<0>(
2805  CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
2806  const ArgDescriptor *IncomingArgY = std::get<0>(
2807  CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
2808  const ArgDescriptor *IncomingArgZ = std::get<0>(
2809  CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
2810 
2811  SDValue InputReg;
2812  SDLoc SL;
2813 
2814  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
2815  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
2816  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
2817 
2818  // If incoming ids are not packed we need to pack them.
2819  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
2820  NeedWorkItemIDX) {
2821  if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
2822  InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
2823  } else {
2824  InputReg = DAG.getConstant(0, DL, MVT::i32);
2825  }
2826  }
2827 
2828  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
2829  NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
2830  SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
2831  Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
2832  DAG.getShiftAmountConstant(10, MVT::i32, SL));
2833  InputReg = InputReg.getNode() ?
2834  DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
2835  }
2836 
2837  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
2838  NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
2839  SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
2840  Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
2841  DAG.getShiftAmountConstant(20, MVT::i32, SL));
2842  InputReg = InputReg.getNode() ?
2843  DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
2844  }
2845 
2846  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
2847  if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
2848  // We're in a situation where the outgoing function requires the workitem
2849  // ID, but the calling function does not have it (e.g a graphics function
2850  // calling a C calling convention function). This is illegal, but we need
2851  // to produce something.
2852  InputReg = DAG.getUNDEF(MVT::i32);
2853  } else {
2854  // Workitem IDs are already packed; any present incoming argument
2855  // will carry all required fields.
2856  ArgDescriptor IncomingArg = ArgDescriptor::createArg(
2857  IncomingArgX ? *IncomingArgX :
2858  IncomingArgY ? *IncomingArgY :
2859  *IncomingArgZ, ~0u);
2860  InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
2861  }
2862  }
2863 
2864  if (OutgoingArg->isRegister()) {
2865  if (InputReg)
2866  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2867 
2868  CCInfo.AllocateReg(OutgoingArg->getRegister());
2869  } else {
2870  unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
2871  if (InputReg) {
2872  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2873  SpecialArgOffset);
2874  MemOpChains.push_back(ArgStore);
2875  }
2876  }
2877 }
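// [Editorial aside -- illustrative sketch, not part of this file.] The packed
// workitem ID register built above places X in bits [9:0], Y in bits [19:10]
// and Z in bits [29:20]; a standalone pack/unpack model (helper names are
// ours; the real lowering skips the masks because each ID is known to fit in
// 10 bits):
#include <cstdint>
static uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  return (X & 0x3ff) | ((Y & 0x3ff) << 10) | ((Z & 0x3ff) << 20);
}
static uint32_t unpackWorkItemID(uint32_t Packed, unsigned Dim) {
  return (Packed >> (10 * Dim)) & 0x3ff; // Dim: 0 = X, 1 = Y, 2 = Z
}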
2878 
2879 static bool canGuaranteeTCO(CallingConv::ID CC) {
2880  return CC == CallingConv::Fast;
2881 }
2882 
2883 /// Return true if we might ever do TCO for calls with this calling convention.
2884 static bool mayTailCallThisCC(CallingConv::ID CC) {
2885  switch (CC) {
2886  case CallingConv::C:
2887  case CallingConv::AMDGPU_Gfx:
2888  return true;
2889  default:
2890  return canGuaranteeTCO(CC);
2891  }
2892 }
2893 
2894 bool SITargetLowering::isEligibleForTailCallOptimization(
2895  SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2896  const SmallVectorImpl<ISD::OutputArg> &Outs,
2897  const SmallVectorImpl<SDValue> &OutVals,
2898  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2899  if (!mayTailCallThisCC(CalleeCC))
2900  return false;
2901 
2902  // For a divergent call target, we need to do a waterfall loop over the
2903  // possible callees which precludes us from using a simple jump.
2904  if (Callee->isDivergent())
2905  return false;
2906 
2907  MachineFunction &MF = DAG.getMachineFunction();
2908  const Function &CallerF = MF.getFunction();
2909  CallingConv::ID CallerCC = CallerF.getCallingConv();
2910  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2911  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2912 
2913  // Kernels aren't callable, and don't have a live-in return address, so it
2914  // doesn't make sense to do a tail call with entry functions.
2915  if (!CallerPreserved)
2916  return false;
2917 
2918  bool CCMatch = CallerCC == CalleeCC;
2919 
2920  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2921  if (canGuaranteeTCO(CalleeCC) && CCMatch)
2922  return true;
2923  return false;
2924  }
2925 
2926  // TODO: Can we handle var args?
2927  if (IsVarArg)
2928  return false;
2929 
2930  for (const Argument &Arg : CallerF.args()) {
2931  if (Arg.hasByValAttr())
2932  return false;
2933  }
2934 
2935  LLVMContext &Ctx = *DAG.getContext();
2936 
2937  // Check that the call results are passed in the same way.
2938  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2939  CCAssignFnForCall(CalleeCC, IsVarArg),
2940  CCAssignFnForCall(CallerCC, IsVarArg)))
2941  return false;
2942 
2943  // The callee has to preserve all registers the caller needs to preserve.
2944  if (!CCMatch) {
2945  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2946  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2947  return false;
2948  }
2949 
2950  // Nothing more to check if the callee is taking no arguments.
2951  if (Outs.empty())
2952  return true;
2953 
2954  SmallVector<CCValAssign, 16> ArgLocs;
2955  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2956 
2957  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2958 
2959  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2960  // If the stack arguments for this call do not fit into our own save area then
2961  // the call cannot be made tail.
2962  // TODO: Is this really necessary?
2963  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2964  return false;
2965 
2966  const MachineRegisterInfo &MRI = MF.getRegInfo();
2967  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2968 }
2969 
2970 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2971  if (!CI->isTailCall())
2972  return false;
2973 
2974  const Function *ParentFn = CI->getParent()->getParent();
2975  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2976  return false;
2977  return true;
2978 }
2979 
2980 // The wave scratch offset register is used as the global base pointer.
2981 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2982  SmallVectorImpl<SDValue> &InVals) const {
2983  SelectionDAG &DAG = CLI.DAG;
2984  const SDLoc &DL = CLI.DL;
2985  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2986  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2987  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2988  SDValue Chain = CLI.Chain;
2989  SDValue Callee = CLI.Callee;
2990  bool &IsTailCall = CLI.IsTailCall;
2991  CallingConv::ID CallConv = CLI.CallConv;
2992  bool IsVarArg = CLI.IsVarArg;
2993  bool IsSibCall = false;
2994  bool IsThisReturn = false;
2995  MachineFunction &MF = DAG.getMachineFunction();
2996 
2997  if (Callee.isUndef() || isNullConstant(Callee)) {
2998  if (!CLI.IsTailCall) {
2999  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3000  InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3001  }
3002 
3003  return Chain;
3004  }
3005 
3006  if (IsVarArg) {
3007  return lowerUnhandledCall(CLI, InVals,
3008  "unsupported call to variadic function ");
3009  }
3010 
3011  if (!CLI.CB)
3012  report_fatal_error("unsupported libcall legalization");
3013 
3014  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3015  return lowerUnhandledCall(CLI, InVals,
3016  "unsupported required tail call to function ");
3017  }
3018 
3019  if (AMDGPU::isShader(CallConv)) {
3020  // Note the issue is with the CC of the called function, not of the call
3021  // itself.
3022  return lowerUnhandledCall(CLI, InVals,
3023  "unsupported call to a shader function ");
3024  }
3025 
3026  if (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
3027  CallConv != CallingConv::AMDGPU_Gfx) {
3028  // Only allow calls with specific calling conventions.
3029  return lowerUnhandledCall(CLI, InVals,
3030  "unsupported calling convention for call from "
3031  "graphics shader of function ");
3032  }
3033 
3034  if (IsTailCall) {
3035  IsTailCall = isEligibleForTailCallOptimization(
3036  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3037  if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
3038  report_fatal_error("failed to perform tail call elimination on a call "
3039  "site marked musttail");
3040  }
3041 
3042  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3043 
3044  // A sibling call is one where we're under the usual C ABI and not planning
3045  // to change that but can still do a tail call:
3046  if (!TailCallOpt && IsTailCall)
3047  IsSibCall = true;
3048 
3049  if (IsTailCall)
3050  ++NumTailCalls;
3051  }
3052 
3053  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3054  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3055  SmallVector<SDValue, 8> MemOpChains;
3056 
3057  // Analyze operands of the call, assigning locations to each operand.
3058  SmallVector<CCValAssign, 16> ArgLocs;
3059  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3060  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3061 
3062  if (CallConv != CallingConv::AMDGPU_Gfx) {
3063  // With a fixed ABI, allocate fixed registers before user arguments.
3064  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3065  }
3066 
3067  CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3068 
3069  // Get a count of how many bytes are to be pushed on the stack.
3070  unsigned NumBytes = CCInfo.getNextStackOffset();
3071 
3072  if (IsSibCall) {
3073  // Since we're not changing the ABI to make this a tail call, the memory
3074  // operands are already available in the caller's incoming argument space.
3075  NumBytes = 0;
3076  }
3077 
3078  // FPDiff is the byte offset of the call's argument area from the callee's.
3079  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3080  // by this amount for a tail call. In a sibling call it must be 0 because the
3081  // caller will deallocate the entire stack and the callee still expects its
3082  // arguments to begin at SP+0. Completely unused for non-tail calls.
3083  int32_t FPDiff = 0;
3084  MachineFrameInfo &MFI = MF.getFrameInfo();
3085 
3086  // Adjust the stack pointer for the new arguments...
3087  // These operations are automatically eliminated by the prolog/epilog pass
3088  if (!IsSibCall) {
3089  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3090 
3091  if (!Subtarget->enableFlatScratch()) {
3092  SmallVector<SDValue, 4> CopyFromChains;
3093 
3094  // In the HSA case, this should be an identity copy.
3095  SDValue ScratchRSrcReg
3096  = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3097  RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
3098  CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3099  Chain = DAG.getTokenFactor(DL, CopyFromChains);
3100  }
3101  }
3102 
3103  MVT PtrVT = MVT::i32;
3104 
3105  // Walk the register/memloc assignments, inserting copies/loads.
3106  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3107  CCValAssign &VA = ArgLocs[i];
3108  SDValue Arg = OutVals[i];
3109 
3110  // Promote the value if needed.
3111  switch (VA.getLocInfo()) {
3112  case CCValAssign::Full:
3113  break;
3114  case CCValAssign::BCvt:
3115  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3116  break;
3117  case CCValAssign::ZExt:
3118  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3119  break;
3120  case CCValAssign::SExt:
3121  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3122  break;
3123  case CCValAssign::AExt:
3124  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3125  break;
3126  case CCValAssign::FPExt:
3127  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3128  break;
3129  default:
3130  llvm_unreachable("Unknown loc info!");
3131  }
3132 
3133  if (VA.isRegLoc()) {
3134  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3135  } else {
3136  assert(VA.isMemLoc());
3137 
3138  SDValue DstAddr;
3139  MachinePointerInfo DstInfo;
3140 
3141  unsigned LocMemOffset = VA.getLocMemOffset();
3142  int32_t Offset = LocMemOffset;
3143 
3144  SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3145  MaybeAlign Alignment;
3146 
3147  if (IsTailCall) {
3148  ISD::ArgFlagsTy Flags = Outs[i].Flags;
3149  unsigned OpSize = Flags.isByVal() ?
3150  Flags.getByValSize() : VA.getValVT().getStoreSize();
3151 
3152  // FIXME: We can have better than the minimum byval required alignment.
3153  Alignment =
3154  Flags.isByVal()
3155  ? Flags.getNonZeroByValAlign()
3156  : commonAlignment(Subtarget->getStackAlignment(), Offset);
3157 
3158  Offset = Offset + FPDiff;
3159  int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3160 
3161  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3162  DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3163 
3164  // Make sure any stack arguments overlapping with where we're storing
3165  // are loaded before this eventual operation. Otherwise they'll be
3166  // clobbered.
3167 
3168  // FIXME: Why is this really necessary? This seems to just result in a
3169  // lot of code to copy the stack and write them back to the same
3170  // locations, which are supposed to be immutable?
3171  Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3172  } else {
3173  // Stores to the argument stack area are relative to the stack pointer.
3174  SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3175  MVT::i32);
3176  DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3177  DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3178  Alignment =
3179  commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3180  }
3181 
3182  if (Outs[i].Flags.isByVal()) {
3183  SDValue SizeNode =
3184  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3185  SDValue Cpy =
3186  DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3187  Outs[i].Flags.getNonZeroByValAlign(),
3188  /*isVol = */ false, /*AlwaysInline = */ true,
3189  /*isTailCall = */ false, DstInfo,
3191 
3192  MemOpChains.push_back(Cpy);
3193  } else {
3194  SDValue Store =
3195  DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3196  MemOpChains.push_back(Store);
3197  }
3198  }
3199  }
3200 
3201  if (!MemOpChains.empty())
3202  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3203 
3204  // Build a sequence of copy-to-reg nodes chained together with token chain
3205  // and flag operands which copy the outgoing args into the appropriate regs.
3206  SDValue InFlag;
3207  for (auto &RegToPass : RegsToPass) {
3208  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3209  RegToPass.second, InFlag);
3210  InFlag = Chain.getValue(1);
3211  }
3212 
3213 
3214  // We don't usually want to end the call-sequence here because we would tidy
3215  // the frame up *after* the call, however in the ABI-changing tail-call case
3216  // we've carefully laid out the parameters so that when sp is reset they'll be
3217  // in the correct location.
3218  if (IsTailCall && !IsSibCall) {
3219  Chain = DAG.getCALLSEQ_END(Chain,
3220  DAG.getTargetConstant(NumBytes, DL, MVT::i32),
3221  DAG.getTargetConstant(0, DL, MVT::i32),
3222  InFlag, DL);
3223  InFlag = Chain.getValue(1);
3224  }
3225 
3226  std::vector<SDValue> Ops;
3227  Ops.push_back(Chain);
3228  Ops.push_back(Callee);
3229  // Add a redundant copy of the callee global which will not be legalized, as
3230  // we need direct access to the callee later.
3231  if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3232  const GlobalValue *GV = GSD->getGlobal();
3233  Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3234  } else {
3235  Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3236  }
3237 
3238  if (IsTailCall) {
3239  // Each tail call may have to adjust the stack by a different amount, so
3240  // this information must travel along with the operation for eventual
3241  // consumption by emitEpilogue.
3242  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3243  }
3244 
3245  // Add argument registers to the end of the list so that they are known live
3246  // into the call.
3247  for (auto &RegToPass : RegsToPass) {
3248  Ops.push_back(DAG.getRegister(RegToPass.first,
3249  RegToPass.second.getValueType()));
3250  }
3251 
3252  // Add a register mask operand representing the call-preserved registers.
3253 
3254  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
3255  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3256  assert(Mask && "Missing call preserved mask for calling convention");
3257  Ops.push_back(DAG.getRegisterMask(Mask));
3258 
3259  if (InFlag.getNode())
3260  Ops.push_back(InFlag);
3261 
3262  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3263 
3264  // If we're doing a tail call, use a TC_RETURN here rather than an
3265  // actual call instruction.
3266  if (IsTailCall) {
3267  MFI.setHasTailCall();
3268  return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
3269  }
3270 
3271  // Returns a chain and a flag for retval copy to use.
3272  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3273  Chain = Call.getValue(0);
3274  InFlag = Call.getValue(1);
3275 
3276  uint64_t CalleePopBytes = NumBytes;
3277  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
3278  DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
3279  InFlag, DL);
3280  if (!Ins.empty())
3281  InFlag = Chain.getValue(1);
3282 
3283  // Handle result values, copying them out of physregs into vregs that we
3284  // return.
3285  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3286  InVals, IsThisReturn,
3287  IsThisReturn ? OutVals[0] : SDValue());
3288 }
3289 
3290 // This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3291 // except for applying the wave size scale to the increment amount.
3292 SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
3293  SDValue Op, SelectionDAG &DAG) const {
3294  const MachineFunction &MF = DAG.getMachineFunction();
3295  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3296 
3297  SDLoc dl(Op);
3298  EVT VT = Op.getValueType();
3299  SDValue Tmp1 = Op;
3300  SDValue Tmp2 = Op.getValue(1);
3301  SDValue Tmp3 = Op.getOperand(2);
3302  SDValue Chain = Tmp1.getOperand(0);
3303 
3304  Register SPReg = Info->getStackPtrOffsetReg();
3305 
3306  // Chain the dynamic stack allocation so that it doesn't modify the stack
3307  // pointer when other instructions are using the stack.
3308  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3309 
3310  SDValue Size = Tmp2.getOperand(1);
3311  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3312  Chain = SP.getValue(1);
3313  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3314  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3315  const TargetFrameLowering *TFL = ST.getFrameLowering();
3316  unsigned Opc =
3317  TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3318  ISD::ADD : ISD::SUB;
3319 
3320  SDValue ScaledSize = DAG.getNode(
3321  ISD::SHL, dl, VT, Size,
3322  DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
3323 
3324  Align StackAlign = TFL->getStackAlign();
3325  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3326  if (Alignment && *Alignment > StackAlign) {
3327  Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3328  DAG.getConstant(-(uint64_t)Alignment->value()
3329  << ST.getWavefrontSizeLog2(),
3330  dl, VT));
3331  }
3332 
3333  Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3334  Tmp2 = DAG.getCALLSEQ_END(
3335  Chain, DAG.getIntPtrConstant(0, dl, true),
3336  DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
3337 
3338  return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3339 }
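// [Editorial aside -- illustrative sketch, not part of this file.] The scalar
// stack pointer counts bytes for the whole wave, so a per-lane allocation is
// scaled by the wave size before being added; a standalone model of the
// size/alignment arithmetic above (helper and parameter names are ours; for a
// wave64 subtarget WavefrontSizeLog2 is 6):
#include <cstdint>
static uint32_t bumpWaveSP(uint32_t SP, uint32_t PerLaneSize,
                           uint32_t PerLaneAlign, uint32_t DefaultStackAlign,
                           unsigned WavefrontSizeLog2) {
  uint32_t ScaledSize = PerLaneSize << WavefrontSizeLog2;
  uint32_t NewSP = SP + ScaledSize;        // AMDGPU's stack grows upward
  if (PerLaneAlign > DefaultStackAlign)    // realign only when over-aligned
    NewSP &= ~((PerLaneAlign << WavefrontSizeLog2) - 1);
  return NewSP;
}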
3340 
3341 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
3342  SelectionDAG &DAG) const {
3343  // We only handle constant sizes here to allow non-entry block, static sized
3344  // allocas. A truly dynamic value is more difficult to support because we
3345  // don't know if the size value is uniform or not. If the size isn't uniform,
3346  // we would need to do a wave reduction to get the maximum size to know how
3347  // much to increment the uniform stack pointer.
3348  SDValue Size = Op.getOperand(1);
3349  if (isa<ConstantSDNode>(Size))
3350  return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3351 
3352  return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
3353 }
3354 
3355 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
3356  const MachineFunction &MF) const {
3357  Register Reg = StringSwitch<Register>(RegName)
3358  .Case("m0", AMDGPU::M0)
3359  .Case("exec", AMDGPU::EXEC)
3360  .Case("exec_lo", AMDGPU::EXEC_LO)
3361  .Case("exec_hi", AMDGPU::EXEC_HI)
3362  .Case("flat_scratch", AMDGPU::FLAT_SCR)
3363  .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
3364  .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
3365  .Default(Register());
3366 
3367  if (Reg == AMDGPU::NoRegister) {
3368  report_fatal_error(Twine("invalid register name \""
3369  + StringRef(RegName) + "\"."));
3370 
3371  }
3372 
3373  if (!Subtarget->hasFlatScrRegister() &&
3374  Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
3375  report_fatal_error(Twine("invalid register \""
3376  + StringRef(RegName) + "\" for subtarget."));
3377  }
3378 
3379  switch (Reg) {
3380  case AMDGPU::M0:
3381  case AMDGPU::EXEC_LO:
3382  case AMDGPU::EXEC_HI:
3383  case AMDGPU::FLAT_SCR_LO:
3384  case AMDGPU::FLAT_SCR_HI:
3385  if (VT.getSizeInBits() == 32)
3386  return Reg;
3387  break;
3388  case AMDGPU::EXEC:
3389  case AMDGPU::FLAT_SCR:
3390  if (VT.getSizeInBits() == 64)
3391  return Reg;
3392  break;
3393  default:
3394  llvm_unreachable("missing register type checking");
3395  }
3396 
3397  report_fatal_error(Twine("invalid type for register \""
3398  + StringRef(RegName) + "\"."));
3399 }
3400 
3401 // If kill is not the last instruction, split the block so kill is always a
3402 // proper terminator.
3403 MachineBasicBlock *
3404 SITargetLowering::splitKillBlock(MachineInstr &MI,
3405  MachineBasicBlock *BB) const {
3406  MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
3407  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3408  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
3409  return SplitBB;
3410 }
3411 
3412 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
3413 // \p MI will be the only instruction in the loop body block. Otherwise, it will
3414 // be the first instruction in the remainder block.
3415 //
3416 /// \returns { LoopBody, Remainder }
3417 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
3418 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
3419  MachineFunction *MF = MBB.getParent();
3420  MachineBasicBlock::iterator I(&MI);
3421 
3422  // To insert the loop we need to split the block. Move everything after this
3423  // point to a new block, and insert a new empty block between the two.
3424  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
3425  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
3426  MachineFunction::iterator MBBI(MBB);
3427  ++MBBI;
3428 
3429  MF->insert(MBBI, LoopBB);
3430  MF->insert(MBBI, RemainderBB);
3431 
3432  LoopBB->addSuccessor(LoopBB);
3433  LoopBB->addSuccessor(RemainderBB);
3434 
3435  // Move the rest of the block into a new block.
3436  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3437 
3438  if (InstInLoop) {
3439  auto Next = std::next(I);
3440 
3441  // Move instruction to loop body.
3442  LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
3443 
3444  // Move the rest of the block.
3445  RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
3446  } else {
3447  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3448  }
3449 
3450  MBB.addSuccessor(LoopBB);
3451 
3452  return std::make_pair(LoopBB, RemainderBB);
3453 }
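// [Editorial aside -- not part of this file.] Resulting CFG of the split, for
// reference; MBB keeps everything before MI and falls through to LoopBB,
// which branches back to itself until the loop is done:
//
//   MBB --> LoopBB --> RemainderBB --> (original successors of MBB)
//            ^  |
//            +--+   (LoopBB is its own successor)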
3454 
3455 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
3456 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
3457  MachineBasicBlock *MBB = MI.getParent();
3458  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3459  auto I = MI.getIterator();
3460  auto E = std::next(I);
3461 
3462  BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
3463  .addImm(0);
3464 
3465  MIBundleBuilder Bundler(*MBB, I, E);
3466  finalizeBundle(*MBB, Bundler.begin());
3467 }
3468 
3469 MachineBasicBlock *
3470 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
3471  MachineBasicBlock *BB) const {
3472  const DebugLoc &DL = MI.getDebugLoc();
3473 
3474  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3475 
3476  MachineBasicBlock *LoopBB;
3477  MachineBasicBlock *RemainderBB;
3478  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3479 
3480  // Apparently kill flags are only valid if the def is in the same block?
3481  if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
3482  Src->setIsKill(false);
3483 
3484  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
3485 
3486  MachineBasicBlock::iterator I = LoopBB->end();
3487 
3488  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
3489  AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
3490 
3491  // Clear TRAP_STS.MEM_VIOL
3492  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
3493  .addImm(0)
3494  .addImm(EncodedReg);
3495 
3496  bundleInstWithWaitcnt(MI);
3497 
3498  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3499 
3500  // Load and check TRAP_STS.MEM_VIOL
3501  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
3502  .addImm(EncodedReg);
3503 
3504  // FIXME: Do we need to use an isel pseudo that may clobber scc?
3505  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
3506  .addReg(Reg, RegState::Kill)
3507  .addImm(0);
3508  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3509  .addMBB(LoopBB);
3510 
3511  return RemainderBB;
3512 }
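// [Editorial aside -- illustrative sketch, not part of this file.] The loop
// built above, in rough pseudocode: retry the GWS instruction until it
// completes without tripping the memory-violation trap bit.
//
//   do {
//     TRAP_STS.MEM_VIOL = 0;                 // s_setreg_imm32_b32
//     <GWS instruction ; s_waitcnt 0>        // bundled, nothing scheduled between
//     violated = (TRAP_STS.MEM_VIOL != 0);   // s_getreg_b32 + s_cmp_lg_u32
//   } while (violated);                      // s_cbranch_scc1 back to the loop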
3513 
3514 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
3515 // wavefront. If the value is uniform and just happens to be in a VGPR, this
3516 // will only do one iteration. In the worst case, this will loop 64 times.
3517 //
3518 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
3519 static MachineBasicBlock::iterator
3520 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
3521  MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
3522  const DebugLoc &DL, const MachineOperand &Idx,
3523  unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
3524  unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
3525  Register &SGPRIdxReg) {
3526 
3527  MachineFunction *MF = OrigBB.getParent();
3528  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3529  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3530  MachineBasicBlock::iterator I = LoopBB.begin();
3531 
3532  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3533  Register PhiExec = MRI.createVirtualRegister(BoolRC);
3534  Register NewExec = MRI.createVirtualRegister(BoolRC);
3535  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3536  Register CondReg = MRI.createVirtualRegister(BoolRC);
3537 
3538  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
3539  .addReg(InitReg)
3540  .addMBB(&OrigBB)
3541  .addReg(ResultReg)
3542  .addMBB(&LoopBB);
3543 
3544  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
3545  .addReg(InitSaveExecReg)
3546  .addMBB(&OrigBB)
3547  .addReg(NewExec)
3548  .addMBB(&LoopBB);
3549 
3550  // Read the next variant <- also loop target.
3551  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
3552  .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
3553 
3554  // Compare the just read M0 value to all possible Idx values.
3555  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
3556  .addReg(CurrentIdxReg)
3557  .addReg(Idx.getReg(), 0, Idx.getSubReg());
3558 
3559  // Update EXEC, save the original EXEC value to VCC.
3560  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
3561  : AMDGPU::S_AND_SAVEEXEC_B64),
3562  NewExec)
3563  .addReg(CondReg, RegState::Kill);
3564 
3565  MRI.setSimpleHint(NewExec, CondReg);
3566 
3567  if (UseGPRIdxMode) {
3568  if (Offset == 0) {
3569  SGPRIdxReg = CurrentIdxReg;
3570  } else {
3571  SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3572  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
3573  .addReg(CurrentIdxReg, RegState::Kill)
3574  .addImm(Offset);
3575  }
3576  } else {
3577  // Move index from VCC into M0
3578  if (Offset == 0) {
3579  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3580  .addReg(CurrentIdxReg, RegState::Kill);
3581  } else {
3582  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3583  .addReg(CurrentIdxReg, RegState::Kill)
3584  .addImm(Offset);
3585  }
3586  }
3587 
3588  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3589  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3590  MachineInstr *InsertPt =
3591  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
3592  : AMDGPU::S_XOR_B64_term), Exec)
3593  .addReg(Exec)
3594  .addReg(NewExec);
3595 
3596  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
3597  // s_cbranch_scc0?
3598 
3599  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
3600  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
3601  .addMBB(&LoopBB);
3602 
3603  return InsertPt->getIterator();
3604 }
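// [Editorial aside -- illustrative sketch, not part of this file.] A scalar
// model of the waterfall loop above, simulating per-lane semantics on a
// 64-bit EXEC mask (names and types are ours):
#include <cstdint>
static void waterfall(uint64_t Exec, const uint32_t Idx[64],
                      void (*RunWithUniformIdx)(uint32_t Index,
                                                uint64_t ActiveLanes)) {
  while (Exec != 0) {
    unsigned FirstLane = __builtin_ctzll(Exec);   // v_readfirstlane_b32
    uint32_t Current = Idx[FirstLane];
    uint64_t Match = 0;                           // v_cmp_eq_u32
    for (unsigned L = 0; L < 64; ++L)
      if (((Exec >> L) & 1) && Idx[L] == Current)
        Match |= 1ull << L;
    RunWithUniformIdx(Current, Match);            // body runs with EXEC == Match
    Exec &= ~Match;   // matched lanes masked out (s_and_saveexec + s_xor above)
  }
}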
3605 
3606 // This has slightly sub-optimal regalloc when the source vector is killed by
3607 // the read. The register allocator does not understand that the kill is
3608 // per-workitem, so the source is kept alive for the whole loop and we end up
3609 // not reusing a subregister from it, using one more VGPR than necessary. That
3610 // VGPR was saved when this was expanded after register allocation.
3611 static MachineBasicBlock::iterator
3612 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
3613  unsigned InitResultReg, unsigned PhiReg, int Offset,
3614  bool UseGPRIdxMode, Register &SGPRIdxReg) {
3615  MachineFunction *MF = MBB.getParent();
3616  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3617  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3618  MachineRegisterInfo &MRI = MF->getRegInfo();
3619  const DebugLoc &DL = MI.getDebugLoc();
3620  MachineBasicBlock::iterator I(&MI);
3621 
3622  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
3623  Register DstReg = MI.getOperand(0).getReg();
3624  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
3625  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
3626  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3627  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
3628 
3629  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3630 
3631  // Save the EXEC mask
3632  BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
3633  .addReg(Exec);
3634 
3635  MachineBasicBlock *LoopBB;
3636  MachineBasicBlock *RemainderBB;
3637  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
3638 
3639  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3640 
3641  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
3642  InitResultReg, DstReg, PhiReg, TmpExec,
3643  Offset, UseGPRIdxMode, SGPRIdxReg);
3644 
3645  MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
3647  ++MBBI;
3648  MF->insert(MBBI, LandingPad);
3649  LoopBB->removeSuccessor(RemainderBB);
3650  LandingPad->addSuccessor(RemainderBB);
3651  LoopBB->addSuccessor(LandingPad);
3652  MachineBasicBlock::iterator First = LandingPad->begin();
3653  BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
3654  .addReg(SaveExec);
3655 
3656  return InsPt;
3657 }
3658 
3659 // Returns subreg index, offset
3660 static std::pair<unsigned, int>
3661 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
3662  const TargetRegisterClass *SuperRC,
3663  unsigned VecReg,
3664  int Offset) {
3665  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
3666 
3667  // Skip out of bounds offsets, or else we would end up using an undefined
3668  // register.
3669  if (Offset >= NumElts || Offset < 0)
3670  return std::make_pair(AMDGPU::sub0, Offset);
3671 
3672  return std::make_pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
3673 }
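// [Editorial aside -- worked example, not part of this file.] Example results
// for a 128-bit (4 x 32-bit) super-register class:
//   Offset  2 -> (sub2, 0)   constant part folded into the subregister index
//   Offset  0 -> (sub0, 0)
//   Offset  7 -> (sub0, 7)   out of bounds, left for the dynamic index
//   Offset -1 -> (sub0, -1)  likewise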
3674 
3675 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
3676  MachineRegisterInfo &MRI, MachineInstr &MI,
3677  int Offset) {
3678  MachineBasicBlock *MBB = MI.getParent();
3679  const DebugLoc &DL = MI.getDebugLoc();
3681 
3682  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3683 
3684  assert(Idx->getReg() != AMDGPU::NoRegister);
3685 
3686  if (Offset == 0) {
3687  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
3688  } else {
3689  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3690  .add(*Idx)
3691  .addImm(Offset);
3692  }
3693 }
3694 
3695 static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
3696  MachineRegisterInfo &MRI, MachineInstr &MI,
3697  int Offset) {
3698  MachineBasicBlock *MBB = MI.getParent();
3699  const DebugLoc &DL = MI.getDebugLoc();
3701 
3702  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3703 
3704  if (Offset == 0)
3705  return Idx->getReg();
3706 
3707  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3708  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
3709  .add(*Idx)
3710  .addImm(Offset);
3711  return Tmp;
3712 }
3713 
3714 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
3715  MachineBasicBlock &MBB,
3716  const GCNSubtarget &ST) {
3717  const SIInstrInfo *TII = ST.getInstrInfo();
3718  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3719  MachineFunction *MF = MBB.getParent();
3720  MachineRegisterInfo &MRI = MF->getRegInfo();
3721 
3722  Register Dst = MI.getOperand(0).getReg();
3723  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3724  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3725  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3726 
3727  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3728  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3729 
3730  unsigned SubReg;
3731  std::tie(SubReg, Offset)
3732  = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3733 
3734  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3735 
3736  // Check for an SGPR index.
3737  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3738  MachineBasicBlock::iterator I(&MI);
3739  const DebugLoc &DL = MI.getDebugLoc();
3740 
3741  if (UseGPRIdxMode) {
3742  // TODO: Look at the uses to avoid the copy. This may require rescheduling
3743  // to avoid interfering with other uses, so probably requires a new
3744  // optimization pass.
3745  Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
3746 
3747  const MCInstrDesc &GPRIDXDesc =
3748  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3749  BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3750  .addReg(SrcReg)
3751  .addReg(Idx)
3752  .addImm(SubReg);
3753  } else {
3754  setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
3755 
3756  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3757  .addReg(SrcReg, 0, SubReg)
3758  .addReg(SrcReg, RegState::Implicit);
3759  }
3760 
3761  MI.eraseFromParent();
3762 
3763  return &MBB;
3764  }
3765 
3766  // Control flow needs to be inserted if indexing with a VGPR.
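  // loadM0FromVGPR builds a waterfall loop: each iteration picks one index
  // value with V_READFIRSTLANE, executes the indexed move for the lanes
  // sharing that value, and masks them out of EXEC until all lanes are done.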
3767  const DebugLoc &DL = MI.getDebugLoc();
3768  MachineBasicBlock::iterator I(&MI);
3769 
3770  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3771  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3772 
3773  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3774 
3775  Register SGPRIdxReg;
3776  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
3777  UseGPRIdxMode, SGPRIdxReg);
3778 
3779  MachineBasicBlock *LoopBB = InsPt->getParent();
3780 
3781  if (UseGPRIdxMode) {
3782  const MCInstrDesc &GPRIDXDesc =
3783  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3784 
3785  BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3786  .addReg(SrcReg)
3787  .addReg(SGPRIdxReg)
3788  .addImm(SubReg);
3789  } else {
3790  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3791  .addReg(SrcReg, 0, SubReg)
3792  .addReg(SrcReg, RegState::Implicit);
3793  }
3794 
3795  MI.eraseFromParent();
3796 
3797  return LoopBB;
3798 }
3799 
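// Expand SI_INDIRECT_DST_*: a dynamic insert of a 32-bit element into a
// vector register, mirroring the extract expansion above.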
3800 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3801  MachineBasicBlock &MBB,
3802  const GCNSubtarget &ST) {
3803  const SIInstrInfo *TII = ST.getInstrInfo();
3804  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3805  MachineFunction *MF = MBB.getParent();
3806  MachineRegisterInfo &MRI = MF->getRegInfo();
3807 
3808  Register Dst = MI.getOperand(0).getReg();
3809  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3810  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3811  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3812  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3813  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3814  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3815 
3816  // This can be an immediate, but will be folded later.
3817  assert(Val->getReg());
3818 
3819  unsigned SubReg;
3820  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3821  SrcVec->getReg(),
3822  Offset);
3823  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3824 
3825  if (Idx->getReg() == AMDGPU::NoRegister) {
3826  MachineBasicBlock::iterator I(&MI);
3827  const DebugLoc &DL = MI.getDebugLoc();
3828 
3829  assert(Offset == 0);
3830 
3831  BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3832  .add(*SrcVec)
3833  .add(*Val)
3834  .addImm(SubReg);
3835 
3836  MI.eraseFromParent();
3837  return &MBB;
3838  }
3839 
3840  // Check for an SGPR index.
3841  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3842  MachineBasicBlock::iterator I(&MI);
3843  const DebugLoc &DL = MI.getDebugLoc();
3844 
3845  if (UseGPRIdxMode) {
3846  Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
3847 
3848  const MCInstrDesc &GPRIDXDesc =
3849  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3850  BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3851  .addReg(SrcVec->getReg())
3852  .add(*Val)
3853  .addReg(Idx)
3854  .addImm(SubReg);
3855  } else {
3856  setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
3857 
3858  const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3859  TRI.getRegSizeInBits(*VecRC), 32, false);
3860  BuildMI(MBB, I, DL, MovRelDesc, Dst)
3861  .addReg(SrcVec->getReg())
3862  .add(*Val)
3863  .addImm(SubReg);
3864  }
3865  MI.eraseFromParent();
3866  return &MBB;
3867  }
3868 
3869  // Control flow needs to be inserted if indexing with a VGPR.
3870  if (Val->isReg())
3871  MRI.clearKillFlags(Val->getReg());
3872 
3873  const DebugLoc &DL = MI.getDebugLoc();
3874 
3875  Register PhiReg = MRI.createVirtualRegister(VecRC);
3876 
3877  Register SGPRIdxReg;
3878  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
3879  UseGPRIdxMode, SGPRIdxReg);
3880  MachineBasicBlock *LoopBB = InsPt->getParent();
3881 
3882  if (UseGPRIdxMode) {
3883  const MCInstrDesc &GPRIDXDesc =
3884  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3885 
3886  BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3887  .addReg(PhiReg)
3888  .add(*Val)
3889  .addReg(SGPRIdxReg)
3890  .addImm(AMDGPU::sub0);
3891  } else {
3892  const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3893  TRI.getRegSizeInBits(*VecRC), 32, false);
3894  BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
3895  .addReg(PhiReg)
3896  .add(*Val)
3897  .addImm(AMDGPU::sub0);
3898  }
3899 
3900  MI.eraseFromParent();
3901  return LoopBB;
3902 }
3903 
3904 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3905  MachineInstr &MI, MachineBasicBlock *BB) const {
3906 
3907  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3908  MachineFunction *MF = BB->getParent();
3909  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3910 
3911  switch (MI.getOpcode()) {
3912  case AMDGPU::S_UADDO_PSEUDO:
3913  case AMDGPU::S_USUBO_PSEUDO: {
3914  const DebugLoc &DL = MI.getDebugLoc();
3915  MachineOperand &Dest0 = MI.getOperand(0);
3916  MachineOperand &Dest1 = MI.getOperand(1);
3917  MachineOperand &Src0 = MI.getOperand(2);
3918  MachineOperand &Src1 = MI.getOperand(3);
3919 
3920  unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
3921  ? AMDGPU::S_ADD_I32
3922  : AMDGPU::S_SUB_I32;
3923  BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
3924 
3925  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
3926  .addImm(1)
3927  .addImm(0);
3928 
3929  MI.eraseFromParent();
3930  return BB;
3931  }
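  // There is no 64-bit scalar add/sub instruction, so split the operation
  // into 32-bit halves chained through SCC (S_ADD_U32 + S_ADDC_U32, or
  // S_SUB_U32 + S_SUBB_U32) and recombine the result with REG_SEQUENCE.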
3932  case AMDGPU::S_ADD_U64_PSEUDO:
3933  case AMDGPU::S_SUB_U64_PSEUDO: {
3934  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3935  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3936  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3937  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3938  const DebugLoc &DL = MI.getDebugLoc();
3939 
3940  MachineOperand &Dest = MI.getOperand(0);
3941  MachineOperand &Src0 = MI.getOperand(1);
3942  MachineOperand &Src1 = MI.getOperand(2);
3943 
3944  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3945  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3946 
3947  MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
3948  MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
3949  MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
3950  MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
3951 
3952  MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
3953  MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
3954  MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
3955  MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
3956 
3957  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3958 
3959  unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3960  unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3961  BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
3962  BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
3963  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3964  .addReg(DestSub0)
3965  .addImm(AMDGPU::sub0)
3966  .addReg(DestSub1)
3967  .addImm(AMDGPU::sub1);
3968  MI.eraseFromParent();
3969  return BB;
3970  }
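  // The VALU form is likewise split into 32-bit halves with an explicit
  // carry register (V_ADD_CO_U32 + V_ADDC_U32 or the subtract equivalents);
  // subtargets with V_LSHL_ADD_U64 can instead add in one instruction by
  // using a shift amount of zero.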
3971  case AMDGPU::V_ADD_U64_PSEUDO:
3972  case AMDGPU::V_SUB_U64_PSEUDO: {
3973  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3974  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3975  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3976  const DebugLoc &DL = MI.getDebugLoc();
3977 
3978  bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
3979 
3980  MachineOperand &Dest = MI.getOperand(0);
3981  MachineOperand &Src0 = MI.getOperand(1);
3982  MachineOperand &Src1 = MI.getOperand(2);
3983 
3984  if (IsAdd && ST.hasLshlAddB64()) {
3985  auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
3986  Dest.getReg())
3987  .add(Src0)
3988  .addImm(0)
3989  .add(Src1);
3990  TII->legalizeOperands(*Add);
3991  MI.eraseFromParent();
3992  return BB;
3993  }
3994 
3995  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
3996 
3997  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3998  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3999 
4000  Register CarryReg = MRI.createVirtualRegister(CarryRC);
4001  Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
4002 
4003  const TargetRegisterClass *Src0RC = Src0.isReg()
4004  ? MRI.getRegClass(Src0.getReg())
4005  : &AMDGPU::VReg_64RegClass;
4006  const TargetRegisterClass *Src1RC = Src1.isReg()
4007  ? MRI.getRegClass(Src1.getReg())
4008  : &AMDGPU::VReg_64RegClass;
4009 
4010  const TargetRegisterClass *Src0SubRC =
4011  TRI->getSubRegClass(Src0RC, AMDGPU::sub0);
4012  const TargetRegisterClass *Src1SubRC =
4013  TRI->getSubRegClass(Src1RC, AMDGPU::sub1);
4014 
4015  MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
4016  MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
4017  MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
4018  MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
4019 
4020  MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
4021  MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
4022  MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
4023  MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
4024 
4025  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
4026  MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4027  .addReg(CarryReg, RegState::Define)
4028  .add(SrcReg0Sub0)
4029  .add(SrcReg1Sub0)
4030  .addImm(0); // clamp bit
4031 
4032  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4033  MachineInstr *HiHalf =
4034  BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4035  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4036  .add(SrcReg0Sub1)
4037  .add(SrcReg1Sub1)
4038  .addReg(CarryReg, RegState::Kill)
4039  .addImm(0); // clamp bit
4040 
4041  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4042  .addReg(DestSub0)
4043  .addImm(AMDGPU::sub0)
4044  .addReg(DestSub1)
4045  .addImm(AMDGPU::sub1);
4046  TII->legalizeOperands(*LoHalf);
4047  TII->legalizeOperands(*HiHalf);
4048  MI.eraseFromParent();
4049  return BB;
4050  }
4051  case AMDGPU::S_ADD_CO_PSEUDO:
4052  case AMDGPU::S_SUB_CO_PSEUDO: {
4053  // This pseudo can only be selected from a uniform add/subcarry
4054  // node, so all of its VGPR operands are assumed to be splat
4055  // vectors.
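  // Any VGPR operand is first copied to an SGPR with V_READFIRSTLANE_B32;
  // the carry-in is compared against zero to set SCC for S_ADDC/S_SUBB, and
  // the carry-out is rematerialized with S_CSELECT.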
4056  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4057  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4058  const SIRegisterInfo *TRI = ST.getRegisterInfo();
4059  MachineBasicBlock::iterator MII = MI;
4060  const DebugLoc &DL = MI.getDebugLoc();
4061  MachineOperand &Dest = MI.getOperand(0);
4062  MachineOperand &CarryDest = MI.getOperand(1);
4063  MachineOperand &Src0 = MI.getOperand(2);
4064  MachineOperand &Src1 = MI.getOperand(3);
4065  MachineOperand &Src2 = MI.getOperand(4);
4066  unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
4067  ? AMDGPU::S_ADDC_U32
4068  : AMDGPU::S_SUBB_U32;
4069  if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
4070  Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4071  BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
4072  .addReg(Src0.getReg());
4073  Src0.setReg(RegOp0);
4074  }
4075  if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
4076  Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4077  BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
4078  .addReg(Src1.getReg());
4079  Src1.setReg(RegOp1);
4080  }
4081  Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4082  if (TRI->isVectorRegister(MRI, Src2.getReg())) {
4083  BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
4084  .addReg(Src2.getReg());
4085  Src2.setReg(RegOp2);
4086  }
4087 
4088  const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
4089  unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
4090  assert(WaveSize == 64 || WaveSize == 32);
4091 
4092  if (WaveSize == 64) {
4093  if (ST.hasScalarCompareEq64()) {
4094  BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
4095  .addReg(Src2.getReg())
4096  .addImm(0);
4097  } else {
4098  const TargetRegisterClass *SubRC =
4099  TRI->getSubRegClass(Src2RC, AMDGPU::sub0);
4100  MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
4101  MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
4102  MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
4103  MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
4104  Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4105 
4106  BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
4107  .add(Src2Sub0)
4108  .add(Src2Sub1);
4109 
4110  BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4111  .addReg(Src2_32, RegState::Kill)
4112  .addImm(0);
4113  }
4114  } else {
4115  BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMPK_LG_U32))
4116  .addReg(Src2.getReg())
4117  .addImm(0);
4118  }
4119 
4120  BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
4121 
4122  unsigned SelOpc =
4123  (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
4124 
4125  BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
4126  .addImm(-1)
4127  .addImm(0);
4128 
4129  MI.eraseFromParent();
4130  return BB;
4131  }
4132  case AMDGPU::SI_INIT_M0: {
4133  BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
4134  TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4135  .add(MI.getOperand(0));
4136  MI.eraseFromParent();
4137  return BB;
4138  }
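  // GET_GROUPSTATICSIZE resolves to the statically known LDS allocation
  // size of the function, materialized with S_MOV_B32.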
4139  case AMDGPU::GET_GROUPSTATICSIZE: {
4140  assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
4141  getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
4142  DebugLoc DL = MI.getDebugLoc();
4143  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
4144  .add(MI.getOperand(0))
4145  .addImm(MFI->getLDSSize());
4146  MI.eraseFromParent();
4147  return BB;
4148  }
4149  case AMDGPU::SI_INDIRECT_SRC_V1:
4150  case AMDGPU::SI_INDIRECT_SRC_V2:
4151  case AMDGPU::SI_INDIRECT_SRC_V4:
4152  case AMDGPU::SI_INDIRECT_SRC_V8:
4153  case AMDGPU::SI_INDIRECT_SRC_V16:
4154  case AMDGPU::SI_INDIRECT_SRC_V32:
4155  return emitIndirectSrc(MI, *BB, *getSubtarget());
4156  case AMDGPU::SI_INDIRECT_DST_V1:
4157  case AMDGPU::SI_INDIRECT_DST_V2:
4158  case AMDGPU::SI_INDIRECT_DST_V4:
4159  case AMDGPU::SI_INDIRECT_DST_V8:
4160  case AMDGPU::SI_INDIRECT_DST_V16:
4161  case AMDGPU::SI_INDIRECT_DST_V32:
4162  return emitIndirectDst(MI, *BB, *getSubtarget());
4163  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
4164  case AMDGPU::SI_KILL_I1_PSEUDO:
4165  return splitKillBlock(MI, BB);
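  // There is no 64-bit VALU select, so expand V_CNDMASK_B64_PSEUDO into two
  // V_CNDMASK_B32_e64 instructions on the sub0/sub1 halves and recombine
  // them with REG_SEQUENCE.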
4166  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
4167  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4168  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4169  const SIRegisterInfo *TRI = ST.getRegisterInfo();
4170 
4171  Register Dst = MI.getOperand(0).getReg();
4172  Register Src0 = MI.getOperand(1).getReg();
4173  Register Src1 = MI.getOperand(2).getReg();
4174  const DebugLoc &DL = MI.getDebugLoc();
4175  Register SrcCond = MI.getOperand(3).getReg();
4176 
4177  Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4178  Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4179  const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4180  Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
4181 
4182  BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
4183  .addReg(SrcCond);
4184  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
4185  .addImm(0)
4186  .addReg(Src0, 0, AMDGPU::sub0)
4187  .addImm(0)
4188  .addReg(Src1, 0, AMDGPU::sub0)
4189  .addReg(SrcCondCopy);
4190  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
4191  .addImm(0)
4192  .addReg(Src0, 0, AMDGPU::sub1)
4193  .addImm(0)
4194  .addReg(Src1, 0, AMDGPU::sub1)
4195  .addReg(SrcCondCopy);
4196 
4197  BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
4198  .addReg(DstLo)
4199  .addImm(AMDGPU::sub0)
4200  .addReg(DstHi)
4201  .addImm(AMDGPU::sub1);
4202  MI.eraseFromParent();
4203  return BB;
4204  }
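  // The branch direction is irrelevant for SI_BR_UNDEF, so emit
  // S_CBRANCH_SCC1 and mark the implicit SCC read as undef.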
4205  case AMDGPU::SI_BR_UNDEF: {
4206  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4207  const DebugLoc &DL = MI.getDebugLoc();
4208  MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4209  .add(MI.getOperand(0));
4210  Br->getOperand(1).setIsUndef(true); // read undef SCC
4211  MI.eraseFromParent();
4212  return BB;
4213  }
4214  case AMDGPU::ADJCALLSTACKUP:
4215  case AMDGPU::ADJCALLSTACKDOWN: {
4216  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4217  MachineInstrBuilder MIB(*MF, &MI);
4218  MIB.ad