SIISelLowering.cpp (LLVM 6.0.0svn)
1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Custom DAG lowering for SI
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #ifdef _MSC_VER
16 // Provide M_PI.
17 #define _USE_MATH_DEFINES
18 #endif
19 
20 #include "SIISelLowering.h"
21 #include "AMDGPU.h"
22 #include "AMDGPUIntrinsicInfo.h"
23 #include "AMDGPUSubtarget.h"
24 #include "AMDGPUTargetMachine.h"
25 #include "SIDefines.h"
26 #include "SIInstrInfo.h"
27 #include "SIMachineFunctionInfo.h"
28 #include "SIRegisterInfo.h"
29 #include "Utils/AMDGPUBaseInfo.h"
30 #include "llvm/ADT/APFloat.h"
31 #include "llvm/ADT/APInt.h"
32 #include "llvm/ADT/ArrayRef.h"
33 #include "llvm/ADT/BitVector.h"
34 #include "llvm/ADT/SmallVector.h"
35 #include "llvm/ADT/Statistic.h"
36 #include "llvm/ADT/StringRef.h"
37 #include "llvm/ADT/StringSwitch.h"
38 #include "llvm/ADT/Twine.h"
39 #include "llvm/CodeGen/Analysis.h"
58 #include "llvm/IR/Constants.h"
59 #include "llvm/IR/DataLayout.h"
60 #include "llvm/IR/DebugLoc.h"
61 #include "llvm/IR/DerivedTypes.h"
62 #include "llvm/IR/DiagnosticInfo.h"
63 #include "llvm/IR/Function.h"
64 #include "llvm/IR/GlobalValue.h"
65 #include "llvm/IR/InstrTypes.h"
66 #include "llvm/IR/Instruction.h"
67 #include "llvm/IR/Instructions.h"
68 #include "llvm/IR/IntrinsicInst.h"
69 #include "llvm/IR/Type.h"
70 #include "llvm/Support/Casting.h"
71 #include "llvm/Support/CodeGen.h"
73 #include "llvm/Support/Compiler.h"
75 #include "llvm/Support/KnownBits.h"
78 #include <cassert>
79 #include <cmath>
80 #include <cstdint>
81 #include <iterator>
82 #include <tuple>
83 #include <utility>
84 #include <vector>
85 
86 using namespace llvm;
87 
88 #define DEBUG_TYPE "si-lower"
89 
90 STATISTIC(NumTailCalls, "Number of tail calls");
91 
92 static cl::opt<bool> EnableVGPRIndexMode(
93  "amdgpu-vgpr-index-mode",
94  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
95  cl::init(false));
96 
97 static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
98  "amdgpu-frame-index-zero-bits",
99  cl::desc("High bits of frame index assumed to be zero"),
100  cl::init(5),
101  cl::ReallyHidden);
102 
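// Scan the SGPR_32 register class and return the first SGPR not yet allocated
// by the calling-convention state; used below for system SGPRs that have no
// fixed assignment.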
103 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
104  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
105  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
106  if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
107  return AMDGPU::SGPR0 + Reg;
108  }
109  }
110  llvm_unreachable("Cannot allocate sgpr");
111 }
112 
113 SITargetLowering::SITargetLowering(const TargetMachine &TM,
114  const SISubtarget &STI)
115  : AMDGPUTargetLowering(TM, STI) {
116  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
117  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
118 
119  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
120  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
121 
122  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
123  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
124  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
125 
126  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
127  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
128 
129  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
130  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
131 
132  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
133  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
134 
135  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
136  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
137 
138  if (Subtarget->has16BitInsts()) {
139  addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
140  addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
141  }
142 
143  if (Subtarget->hasVOP3PInsts()) {
144  addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
145  addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
146  }
147 
149 
150  // We need to custom lower vector stores from local memory
156 
162 
173 
177 
182 
188 
193 
196 
204 
209 
211 
215 
222 
225 
228 
229  // We only support LOAD/STORE and vector manipulation ops for vectors
230  // with > 4 elements.
233  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
234  switch (Op) {
235  case ISD::LOAD:
236  case ISD::STORE:
237  case ISD::BUILD_VECTOR:
238  case ISD::BITCAST:
244  break;
245  case ISD::CONCAT_VECTORS:
247  break;
248  default:
250  break;
251  }
252  }
253  }
254 
255  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
256  // is expanded to avoid having two separate loops in case the index is a VGPR.
257 
258  // Most operations are naturally 32-bit vector operations. We only support
259  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
260  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
263 
266 
269 
272  }
273 
278 
279  // Avoid stack access for these.
280  // TODO: Generalize to more vector types.
285 
286  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
287  // and output demarshalling
290 
291  // We can't return success/failure, only the old value,
292  // so let LLVM add the comparison.
295 
296  if (getSubtarget()->hasFlatAddressSpace()) {
299  }
300 
303 
304  // This is s_memtime on SI and s_memrealtime on VI.
308 
311 
316  }
317 
319 
324 
325  if (Subtarget->has16BitInsts()) {
327 
330 
333 
336 
339 
344 
347 
352 
354 
356 
358 
360 
365 
370 
371  // F16 - Constant Actions.
373 
374  // F16 - Load/Store Actions.
379 
380  // F16 - VOP1 Actions.
389 
390  // F16 - VOP2 Actions.
396 
397  // F16 - VOP3 Actions.
399  if (!Subtarget->hasFP16Denormals())
401  }
402 
403  if (Subtarget->hasVOP3PInsts()) {
404  for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
405  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
406  switch (Op) {
407  case ISD::LOAD:
408  case ISD::STORE:
409  case ISD::BUILD_VECTOR:
410  case ISD::BITCAST:
416  break;
417  case ISD::CONCAT_VECTORS:
419  break;
420  default:
422  break;
423  }
424  }
425  }
426 
427  // XXX - Do these do anything? Vector constants turn into build_vector.
430 
435 
440 
451 
462 
469 
470  // This isn't really legal, but this avoids the legalizer unrolling it (and
471  // allows matching fneg (fabs x) patterns)
473 
476 
481  } else {
484  }
485 
488  }
489 
513 
514  // All memory operations. Some folding on the pointer operand is done to help
515  // matching the constant offsets in the addressing modes.
533 
535 }
536 
537 const SISubtarget *SITargetLowering::getSubtarget() const {
538  return static_cast<const SISubtarget *>(Subtarget);
539 }
540 
541 //===----------------------------------------------------------------------===//
542 // TargetLowering queries
543 //===----------------------------------------------------------------------===//
544 
546  // SI has some legal vector types, but no legal vector operations. Say no
547  // shuffles are legal in order to prefer scalarizing some vector operations.
548  return false;
549 }
550 
552  const CallInst &CI,
553  unsigned IntrID) const {
554  switch (IntrID) {
555  case Intrinsic::amdgcn_atomic_inc:
556  case Intrinsic::amdgcn_atomic_dec: {
558  Info.memVT = MVT::getVT(CI.getType());
559  Info.ptrVal = CI.getOperand(0);
560  Info.align = 0;
561 
562  const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
563  Info.vol = !Vol || !Vol->isZero();
564  Info.readMem = true;
565  Info.writeMem = true;
566  return true;
567  }
568  default:
569  return false;
570  }
571 }
572 
575  Type *&AccessTy) const {
576  switch (II->getIntrinsicID()) {
577  case Intrinsic::amdgcn_atomic_inc:
578  case Intrinsic::amdgcn_atomic_dec: {
579  Value *Ptr = II->getArgOperand(0);
580  AccessTy = II->getType();
581  Ops.push_back(Ptr);
582  return true;
583  }
584  default:
585  return false;
586  }
587 }
588 
589 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
590  if (!Subtarget->hasFlatInstOffsets()) {
591  // Flat instructions do not have offsets, and only have the register
592  // address.
593  return AM.BaseOffs == 0 && AM.Scale == 0;
594  }
595 
596  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
597  // the sign bit is ignored and is treated as a 12-bit unsigned offset.
598 
599  // Just r + i
600  return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
601 }
602 
603 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
605  return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
606 
608  // Assume that we will use FLAT for all global memory accesses
609  // on VI.
610  // FIXME: This assumption is currently wrong. On VI we still use
611  // MUBUF instructions for the r + i addressing mode. As currently
612  // implemented, the MUBUF instructions only work on buffer < 4GB.
613  // It may be possible to support > 4GB buffers with MUBUF instructions,
614  // by setting the stride value in the resource descriptor which would
615  // increase the size limit to (stride * 4GB). However, this is risky,
616  // because it has never been validated.
617  return isLegalFlatAddressingMode(AM);
618  }
619 
620  return isLegalMUBUFAddressingMode(AM);
621 }
622 
623 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
624  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
625  // additionally can do r + r + i with addr64. 32-bit has more addressing
626  // mode options. Depending on the resource constant, it can also do
627  // (i64 r0) + (i32 r1) * (i14 i).
628  //
629  // Private arrays end up using a scratch buffer most of the time, so also
630  // assume those use MUBUF instructions. Scratch loads / stores are currently
631  // implemented as mubuf instructions with the offen bit set, so they are
632  // slightly different from the normal addr64 mode.
633  if (!isUInt<12>(AM.BaseOffs))
634  return false;
635 
636  // FIXME: Since we can split immediate into soffset and immediate offset,
637  // would it make sense to allow any immediate?
638 
639  switch (AM.Scale) {
640  case 0: // r + i or just i, depending on HasBaseReg.
641  return true;
642  case 1:
643  return true; // We have r + r or r + i.
644  case 2:
645  if (AM.HasBaseReg) {
646  // Reject 2 * r + r.
647  return false;
648  }
649 
650  // Allow 2 * r as r + r
651  // Or 2 * r + i is allowed as r + r + i.
652  return true;
653  default: // Don't allow n * r
654  return false;
655  }
656 }
657 
659  const AddrMode &AM, Type *Ty,
660  unsigned AS, Instruction *I) const {
661  // No global is ever allowed as a base.
662  if (AM.BaseGV)
663  return false;
664 
665  if (AS == AMDGPUASI.GLOBAL_ADDRESS)
666  return isLegalGlobalAddressingMode(AM);
667 
668  if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
669  // If the offset isn't a multiple of 4, it probably isn't going to be
670  // correctly aligned.
671  // FIXME: Can we get the real alignment here?
672  if (AM.BaseOffs % 4 != 0)
673  return isLegalMUBUFAddressingMode(AM);
674 
675  // There are no SMRD extloads, so if we have to do a small type access we
676  // will use a MUBUF load.
677  // FIXME?: We also need to do this if unaligned, but we don't know the
678  // alignment here.
679  if (DL.getTypeStoreSize(Ty) < 4)
680  return isLegalGlobalAddressingMode(AM);
681 
683  // SMRD instructions have an 8-bit, dword offset on SI.
684  if (!isUInt<8>(AM.BaseOffs / 4))
685  return false;
687  // On CI+, this can also be a 32-bit literal constant offset. If it fits
688  // in 8-bits, it can use a smaller encoding.
689  if (!isUInt<32>(AM.BaseOffs / 4))
690  return false;
692  // On VI, these use the SMEM format and the offset is 20-bit in bytes.
693  if (!isUInt<20>(AM.BaseOffs))
694  return false;
695  } else
696  llvm_unreachable("unhandled generation");
697 
698  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
699  return true;
700 
701  if (AM.Scale == 1 && AM.HasBaseReg)
702  return true;
703 
704  return false;
705 
706  } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
707  return isLegalMUBUFAddressingMode(AM);
708  } else if (AS == AMDGPUASI.LOCAL_ADDRESS ||
709  AS == AMDGPUASI.REGION_ADDRESS) {
710  // Basic, single offset DS instructions allow a 16-bit unsigned immediate
711  // field.
712  // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
713  // an 8-bit dword offset but we don't know the alignment here.
714  if (!isUInt<16>(AM.BaseOffs))
715  return false;
716 
717  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
718  return true;
719 
720  if (AM.Scale == 1 && AM.HasBaseReg)
721  return true;
722 
723  return false;
724  } else if (AS == AMDGPUASI.FLAT_ADDRESS ||
725  AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) {
726  // For an unknown address space, this usually means that this is for some
727  // reason being used for pure arithmetic, and not based on some addressing
728  // computation. We don't have instructions that compute pointers with any
729  // addressing modes, so treat them as having no offset like flat
730  // instructions.
731  return isLegalFlatAddressingMode(AM);
732  } else {
733  llvm_unreachable("unhandled address space");
734  }
735 }
736 
738  const SelectionDAG &DAG) const {
739  if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
740  return (MemVT.getSizeInBits() <= 4 * 32);
741  } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
742  unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
743  return (MemVT.getSizeInBits() <= MaxPrivateBits);
744  } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
745  return (MemVT.getSizeInBits() <= 2 * 32);
746  }
747  return true;
748 }
749 
751  unsigned AddrSpace,
752  unsigned Align,
753  bool *IsFast) const {
754  if (IsFast)
755  *IsFast = false;
756 
757  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
758  // which isn't a simple VT.
759  // Until MVT is extended to handle this, simply check for the size and
760  // rely on the condition below: allow accesses if the size is a multiple of 4.
761  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
762  VT.getStoreSize() > 16)) {
763  return false;
764  }
765 
766  if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
767  AddrSpace == AMDGPUASI.REGION_ADDRESS) {
768  // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
769  // aligned, 8 byte access in a single operation using ds_read2/write2_b32
770  // with adjacent offsets.
771  bool AlignedBy4 = (Align % 4 == 0);
772  if (IsFast)
773  *IsFast = AlignedBy4;
774 
775  return AlignedBy4;
776  }
777 
778  // FIXME: We have to be conservative here and assume that flat operations
779  // will access scratch. If we had access to the IR function, then we
780  // could determine if any private memory was used in the function.
782  (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
783  AddrSpace == AMDGPUASI.FLAT_ADDRESS)) {
784  return false;
785  }
786 
788  // If we have a uniform constant load, it still requires using a slow
789  // buffer instruction if unaligned.
790  if (IsFast) {
791  *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ?
792  (Align % 4 == 0) : true;
793  }
794 
795  return true;
796  }
797 
798  // Values smaller than a dword must be aligned.
799  if (VT.bitsLT(MVT::i32))
800  return false;
801 
802  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
803  // byte-address are ignored, thus forcing Dword alignment.
804  // This applies to private, global, and constant memory.
805  if (IsFast)
806  *IsFast = true;
807 
808  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
809 }
810 
811 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
812  unsigned SrcAlign, bool IsMemset,
813  bool ZeroMemset,
814  bool MemcpyStrSrc,
815  MachineFunction &MF) const {
816  // FIXME: Should account for address space here.
817 
818  // The default fallback uses the private pointer size as a guess for a type to
819  // use. Make sure we switch these to 64-bit accesses.
820 
821  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
822  return MVT::v4i32;
823 
824  if (Size >= 8 && DstAlign >= 4)
825  return MVT::v2i32;
826 
827  // Use the default.
828  return MVT::Other;
829 }
830 
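// Global, flat and constant pointers use the same representation, so
// isNoopAddrSpaceCast below treats casts between any of them as free.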
831 static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
832  return AS == AMDGPUASI.GLOBAL_ADDRESS ||
833  AS == AMDGPUASI.FLAT_ADDRESS ||
834  AS == AMDGPUASI.CONSTANT_ADDRESS;
835 }
836 
838  unsigned DestAS) const {
839  return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) &&
841 }
842 
844  const MemSDNode *MemNode = cast<MemSDNode>(N);
845  const Value *Ptr = MemNode->getMemOperand()->getValue();
846  const Instruction *I = dyn_cast<Instruction>(Ptr);
847  return I && I->getMetadata("amdgpu.noclobber");
848 }
849 
851  unsigned DestAS) const {
852  // Flat -> private/local is a simple truncate.
853  // Flat -> global is a no-op.
854  if (SrcAS == AMDGPUASI.FLAT_ADDRESS)
855  return true;
856 
857  return isNoopAddrSpaceCast(SrcAS, DestAS);
858 }
859 
861  const MemSDNode *MemNode = cast<MemSDNode>(N);
862 
863  return AMDGPU::isUniformMMO(MemNode->getMemOperand());
864 }
865 
868  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
869  return TypeSplitVector;
870 
872 }
873 
875  Type *Ty) const {
876  // FIXME: Could be smarter if called for vector constants.
877  return true;
878 }
879 
881  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
882  switch (Op) {
883  case ISD::LOAD:
884  case ISD::STORE:
885 
886  // These operations are done with 32-bit instructions anyway.
887  case ISD::AND:
888  case ISD::OR:
889  case ISD::XOR:
890  case ISD::SELECT:
891  // TODO: Extensions?
892  return true;
893  default:
894  return false;
895  }
896  }
897 
898  // SimplifySetCC uses this function to determine whether or not it should
899  // create setcc with i1 operands. We don't have instructions for i1 setcc.
900  if (VT == MVT::i1 && Op == ISD::SETCC)
901  return false;
902 
904 }
905 
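// Compute a pointer into the kernarg segment: copy the preloaded kernarg
// segment pointer out of its input register and add the byte offset of the
// requested argument.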
906 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
907  const SDLoc &SL,
908  SDValue Chain,
909  uint64_t Offset) const {
910  const DataLayout &DL = DAG.getDataLayout();
913 
914  const ArgDescriptor *InputPtrReg;
915  const TargetRegisterClass *RC;
916 
917  std::tie(InputPtrReg, RC)
919 
922  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
923  MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
924 
925  return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
926  DAG.getConstant(Offset, SL, PtrVT));
927 }
928 
929 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
930  const SDLoc &SL) const {
932  uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
933  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
934 }
935 
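// Convert a value loaded with its in-memory type MemVT to the IR-level type
// VT, inserting assert-ext nodes for sext/zext arguments and the appropriate
// extend, truncate or FP conversion otherwise.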
936 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
937  const SDLoc &SL, SDValue Val,
938  bool Signed,
939  const ISD::InputArg *Arg) const {
940  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
941  VT.bitsLT(MemVT)) {
942  unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
943  Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
944  }
945 
946  if (MemVT.isFloatingPoint())
947  Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
948  else if (Signed)
949  Val = DAG.getSExtOrTrunc(Val, SL, VT);
950  else
951  Val = DAG.getZExtOrTrunc(Val, SL, VT);
952 
953  return Val;
954 }
955 
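// Lower a kernel argument passed in the kernarg segment: build the pointer,
// emit a load with the ABI type alignment, and convert the result to the
// expected argument type.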
956 SDValue SITargetLowering::lowerKernargMemParameter(
957  SelectionDAG &DAG, EVT VT, EVT MemVT,
958  const SDLoc &SL, SDValue Chain,
959  uint64_t Offset, bool Signed,
960  const ISD::InputArg *Arg) const {
961  const DataLayout &DL = DAG.getDataLayout();
962  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
964  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
965 
966  unsigned Align = DL.getABITypeAlignment(Ty);
967 
968  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
969  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
973 
974  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
975  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
976 }
977 
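// Lower an argument passed on the stack for a callable function: create a
// fixed frame object at the argument's offset and load it with the extension
// implied by the location info.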
978 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
979  const SDLoc &SL, SDValue Chain,
980  const ISD::InputArg &Arg) const {
982  MachineFrameInfo &MFI = MF.getFrameInfo();
983 
984  if (Arg.Flags.isByVal()) {
985  unsigned Size = Arg.Flags.getByValSize();
986  int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
987  return DAG.getFrameIndex(FrameIdx, MVT::i32);
988  }
989 
990  unsigned ArgOffset = VA.getLocMemOffset();
991  unsigned ArgSize = VA.getValVT().getStoreSize();
992 
993  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
994 
995  // Create load nodes to retrieve arguments from the stack.
996  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
997  SDValue ArgValue;
998 
999  // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
1001  MVT MemVT = VA.getValVT();
1002 
1003  switch (VA.getLocInfo()) {
1004  default:
1005  break;
1006  case CCValAssign::BCvt:
1007  MemVT = VA.getLocVT();
1008  break;
1009  case CCValAssign::SExt:
1010  ExtType = ISD::SEXTLOAD;
1011  break;
1012  case CCValAssign::ZExt:
1013  ExtType = ISD::ZEXTLOAD;
1014  break;
1015  case CCValAssign::AExt:
1016  ExtType = ISD::EXTLOAD;
1017  break;
1018  }
1019 
1020  ArgValue = DAG.getExtLoad(
1021  ExtType, SL, VA.getLocVT(), Chain, FIN,
1023  MemVT);
1024  return ArgValue;
1025 }
1026 
1027 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1028  const SIMachineFunctionInfo &MFI,
1029  EVT VT,
1031  const ArgDescriptor *Reg;
1032  const TargetRegisterClass *RC;
1033 
1034  std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1035  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1036 }
1037 
1039  CallingConv::ID CallConv,
1041  BitVector &Skipped,
1042  FunctionType *FType,
1043  SIMachineFunctionInfo *Info) {
1044  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1045  const ISD::InputArg &Arg = Ins[I];
1046 
1047  // First check if it's a PS input addr.
1048  if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
1049  !Arg.Flags.isByVal() && PSInputNum <= 15) {
1050 
1051  if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
1052  // We can safely skip PS inputs.
1053  Skipped.set(I);
1054  ++PSInputNum;
1055  continue;
1056  }
1057 
1058  Info->markPSInputAllocated(PSInputNum);
1059  if (Arg.Used)
1060  Info->markPSInputEnabled(PSInputNum);
1061 
1062  ++PSInputNum;
1063  }
1064 
1065  // Second, split vertices into their elements.
1066  if (Arg.VT.isVector()) {
1067  ISD::InputArg NewArg = Arg;
1068  NewArg.Flags.setSplit();
1069  NewArg.VT = Arg.VT.getVectorElementType();
1070 
1071  // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
1072  // three or five element vertex only needs three or five registers,
1073  // NOT four or eight.
1074  Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
1075  unsigned NumElements = ParamType->getVectorNumElements();
1076 
1077  for (unsigned J = 0; J != NumElements; ++J) {
1078  Splits.push_back(NewArg);
1079  NewArg.PartOffset += NewArg.VT.getStoreSize();
1080  }
1081  } else {
1082  Splits.push_back(Arg);
1083  }
1084  }
1085 }
1086 
1087 // Allocate special inputs passed in VGPRs.
1089  MachineFunction &MF,
1090  const SIRegisterInfo &TRI,
1091  SIMachineFunctionInfo &Info) {
1092  if (Info.hasWorkItemIDX()) {
1093  unsigned Reg = AMDGPU::VGPR0;
1094  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1095 
1096  CCInfo.AllocateReg(Reg);
1098  }
1099 
1100  if (Info.hasWorkItemIDY()) {
1101  unsigned Reg = AMDGPU::VGPR1;
1102  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1103 
1104  CCInfo.AllocateReg(Reg);
1106  }
1107 
1108  if (Info.hasWorkItemIDZ()) {
1109  unsigned Reg = AMDGPU::VGPR2;
1110  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1111 
1112  CCInfo.AllocateReg(Reg);
1114  }
1115 }
1116 
1117 // Try to allocate a VGPR at the end of the argument list, or if no argument
1118 // VGPRs are left, allocate a stack slot instead.
1120  ArrayRef<MCPhysReg> ArgVGPRs
1121  = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1122  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1123  if (RegIdx == ArgVGPRs.size()) {
1124  // Spill to stack required.
1125  int64_t Offset = CCInfo.AllocateStack(4, 4);
1126 
1127  return ArgDescriptor::createStack(Offset);
1128  }
1129 
1130  unsigned Reg = ArgVGPRs[RegIdx];
1131  Reg = CCInfo.AllocateReg(Reg);
1132  assert(Reg != AMDGPU::NoRegister);
1133 
1134  MachineFunction &MF = CCInfo.getMachineFunction();
1135  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1136  return ArgDescriptor::createRegister(Reg);
1137 }
1138 
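// Allocate one SGPR from the given register class for a special input,
// reporting a fatal error if no argument SGPRs are left.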
1140  const TargetRegisterClass *RC,
1141  unsigned NumArgRegs) {
1142  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1143  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1144  if (RegIdx == ArgSGPRs.size())
1145  report_fatal_error("ran out of SGPRs for arguments");
1146 
1147  unsigned Reg = ArgSGPRs[RegIdx];
1148  Reg = CCInfo.AllocateReg(Reg);
1149  assert(Reg != AMDGPU::NoRegister);
1150 
1151  MachineFunction &MF = CCInfo.getMachineFunction();
1152  MF.addLiveIn(Reg, RC);
1153  return ArgDescriptor::createRegister(Reg);
1154 }
1155 
1157  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1158 }
1159 
1161  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1162 }
1163 
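// Allocate the work item ID inputs for callable functions; each may end up in
// an argument VGPR or, if those are exhausted, in a stack slot.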
1165  MachineFunction &MF,
1166  const SIRegisterInfo &TRI,
1167  SIMachineFunctionInfo &Info) {
1168  if (Info.hasWorkItemIDX())
1169  Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
1170 
1171  if (Info.hasWorkItemIDY())
1172  Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
1173 
1174  if (Info.hasWorkItemIDZ())
1175  Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1176 }
1177 
1179  MachineFunction &MF,
1180  const SIRegisterInfo &TRI,
1181  SIMachineFunctionInfo &Info) {
1182  auto &ArgInfo = Info.getArgInfo();
1183 
1184  // TODO: Unify handling with private memory pointers.
1185 
1186  if (Info.hasDispatchPtr())
1187  ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1188 
1189  if (Info.hasQueuePtr())
1190  ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1191 
1192  if (Info.hasKernargSegmentPtr())
1193  ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1194 
1195  if (Info.hasDispatchID())
1196  ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1197 
1198  // flat_scratch_init is not applicable for non-kernel functions.
1199 
1200  if (Info.hasWorkGroupIDX())
1201  ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1202 
1203  if (Info.hasWorkGroupIDY())
1204  ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1205 
1206  if (Info.hasWorkGroupIDZ())
1207  ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
1208 
1209  if (Info.hasImplicitArgPtr())
1210  ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
1211 }
1212 
1213 // Allocate special inputs passed in user SGPRs.
1214 static void allocateHSAUserSGPRs(CCState &CCInfo,
1215  MachineFunction &MF,
1216  const SIRegisterInfo &TRI,
1217  SIMachineFunctionInfo &Info) {
1218  if (Info.hasImplicitBufferPtr()) {
1219  unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1220  MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1221  CCInfo.AllocateReg(ImplicitBufferPtrReg);
1222  }
1223 
1224  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1225  if (Info.hasPrivateSegmentBuffer()) {
1226  unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1227  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1228  CCInfo.AllocateReg(PrivateSegmentBufferReg);
1229  }
1230 
1231  if (Info.hasDispatchPtr()) {
1232  unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1233  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1234  CCInfo.AllocateReg(DispatchPtrReg);
1235  }
1236 
1237  if (Info.hasQueuePtr()) {
1238  unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1239  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1240  CCInfo.AllocateReg(QueuePtrReg);
1241  }
1242 
1243  if (Info.hasKernargSegmentPtr()) {
1244  unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1245  MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1246  CCInfo.AllocateReg(InputPtrReg);
1247  }
1248 
1249  if (Info.hasDispatchID()) {
1250  unsigned DispatchIDReg = Info.addDispatchID(TRI);
1251  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1252  CCInfo.AllocateReg(DispatchIDReg);
1253  }
1254 
1255  if (Info.hasFlatScratchInit()) {
1256  unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1257  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1258  CCInfo.AllocateReg(FlatScratchInitReg);
1259  }
1260 
1261  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1262  // these from the dispatch pointer.
1263 }
1264 
1265 // Allocate special input registers that are initialized per-wave.
1266 static void allocateSystemSGPRs(CCState &CCInfo,
1267  MachineFunction &MF,
1268  SIMachineFunctionInfo &Info,
1269  CallingConv::ID CallConv,
1270  bool IsShader) {
1271  if (Info.hasWorkGroupIDX()) {
1272  unsigned Reg = Info.addWorkGroupIDX();
1273  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1274  CCInfo.AllocateReg(Reg);
1275  }
1276 
1277  if (Info.hasWorkGroupIDY()) {
1278  unsigned Reg = Info.addWorkGroupIDY();
1279  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1280  CCInfo.AllocateReg(Reg);
1281  }
1282 
1283  if (Info.hasWorkGroupIDZ()) {
1284  unsigned Reg = Info.addWorkGroupIDZ();
1285  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1286  CCInfo.AllocateReg(Reg);
1287  }
1288 
1289  if (Info.hasWorkGroupInfo()) {
1290  unsigned Reg = Info.addWorkGroupInfo();
1291  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1292  CCInfo.AllocateReg(Reg);
1293  }
1294 
1295  if (Info.hasPrivateSegmentWaveByteOffset()) {
1296  // Scratch wave offset passed in system SGPR.
1297  unsigned PrivateSegmentWaveByteOffsetReg;
1298 
1299  if (IsShader) {
1300  PrivateSegmentWaveByteOffsetReg =
1302 
1303  // This is true if the scratch wave byte offset doesn't have a fixed
1304  // location.
1305  if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1306  PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1307  Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1308  }
1309  } else
1310  PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1311 
1312  MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1313  CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1314  }
1315 }
1316 
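// Pick the registers used for the scratch buffer resource and the scratch
// wave offset, preferring the preloaded argument registers when stack access
// is known to be required.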
1318  MachineFunction &MF,
1319  const SIRegisterInfo &TRI,
1320  SIMachineFunctionInfo &Info) {
1321  // Now that we've figured out where the scratch register inputs are, see if
1322  // we should reserve the arguments and use them directly.
1323  MachineFrameInfo &MFI = MF.getFrameInfo();
1324  bool HasStackObjects = MFI.hasStackObjects();
1325 
1326  // Record that we know we have non-spill stack objects so we don't need to
1327  // check all stack objects later.
1328  if (HasStackObjects)
1329  Info.setHasNonSpillStackObjects(true);
1330 
1331  // Everything live out of a block is spilled with fast regalloc, so it's
1332  // almost certain that spilling will be required.
1333  if (TM.getOptLevel() == CodeGenOpt::None)
1334  HasStackObjects = true;
1335 
1336  // For now assume stack access is needed in any callee functions, so we need
1337  // the scratch registers to pass in.
1338  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1339 
1340  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
1341  if (ST.isAmdCodeObjectV2(MF)) {
1342  if (RequiresStackAccess) {
1343  // If we have stack objects, we unquestionably need the private buffer
1344  // resource. For the Code Object V2 ABI, this will be the first 4 user
1345  // SGPR inputs. We can reserve those and use them directly.
1346 
1347  unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1349  Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1350 
1351  if (MFI.hasCalls()) {
1352  // If we have calls, we need to keep the frame register in a register
1353  // that won't be clobbered by a call, so ensure it is copied somewhere.
1354 
1355  // This is not a problem for the scratch wave offset, because the same
1356  // registers are reserved in all functions.
1357 
1358  // FIXME: Nothing is really ensuring this is a call preserved register,
1359  // it's just selected from the end so it happens to be.
1360  unsigned ReservedOffsetReg
1362  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1363  } else {
1364  unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1366  Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1367  }
1368  } else {
1369  unsigned ReservedBufferReg
1371  unsigned ReservedOffsetReg
1373 
1374  // We tentatively reserve the last registers (skipping the last two
1375  // which may contain VCC). After register allocation, we'll replace
1376  // these with the ones immediately after those which were really
1377  // allocated. In the prologue copies will be inserted from the argument
1378  // to these reserved registers.
1379  Info.setScratchRSrcReg(ReservedBufferReg);
1380  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1381  }
1382  } else {
1383  unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1384 
1385  // Without HSA, relocations are used for the scratch pointer and the
1386  // buffer resource setup is always inserted in the prologue. Scratch wave
1387  // offset is still in an input SGPR.
1388  Info.setScratchRSrcReg(ReservedBufferReg);
1389 
1390  if (HasStackObjects && !MFI.hasCalls()) {
1391  unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1393  Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1394  } else {
1395  unsigned ReservedOffsetReg
1397  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1398  }
1399  }
1400 }
1401 
1404  return !Info->isEntryFunction();
1405 }
1406 
1408 
1409 }
1410 
1412  MachineBasicBlock *Entry,
1413  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1414  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1415 
1416  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1417  if (!IStart)
1418  return;
1419 
1421  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1422  MachineBasicBlock::iterator MBBI = Entry->begin();
1423  for (const MCPhysReg *I = IStart; *I; ++I) {
1424  const TargetRegisterClass *RC = nullptr;
1425  if (AMDGPU::SReg_64RegClass.contains(*I))
1426  RC = &AMDGPU::SGPR_64RegClass;
1427  else if (AMDGPU::SReg_32RegClass.contains(*I))
1428  RC = &AMDGPU::SGPR_32RegClass;
1429  else
1430  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1431 
1432  unsigned NewVR = MRI->createVirtualRegister(RC);
1433  // Create copy from CSR to a virtual register.
1434  Entry->addLiveIn(*I);
1435  BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1436  .addReg(*I);
1437 
1438  // Insert the copy-back instructions right before the terminator.
1439  for (auto *Exit : Exits)
1440  BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1441  TII->get(TargetOpcode::COPY), *I)
1442  .addReg(NewVR);
1443  }
1444 }
1445 
1447  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1448  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1449  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1450  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1451 
1452  MachineFunction &MF = DAG.getMachineFunction();
1453  FunctionType *FType = MF.getFunction()->getFunctionType();
1455  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
1456 
1457  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
1458  const Function *Fn = MF.getFunction();
1459  DiagnosticInfoUnsupported NoGraphicsHSA(
1460  *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
1461  DAG.getContext()->diagnose(NoGraphicsHSA);
1462  return DAG.getEntryNode();
1463  }
1464 
1465  // Create stack objects that are used for emitting debugger prologue if
1466  // "amdgpu-debugger-emit-prologue" attribute was specified.
1467  if (ST.debuggerEmitPrologue())
1468  createDebuggerPrologueStackObjects(MF);
1469 
1472  BitVector Skipped(Ins.size());
1473  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1474  *DAG.getContext());
1475 
1476  bool IsShader = AMDGPU::isShader(CallConv);
1477  bool IsKernel = AMDGPU::isKernel(CallConv);
1478  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
1479 
1480  if (!IsEntryFunc) {
1481  // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1482  // this when allocating argument fixed offsets.
1483  CCInfo.AllocateStack(4, 4);
1484  }
1485 
1486  if (IsShader) {
1487  processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1488 
1489  // At least one interpolation mode must be enabled or else the GPU will
1490  // hang.
1491  //
1492  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1493  // set PSInputAddr, the user wants to enable some bits after the compilation
1494  // based on run-time states. Since we can't know what the final PSInputEna
1495  // will look like, so we shouldn't do anything here and the user should take
1496  // will look like, we shouldn't do anything here and the user should take
1497  //
1498  // Otherwise, the following restrictions apply:
1499  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1500  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1501  // enabled too.
1502  if (CallConv == CallingConv::AMDGPU_PS) {
1503  if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1504  ((Info->getPSInputAddr() & 0xF) == 0 &&
1505  Info->isPSInputAllocated(11))) {
1506  CCInfo.AllocateReg(AMDGPU::VGPR0);
1507  CCInfo.AllocateReg(AMDGPU::VGPR1);
1508  Info->markPSInputAllocated(0);
1509  Info->markPSInputEnabled(0);
1510  }
1511  if (Subtarget->isAmdPalOS()) {
1512  // For isAmdPalOS, the user does not enable some bits after compilation
1513  // based on run-time states; the register values being generated here are
1514  // the final ones set in hardware. Therefore we need to apply the
1515  // workaround to PSInputAddr and PSInputEnable together. (The case where
1516  // a bit is set in PSInputAddr but not PSInputEnable is where the
1517  // frontend set up an input arg for a particular interpolation mode, but
1518  // nothing uses that input arg. Really we should have an earlier pass
1519  // that removes such an arg.)
1520  unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1521  if ((PsInputBits & 0x7F) == 0 ||
1522  ((PsInputBits & 0xF) == 0 &&
1523  (PsInputBits >> 11 & 1)))
1524  Info->markPSInputEnabled(
1526  }
1527  }
1528 
1529  assert(!Info->hasDispatchPtr() &&
1530  !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1531  !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1532  !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1533  !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1534  !Info->hasWorkItemIDZ());
1535  } else if (IsKernel) {
1536  assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
1537  } else {
1538  Splits.append(Ins.begin(), Ins.end());
1539  }
1540 
1541  if (IsEntryFunc) {
1542  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
1543  allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
1544  }
1545 
1546  if (IsKernel) {
1547  analyzeFormalArgumentsCompute(CCInfo, Ins);
1548  } else {
1549  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1550  CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1551  }
1552 
1553  SmallVector<SDValue, 16> Chains;
1554 
1555  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
1556  const ISD::InputArg &Arg = Ins[i];
1557  if (Skipped[i]) {
1558  InVals.push_back(DAG.getUNDEF(Arg.VT));
1559  continue;
1560  }
1561 
1562  CCValAssign &VA = ArgLocs[ArgIdx++];
1563  MVT VT = VA.getLocVT();
1564 
1565  if (IsEntryFunc && VA.isMemLoc()) {
1566  VT = Ins[i].VT;
1567  EVT MemVT = VA.getLocVT();
1568 
1569  const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
1570  VA.getLocMemOffset();
1571  Info->setABIArgOffset(Offset + MemVT.getStoreSize());
1572 
1573  // The first 36 bytes of the input buffer contain information about
1574  // thread group and global sizes.
1575  SDValue Arg = lowerKernargMemParameter(
1576  DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
1577  Chains.push_back(Arg.getValue(1));
1578 
1579  auto *ParamTy =
1580  dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
1582  ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1583  // On SI local pointers are just offsets into LDS, so they are always
1584  // less than 16 bits. On CI and newer they could potentially be
1585  // real pointers, so we can't guarantee their size.
1586  Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
1587  DAG.getValueType(MVT::i16));
1588  }
1589 
1590  InVals.push_back(Arg);
1591  continue;
1592  } else if (!IsEntryFunc && VA.isMemLoc()) {
1593  SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
1594  InVals.push_back(Val);
1595  if (!Arg.Flags.isByVal())
1596  Chains.push_back(Val.getValue(1));
1597  continue;
1598  }
1599 
1600  assert(VA.isRegLoc() && "Parameter must be in a register!");
1601 
1602  unsigned Reg = VA.getLocReg();
1603  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
1604  EVT ValVT = VA.getValVT();
1605 
1606  Reg = MF.addLiveIn(Reg, RC);
1607  SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1608 
1609  if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
1610  // The return object should be reasonably addressable.
1611 
1612  // FIXME: This helps when the return is a real sret. If it is an
1613  // automatically inserted sret (i.e. CanLowerReturn returns false), an
1614  // extra copy is inserted in SelectionDAGBuilder which obscures this.
1615  unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
1616  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1617  DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
1618  }
1619 
1620  // If this is an 8 or 16-bit value, it is really passed promoted
1621  // to 32 bits. Insert an assert[sz]ext to capture this, then
1622  // truncate to the right size.
1623  switch (VA.getLocInfo()) {
1624  case CCValAssign::Full:
1625  break;
1626  case CCValAssign::BCvt:
1627  Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
1628  break;
1629  case CCValAssign::SExt:
1630  Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
1631  DAG.getValueType(ValVT));
1632  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1633  break;
1634  case CCValAssign::ZExt:
1635  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1636  DAG.getValueType(ValVT));
1637  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1638  break;
1639  case CCValAssign::AExt:
1640  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1641  break;
1642  default:
1643  llvm_unreachable("Unknown loc info!");
1644  }
1645 
1646  if (IsShader && Arg.VT.isVector()) {
1647  // Build a vector from the registers
1648  Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
1649  unsigned NumElements = ParamType->getVectorNumElements();
1650 
1652  Regs.push_back(Val);
1653  for (unsigned j = 1; j != NumElements; ++j) {
1654  Reg = ArgLocs[ArgIdx++].getLocReg();
1655  Reg = MF.addLiveIn(Reg, RC);
1656 
1657  SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1658  Regs.push_back(Copy);
1659  }
1660 
1661  // Fill up the missing vector elements
1662  NumElements = Arg.VT.getVectorNumElements() - NumElements;
1663  Regs.append(NumElements, DAG.getUNDEF(VT));
1664 
1665  InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
1666  continue;
1667  }
1668 
1669  InVals.push_back(Val);
1670  }
1671 
1672  if (!IsEntryFunc) {
1673  // Special inputs come after user arguments.
1674  allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
1675  }
1676 
1677  // Start adding system SGPRs.
1678  if (IsEntryFunc) {
1679  allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
1680  } else {
1681  CCInfo.AllocateReg(Info->getScratchRSrcReg());
1682  CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
1683  CCInfo.AllocateReg(Info->getFrameOffsetReg());
1684  allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
1685  }
1686 
1687  auto &ArgUsageInfo =
1689  ArgUsageInfo.setFuncArgInfo(*MF.getFunction(), Info->getArgInfo());
1690 
1691  unsigned StackArgSize = CCInfo.getNextStackOffset();
1692  Info->setBytesInStackArgArea(StackArgSize);
1693 
1694  return Chains.empty() ? Chain :
1695  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
1696 }
1697 
1698 // TODO: If return values can't fit in registers, we should return as many as
1699 // possible in registers before passing on stack.
1701  CallingConv::ID CallConv,
1702  MachineFunction &MF, bool IsVarArg,
1703  const SmallVectorImpl<ISD::OutputArg> &Outs,
1704  LLVMContext &Context) const {
1705  // Replacing returns with sret/stack usage doesn't make sense for shaders.
1706  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
1707  // for shaders. Vector types should be explicitly handled by CC.
1708  if (AMDGPU::isEntryFunctionCC(CallConv))
1709  return true;
1710 
1712  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
1713  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
1714 }
1715 
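// Lower outgoing return values. Kernel returns are delegated to the generic
// AMDGPU lowering; shader and callable-function returns are assigned to
// registers by the return calling convention below.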
1716 SDValue
1718  bool isVarArg,
1719  const SmallVectorImpl<ISD::OutputArg> &Outs,
1720  const SmallVectorImpl<SDValue> &OutVals,
1721  const SDLoc &DL, SelectionDAG &DAG) const {
1722  MachineFunction &MF = DAG.getMachineFunction();
1724 
1725  if (AMDGPU::isKernel(CallConv)) {
1726  return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
1727  OutVals, DL, DAG);
1728  }
1729 
1730  bool IsShader = AMDGPU::isShader(CallConv);
1731 
1732  Info->setIfReturnsVoid(Outs.size() == 0);
1733  bool IsWaveEnd = Info->returnsVoid() && IsShader;
1734 
1736  SmallVector<SDValue, 48> SplitVals;
1737 
1738  // Split vectors into their elements.
1739  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
1740  const ISD::OutputArg &Out = Outs[i];
1741 
1742  if (IsShader && Out.VT.isVector()) {
1743  MVT VT = Out.VT.getVectorElementType();
1744  ISD::OutputArg NewOut = Out;
1745  NewOut.Flags.setSplit();
1746  NewOut.VT = VT;
1747 
1748  // We want the original number of vector elements here, e.g.
1749  // three or five, not four or eight.
1750  unsigned NumElements = Out.ArgVT.getVectorNumElements();
1751 
1752  for (unsigned j = 0; j != NumElements; ++j) {
1753  SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
1754  DAG.getConstant(j, DL, MVT::i32));
1755  SplitVals.push_back(Elem);
1756  Splits.push_back(NewOut);
1757  NewOut.PartOffset += NewOut.VT.getStoreSize();
1758  }
1759  } else {
1760  SplitVals.push_back(OutVals[i]);
1761  Splits.push_back(Out);
1762  }
1763  }
1764 
1765  // CCValAssign - represent the assignment of the return value to a location.
1767 
1768  // CCState - Info about the registers and stack slots.
1769  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1770  *DAG.getContext());
1771 
1772  // Analyze outgoing return values.
1773  CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
1774 
1775  SDValue Flag;
1776  SmallVector<SDValue, 48> RetOps;
1777  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1778 
1779  // Add return address for callable functions.
1780  if (!Info->isEntryFunction()) {
1781  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1782  SDValue ReturnAddrReg = CreateLiveInRegister(
1783  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
1784 
1785  // FIXME: Should be able to use a vreg here, but need a way to prevent it
1786  // from being allocated to a CSR.
1787 
1788  SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
1789  MVT::i64);
1790 
1791  Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
1792  Flag = Chain.getValue(1);
1793 
1794  RetOps.push_back(PhysReturnAddrReg);
1795  }
1796 
1797  // Copy the result values into the output registers.
1798  for (unsigned i = 0, realRVLocIdx = 0;
1799  i != RVLocs.size();
1800  ++i, ++realRVLocIdx) {
1801  CCValAssign &VA = RVLocs[i];
1802  assert(VA.isRegLoc() && "Can only return in registers!");
1803  // TODO: Partially return in registers if return values don't fit.
1804 
1805  SDValue Arg = SplitVals[realRVLocIdx];
1806 
1807  // Copied from other backends.
1808  switch (VA.getLocInfo()) {
1809  case CCValAssign::Full:
1810  break;
1811  case CCValAssign::BCvt:
1812  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
1813  break;
1814  case CCValAssign::SExt:
1815  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
1816  break;
1817  case CCValAssign::ZExt:
1818  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
1819  break;
1820  case CCValAssign::AExt:
1821  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
1822  break;
1823  default:
1824  llvm_unreachable("Unknown loc info!");
1825  }
1826 
1827  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
1828  Flag = Chain.getValue(1);
1829  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
1830  }
1831 
1832  // FIXME: Does sret work properly?
1833  if (!Info->isEntryFunction()) {
1834  const SIRegisterInfo *TRI
1835  = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
1836  const MCPhysReg *I =
1838  if (I) {
1839  for (; *I; ++I) {
1840  if (AMDGPU::SReg_64RegClass.contains(*I))
1841  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
1842  else if (AMDGPU::SReg_32RegClass.contains(*I))
1843  RetOps.push_back(DAG.getRegister(*I, MVT::i32));
1844  else
1845  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1846  }
1847  }
1848  }
1849 
1850  // Update chain and glue.
1851  RetOps[0] = Chain;
1852  if (Flag.getNode())
1853  RetOps.push_back(Flag);
1854 
1855  unsigned Opc = AMDGPUISD::ENDPGM;
1856  if (!IsWaveEnd)
1858  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
1859 }
1860 
1862  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
1863  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1864  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
1865  SDValue ThisVal) const {
1866  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
1867 
1868  // Assign locations to each value returned by this call.
1870  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
1871  *DAG.getContext());
1872  CCInfo.AnalyzeCallResult(Ins, RetCC);
1873 
1874  // Copy all of the result registers out of their specified physreg.
1875  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1876  CCValAssign VA = RVLocs[i];
1877  SDValue Val;
1878 
1879  if (VA.isRegLoc()) {
1880  Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
1881  Chain = Val.getValue(1);
1882  InFlag = Val.getValue(2);
1883  } else if (VA.isMemLoc()) {
1884  report_fatal_error("TODO: return values in memory");
1885  } else
1886  llvm_unreachable("unknown argument location type");
1887 
1888  switch (VA.getLocInfo()) {
1889  case CCValAssign::Full:
1890  break;
1891  case CCValAssign::BCvt:
1892  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
1893  break;
1894  case CCValAssign::ZExt:
1895  Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
1896  DAG.getValueType(VA.getValVT()));
1897  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
1898  break;
1899  case CCValAssign::SExt:
1900  Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
1901  DAG.getValueType(VA.getValVT()));
1902  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
1903  break;
1904  case CCValAssign::AExt:
1905  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
1906  break;
1907  default:
1908  llvm_unreachable("Unknown loc info!");
1909  }
1910 
1911  InVals.push_back(Val);
1912  }
1913 
1914  return Chain;
1915 }
1916 
1917 // Add code to pass special inputs required depending on used features separate
1918 // from the explicit user arguments present in the IR.
1920  CallLoweringInfo &CLI,
1921  const SIMachineFunctionInfo &Info,
1922  SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
1923  SmallVectorImpl<SDValue> &MemOpChains,
1924  SDValue Chain,
1925  SDValue StackPtr) const {
1926  // If we don't have a call site, this was a call inserted by
1927  // legalization. These can never use special inputs.
1928  if (!CLI.CS)
1929  return;
1930 
1931  const Function *CalleeFunc = CLI.CS.getCalledFunction();
1932  assert(CalleeFunc);
1933 
1934  SelectionDAG &DAG = CLI.DAG;
1935  const SDLoc &DL = CLI.DL;
1936 
1937  const SISubtarget *ST = getSubtarget();
1938  const SIRegisterInfo *TRI = ST->getRegisterInfo();
1939 
1940  auto &ArgUsageInfo =
1942  const AMDGPUFunctionArgInfo &CalleeArgInfo
1943  = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
1944 
1945  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
1946 
1947  // TODO: Unify with private memory register handling. This is complicated by
1948  // the fact that at least in kernels, the input argument is not necessarily
1949  // in the same location as the input.
1962  };
1963 
1964  for (auto InputID : InputRegs) {
1965  const ArgDescriptor *OutgoingArg;
1966  const TargetRegisterClass *ArgRC;
1967 
1968  std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
1969  if (!OutgoingArg)
1970  continue;
1971 
1972  const ArgDescriptor *IncomingArg;
1973  const TargetRegisterClass *IncomingArgRC;
1974  std::tie(IncomingArg, IncomingArgRC)
1975  = CallerArgInfo.getPreloadedValue(InputID);
1976  assert(IncomingArgRC == ArgRC);
1977 
1978  // All special arguments are ints for now.
1979  EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
1980  SDValue InputReg;
1981 
1982  if (IncomingArg) {
1983  InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
1984  } else {
1985  // The implicit arg ptr is special because it doesn't have a corresponding
1986  // input for kernels, and is computed from the kernarg segment pointer.
1987  assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1988  InputReg = getImplicitArgPtr(DAG, DL);
1989  }
1990 
1991  if (OutgoingArg->isRegister()) {
1992  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
1993  } else {
1994  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
1995  InputReg,
1996  OutgoingArg->getStackOffset());
1997  MemOpChains.push_back(ArgStore);
1998  }
1999  }
2000 }
2001 
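// Tail-call optimization can only be guaranteed for the fast calling
// convention.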
2003  return CC == CallingConv::Fast;
2004 }
2005 
2006 /// Return true if we might ever do TCO for calls with this calling convention.
2008  switch (CC) {
2009  case CallingConv::C:
2010  return true;
2011  default:
2012  return canGuaranteeTCO(CC);
2013  }
2014 }
2015 
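// A call is eligible for tail-call optimization only if the calling
// conventions, preserved register sets, and stack argument usage of caller
// and callee are compatible.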
2017  SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2018  const SmallVectorImpl<ISD::OutputArg> &Outs,
2019  const SmallVectorImpl<SDValue> &OutVals,
2020  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2021  if (!mayTailCallThisCC(CalleeCC))
2022  return false;
2023 
2024  MachineFunction &MF = DAG.getMachineFunction();
2025  const Function *CallerF = MF.getFunction();
2026  CallingConv::ID CallerCC = CallerF->getCallingConv();
2027  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2028  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2029 
2030  // Kernels aren't callable, and don't have a live-in return address, so it
2031  // doesn't make sense to do a tail call with entry functions.
2032  if (!CallerPreserved)
2033  return false;
2034 
2035  bool CCMatch = CallerCC == CalleeCC;
2036 
2038  if (canGuaranteeTCO(CalleeCC) && CCMatch)
2039  return true;
2040  return false;
2041  }
2042 
2043  // TODO: Can we handle var args?
2044  if (IsVarArg)
2045  return false;
2046 
2047  for (const Argument &Arg : CallerF->args()) {
2048  if (Arg.hasByValAttr())
2049  return false;
2050  }
2051 
2052  LLVMContext &Ctx = *DAG.getContext();
2053 
2054  // Check that the call results are passed in the same way.
2055  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2056  CCAssignFnForCall(CalleeCC, IsVarArg),
2057  CCAssignFnForCall(CallerCC, IsVarArg)))
2058  return false;
2059 
2060  // The callee has to preserve all registers the caller needs to preserve.
2061  if (!CCMatch) {
2062  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2063  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2064  return false;
2065  }
2066 
2067  // Nothing more to check if the callee is taking no arguments.
2068  if (Outs.empty())
2069  return true;
2070 
2071  SmallVector<CCValAssign, 16> ArgLocs;
2072  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2073 
2074  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2075 
2076  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2077  // If the stack arguments for this call do not fit into our own save area then
2078  // the call cannot be made tail.
2079  // TODO: Is this really necessary?
2080  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2081  return false;
2082 
2083  const MachineRegisterInfo &MRI = MF.getRegInfo();
2084  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2085 }
2086 
2087 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2088  if (!CI->isTailCall())
2089  return false;
2090 
2091  const Function *ParentFn = CI->getParent()->getParent();
2092  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2093  return false;
2094 
2095  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2096  return (Attr.getValueAsString() != "true");
2097 }
2098 
2099 // The wave scratch offset register is used as the global base pointer.
2100 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2101  SmallVectorImpl<SDValue> &InVals) const {
2102  SelectionDAG &DAG = CLI.DAG;
2103  const SDLoc &DL = CLI.DL;
2104  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2105  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2106  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2107  SDValue Chain = CLI.Chain;
2108  SDValue Callee = CLI.Callee;
2109  bool &IsTailCall = CLI.IsTailCall;
2110  CallingConv::ID CallConv = CLI.CallConv;
2111  bool IsVarArg = CLI.IsVarArg;
2112  bool IsSibCall = false;
2113  bool IsThisReturn = false;
2114  MachineFunction &MF = DAG.getMachineFunction();
2115 
2116  if (IsVarArg) {
2117  return lowerUnhandledCall(CLI, InVals,
2118  "unsupported call to variadic function ");
2119  }
2120 
2121  if (!CLI.CS.getCalledFunction()) {
2122  return lowerUnhandledCall(CLI, InVals,
2123  "unsupported indirect call to function ");
2124  }
2125 
2126  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2127  return lowerUnhandledCall(CLI, InVals,
2128  "unsupported required tail call to function ");
2129  }
2130 
2131  // The first 4 bytes are reserved for the callee's emergency stack slot.
2132  const unsigned CalleeUsableStackOffset = 4;
2133 
2134  if (IsTailCall) {
2135  IsTailCall = isEligibleForTailCallOptimization(
2136  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2137  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2138  report_fatal_error("failed to perform tail call elimination on a call "
2139  "site marked musttail");
2140  }
2141 
2142  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2143 
2144  // A sibling call is one where we're under the usual C ABI and not planning
2145  // to change that but can still do a tail call:
2146  if (!TailCallOpt && IsTailCall)
2147  IsSibCall = true;
2148 
2149  if (IsTailCall)
2150  ++NumTailCalls;
2151  }
2152 
2153  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
2154  // FIXME: Remove this hack for function pointer types after removing
2155  // support of the old address space mapping. In the new address space
2156  // mapping the pointer in the default address space is 64-bit, so it
2157  // does not need this hack.
2158  if (Callee.getValueType() == MVT::i32) {
2159  const GlobalValue *GV = GA->getGlobal();
2160  Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false,
2161  GA->getTargetFlags());
2162  }
2163  }
2164  assert(Callee.getValueType() == MVT::i64);
2165 
2166  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2167 
2168  // Analyze operands of the call, assigning locations to each operand.
2169  SmallVector<CCValAssign, 16> ArgLocs;
2170  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2171  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
2172  CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2173 
2174  // Get a count of how many bytes are to be pushed on the stack.
2175  unsigned NumBytes = CCInfo.getNextStackOffset();
2176 
2177  if (IsSibCall) {
2178  // Since we're not changing the ABI to make this a tail call, the memory
2179  // operands are already available in the caller's incoming argument space.
2180  NumBytes = 0;
2181  }
2182 
2183  // FPDiff is the byte offset of the call's argument area from the callee's.
2184  // Stores to callee stack arguments will be placed in FixedStackSlots offset
2185  // by this amount for a tail call. In a sibling call it must be 0 because the
2186  // caller will deallocate the entire stack and the callee still expects its
2187  // arguments to begin at SP+0. Completely unused for non-tail calls.
2188  int32_t FPDiff = 0;
2189  MachineFrameInfo &MFI = MF.getFrameInfo();
2190  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2191 
2192  SDValue CallerSavedFP;
2193 
2194  // Adjust the stack pointer for the new arguments...
2195  // These operations are automatically eliminated by the prolog/epilog pass
2196  if (!IsSibCall) {
2197  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
2198 
2199  unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2200 
2201  // In the HSA case, this should be an identity copy.
2202  SDValue ScratchRSrcReg
2203  = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2204  RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2205 
2206  // TODO: Don't hardcode these registers and get from the callee function.
2207  SDValue ScratchWaveOffsetReg
2208  = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2209  RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
2210 
2211  if (!Info->isEntryFunction()) {
2212  // Avoid clobbering this function's FP value. In the current convention the
2213  // callee will overwrite it, so save/restore it around the call site.
2214  CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2215  Info->getFrameOffsetReg(), MVT::i32);
2216  }
2217  }
2218 
2219  // Stack pointer relative accesses are done by changing the offset SGPR. This
2220  // is just the VGPR offset component.
2221  SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
2222 
2223  SmallVector<SDValue, 8> MemOpChains;
2224  MVT PtrVT = MVT::i32;
2225 
2226  // Walk the register/memloc assignments, inserting copies/loads.
2227  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2228  ++i, ++realArgIdx) {
2229  CCValAssign &VA = ArgLocs[i];
2230  SDValue Arg = OutVals[realArgIdx];
2231 
2232  // Promote the value if needed.
2233  switch (VA.getLocInfo()) {
2234  case CCValAssign::Full:
2235  break;
2236  case CCValAssign::BCvt:
2237  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2238  break;
2239  case CCValAssign::ZExt:
2240  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2241  break;
2242  case CCValAssign::SExt:
2243  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2244  break;
2245  case CCValAssign::AExt:
2246  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2247  break;
2248  case CCValAssign::FPExt:
2249  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2250  break;
2251  default:
2252  llvm_unreachable("Unknown loc info!");
2253  }
2254 
2255  if (VA.isRegLoc()) {
2256  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2257  } else {
2258  assert(VA.isMemLoc());
2259 
2260  SDValue DstAddr;
2261  MachinePointerInfo DstInfo;
2262 
2263  unsigned LocMemOffset = VA.getLocMemOffset();
2264  int32_t Offset = LocMemOffset;
2265  SDValue PtrOff = DAG.getConstant(Offset, DL, MVT::i32);
2266  PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
2267 
2268  if (IsTailCall) {
2269  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2270  unsigned OpSize = Flags.isByVal() ?
2271  Flags.getByValSize() : VA.getValVT().getStoreSize();
2272 
2273  Offset = Offset + FPDiff;
2274  int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2275 
2276  DstAddr = DAG.getFrameIndex(FI, PtrVT);
2277  DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, DstAddr, StackPtr);
2278  DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2279 
2280  // Make sure any stack arguments overlapping with where we're storing
2281  // are loaded before this eventual operation. Otherwise they'll be
2282  // clobbered.
2283 
2284  // FIXME: Why is this really necessary? This seems to just result in a
2285  // lot of code to copy the stack arguments and write them back to the
2286  // same locations, which are supposed to be immutable?
2287  Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2288  } else {
2289  DstAddr = PtrOff;
2290  DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
2291  }
2292 
2293  if (Outs[i].Flags.isByVal()) {
2294  SDValue SizeNode =
2295  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2296  SDValue Cpy = DAG.getMemcpy(
2297  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2298  /*isVol = */ false, /*AlwaysInline = */ true,
2299  /*isTailCall = */ false, DstInfo,
2300  MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
2301  *DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))));
2302 
2303  MemOpChains.push_back(Cpy);
2304  } else {
2305  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
2306  MemOpChains.push_back(Store);
2307  }
2308  }
2309  }
2310 
2311  // Copy special input registers after user input arguments.
2312  passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
2313 
2314  if (!MemOpChains.empty())
2315  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2316 
2317  // Build a sequence of copy-to-reg nodes chained together with token chain
2318  // and flag operands which copy the outgoing args into the appropriate regs.
2319  SDValue InFlag;
2320  for (auto &RegToPass : RegsToPass) {
2321  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2322  RegToPass.second, InFlag);
2323  InFlag = Chain.getValue(1);
2324  }
2325 
2326 
2327  SDValue PhysReturnAddrReg;
2328  if (IsTailCall) {
2329  // Since the return is being combined with the call, we need to pass on the
2330  // return address.
2331 
2332  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2333  SDValue ReturnAddrReg = CreateLiveInRegister(
2334  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2335 
2336  PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2337  MVT::i64);
2338  Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2339  InFlag = Chain.getValue(1);
2340  }
2341 
2342  // We don't usually want to end the call-sequence here because we would tidy
2343  // the frame up *after* the call. However, in the ABI-changing tail-call case
2344  // we've carefully laid out the parameters so that when sp is reset they'll be
2345  // in the correct location.
2346  if (IsTailCall && !IsSibCall) {
2347  Chain = DAG.getCALLSEQ_END(Chain,
2348  DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2349  DAG.getTargetConstant(0, DL, MVT::i32),
2350  InFlag, DL);
2351  InFlag = Chain.getValue(1);
2352  }
2353 
2354  std::vector<SDValue> Ops;
2355  Ops.push_back(Chain);
2356  Ops.push_back(Callee);
2357 
2358  if (IsTailCall) {
2359  // Each tail call may have to adjust the stack by a different amount, so
2360  // this information must travel along with the operation for eventual
2361  // consumption by emitEpilogue.
2362  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
2363 
2364  Ops.push_back(PhysReturnAddrReg);
2365  }
2366 
2367  // Add argument registers to the end of the list so that they are known live
2368  // into the call.
2369  for (auto &RegToPass : RegsToPass) {
2370  Ops.push_back(DAG.getRegister(RegToPass.first,
2371  RegToPass.second.getValueType()));
2372  }
2373 
2374  // Add a register mask operand representing the call-preserved registers.
2375 
2377  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2378  assert(Mask && "Missing call preserved mask for calling convention");
2379  Ops.push_back(DAG.getRegisterMask(Mask));
2380 
2381  if (InFlag.getNode())
2382  Ops.push_back(InFlag);
2383 
2384  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2385 
2386  // If we're doing a tail call, use a TC_RETURN here rather than an
2387  // actual call instruction.
2388  if (IsTailCall) {
2389  MFI.setHasTailCall();
2390  return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
2391  }
2392 
2393  // Returns a chain and a flag for retval copy to use.
2394  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2395  Chain = Call.getValue(0);
2396  InFlag = Call.getValue(1);
2397 
2398  if (CallerSavedFP) {
2399  SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2400  Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2401  InFlag = Chain.getValue(1);
2402  }
2403 
2404  uint64_t CalleePopBytes = NumBytes;
2405  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
2406  DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2407  InFlag, DL);
2408  if (!Ins.empty())
2409  InFlag = Chain.getValue(1);
2410 
2411  // Handle result values, copying them out of physregs into vregs that we
2412  // return.
2413  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2414  InVals, IsThisReturn,
2415  IsThisReturn ? OutVals[0] : SDValue());
2416 }
2417 
2418 unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2419  SelectionDAG &DAG) const {
2420  unsigned Reg = StringSwitch<unsigned>(RegName)
2421  .Case("m0", AMDGPU::M0)
2422  .Case("exec", AMDGPU::EXEC)
2423  .Case("exec_lo", AMDGPU::EXEC_LO)
2424  .Case("exec_hi", AMDGPU::EXEC_HI)
2425  .Case("flat_scratch", AMDGPU::FLAT_SCR)
2426  .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2427  .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2428  .Default(AMDGPU::NoRegister);
2429 
2430  if (Reg == AMDGPU::NoRegister) {
2431  report_fatal_error(Twine("invalid register name \""
2432  + StringRef(RegName) + "\"."));
2433 
2434  }
2435 
2437  Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2438  report_fatal_error(Twine("invalid register \""
2439  + StringRef(RegName) + "\" for subtarget."));
2440  }
2441 
2442  switch (Reg) {
2443  case AMDGPU::M0:
2444  case AMDGPU::EXEC_LO:
2445  case AMDGPU::EXEC_HI:
2446  case AMDGPU::FLAT_SCR_LO:
2447  case AMDGPU::FLAT_SCR_HI:
2448  if (VT.getSizeInBits() == 32)
2449  return Reg;
2450  break;
2451  case AMDGPU::EXEC:
2452  case AMDGPU::FLAT_SCR:
2453  if (VT.getSizeInBits() == 64)
2454  return Reg;
2455  break;
2456  default:
2457  llvm_unreachable("missing register type checking");
2458  }
2459 
2460  report_fatal_error(Twine("invalid type for register \""
2461  + StringRef(RegName) + "\"."));
2462 }
2463 
2464 // If kill is not the last instruction, split the block so kill is always a
2465 // proper terminator.
2466 MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
2467  MachineBasicBlock *BB) const {
2468  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2469 
2470  MachineBasicBlock::iterator SplitPoint(&MI);
2471  ++SplitPoint;
2472 
2473  if (SplitPoint == BB->end()) {
2474  // Don't bother with a new block.
2476  return BB;
2477  }
2478 
2479  MachineFunction *MF = BB->getParent();
2480  MachineBasicBlock *SplitBB
2482  = MF->CreateMachineBasicBlock(BB->getBasicBlock());
2483  MF->insert(++MachineFunction::iterator(BB), SplitBB);
2484  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2485 
2486  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
2487  BB->addSuccessor(SplitBB);
2488 
2490  return SplitBB;
2491 }
2492 
2493 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2494 // wavefront. If the value is uniform and just happens to be in a VGPR, this
2495 // will only do one iteration. In the worst case, this will loop 64 times.
2496 //
2497 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
2498 static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
2499  const SIInstrInfo *TII,
2500  MachineRegisterInfo &MRI,
2501  MachineBasicBlock &OrigBB,
2502  MachineBasicBlock &LoopBB,
2503  const DebugLoc &DL,
2504  const MachineOperand &IdxReg,
2505  unsigned InitReg,
2506  unsigned ResultReg,
2507  unsigned PhiReg,
2508  unsigned InitSaveExecReg,
2509  int Offset,
2510  bool UseGPRIdxMode) {
2511  MachineBasicBlock::iterator I = LoopBB.begin();
2512 
2513  unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2514  unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2515  unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2516  unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2517 
2518  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2519  .addReg(InitReg)
2520  .addMBB(&OrigBB)
2521  .addReg(ResultReg)
2522  .addMBB(&LoopBB);
2523 
2524  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2525  .addReg(InitSaveExecReg)
2526  .addMBB(&OrigBB)
2527  .addReg(NewExec)
2528  .addMBB(&LoopBB);
2529 
2530  // Read the next variant <- also loop target.
2531  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2532  .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2533 
2534  // Compare the just read M0 value to all possible Idx values.
2535  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2536  .addReg(CurrentIdxReg)
2537  .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
2538 
2539  if (UseGPRIdxMode) {
2540  unsigned IdxReg;
2541  if (Offset == 0) {
2542  IdxReg = CurrentIdxReg;
2543  } else {
2544  IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2545  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2546  .addReg(CurrentIdxReg, RegState::Kill)
2547  .addImm(Offset);
2548  }
2549 
2550  MachineInstr *SetIdx =
2551  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
2552  .addReg(IdxReg, RegState::Kill);
2553  SetIdx->getOperand(2).setIsUndef();
2554  } else {
2555  // Move index from VCC into M0
2556  if (Offset == 0) {
2557  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2558  .addReg(CurrentIdxReg, RegState::Kill);
2559  } else {
2560  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2561  .addReg(CurrentIdxReg, RegState::Kill)
2562  .addImm(Offset);
2563  }
2564  }
2565 
2566  // Update EXEC, save the original EXEC value to VCC.
2567  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2568  .addReg(CondReg, RegState::Kill);
2569 
2570  MRI.setSimpleHint(NewExec, CondReg);
2571 
2572  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
2573  MachineInstr *InsertPt =
2574  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
2575  .addReg(AMDGPU::EXEC)
2576  .addReg(NewExec);
2577 
2578  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2579  // s_cbranch_scc0?
2580 
2581  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2582  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2583  .addMBB(&LoopBB);
2584 
2585  return InsertPt->getIterator();
2586 }
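// Conceptual scalar model of the waterfall loop built above (illustrative
// only, plain C++ over a 64-lane EXEC mask; the names here are not LLVM APIs).
// Each iteration reads the index held by the first still-active lane
// (v_readfirstlane_b32), services every active lane holding that same index
// (v_cmp_eq_u32 + s_and_saveexec_b64), then removes those lanes from the mask
// (s_xor_b64) and loops while any lane remains (s_cbranch_execnz).
// __builtin_ctzll is a GCC/Clang builtin; <cstdint> is included above.
template <typename BodyFn>
static void waterfallLoopModel(uint64_t ExecMask, const uint32_t LaneIdx[64],
                               BodyFn Body) {
  while (ExecMask != 0) {
    unsigned FirstLane = __builtin_ctzll(ExecMask);
    uint32_t CurrentIdx = LaneIdx[FirstLane];

    uint64_t Matching = 0;
    for (unsigned L = 0; L != 64; ++L)
      if (((ExecMask >> L) & 1) && LaneIdx[L] == CurrentIdx)
        Matching |= uint64_t(1) << L;

    Body(CurrentIdx, Matching); // vector op runs with EXEC = Matching
    ExecMask &= ~Matching;      // done lanes drop out of the loop
  }
}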
2587 
2588 // This has slightly sub-optimal regalloc when the source vector is killed by
2589 // the read. The register allocator does not understand that the kill is
2590 // per-workitem, so the vector is kept alive for the whole loop and we end up
2591 // not re-using a subregister from it, using 1 more VGPR than necessary. This
2592 // extra VGPR was saved back when this was expanded after register allocation.
2593 static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
2594  MachineBasicBlock &MBB,
2595  MachineInstr &MI,
2596  unsigned InitResultReg,
2597  unsigned PhiReg,
2598  int Offset,
2599  bool UseGPRIdxMode) {
2600  MachineFunction *MF = MBB.getParent();
2602  const DebugLoc &DL = MI.getDebugLoc();
2604 
2605  unsigned DstReg = MI.getOperand(0).getReg();
2606  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2607  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2608 
2609  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
2610 
2611  // Save the EXEC mask
2612  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
2613  .addReg(AMDGPU::EXEC);
2614 
2615  // To insert the loop we need to split the block. Move everything after this
2616  // point to a new block, and insert a new empty block between the two.
2618  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
2619  MachineFunction::iterator MBBI(MBB);
2620  ++MBBI;
2621 
2622  MF->insert(MBBI, LoopBB);
2623  MF->insert(MBBI, RemainderBB);
2624 
2625  LoopBB->addSuccessor(LoopBB);
2626  LoopBB->addSuccessor(RemainderBB);
2627 
2628  // Move the rest of the block into a new block.
2629  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
2630  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
2631 
2632  MBB.addSuccessor(LoopBB);
2633 
2634  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2635 
2636  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
2637  InitResultReg, DstReg, PhiReg, TmpExec,
2638  Offset, UseGPRIdxMode);
2639 
2640  MachineBasicBlock::iterator First = RemainderBB->begin();
2641  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
2642  .addReg(SaveExec);
2643 
2644  return InsPt;
2645 }
2646 
2647 // Returns subreg index, offset
2648 static std::pair<unsigned, int>
2649 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
2650  const TargetRegisterClass *SuperRC,
2651  unsigned VecReg,
2652  int Offset) {
2653  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
2654 
2655  // Skip out of bounds offsets, or else we would end up using an undefined
2656  // register.
2657  if (Offset >= NumElts || Offset < 0)
2658  return std::make_pair(AMDGPU::sub0, Offset);
2659 
2660  return std::make_pair(AMDGPU::sub0 + Offset, 0);
2661 }
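// Hedged usage sketch for the helper above (illustrative only; the values are
// made up): for a 128-bit vector class (4 x 32-bit lanes) an in-range constant
// offset is folded into the subregister index, while an out-of-range offset is
// left to be applied at run time from sub0.
static void computeIndirectRegAndOffsetExamples(const SIRegisterInfo &TRI,
                                                unsigned VecReg) {
  const TargetRegisterClass *RC = &AMDGPU::VReg_128RegClass;
  std::pair<unsigned, int> InRange =
      computeIndirectRegAndOffset(TRI, RC, VecReg, /*Offset=*/2);
  assert(InRange.first == AMDGPU::sub0 + 2 && InRange.second == 0);

  std::pair<unsigned, int> OutOfRange =
      computeIndirectRegAndOffset(TRI, RC, VecReg, /*Offset=*/7);
  assert(OutOfRange.first == AMDGPU::sub0 && OutOfRange.second == 7);
  (void)InRange;
  (void)OutOfRange;
}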
2662 
2663 // Return true if the index is an SGPR and was set.
2664 static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
2665  MachineRegisterInfo &MRI,
2666  MachineInstr &MI,
2667  int Offset,
2668  bool UseGPRIdxMode,
2669  bool IsIndirectSrc) {
2670  MachineBasicBlock *MBB = MI.getParent();
2671  const DebugLoc &DL = MI.getDebugLoc();
2673 
2674  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2675  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
2676 
2677  assert(Idx->getReg() != AMDGPU::NoRegister);
2678 
2679  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
2680  return false;
2681 
2682  if (UseGPRIdxMode) {
2683  unsigned IdxMode = IsIndirectSrc ?
2685  if (Offset == 0) {
2686  MachineInstr *SetOn =
2687  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2688  .add(*Idx)
2689  .addImm(IdxMode);
2690 
2691  SetOn->getOperand(3).setIsUndef();
2692  } else {
2693  unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2694  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
2695  .add(*Idx)
2696  .addImm(Offset);
2697  MachineInstr *SetOn =
2698  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2699  .addReg(Tmp, RegState::Kill)
2700  .addImm(IdxMode);
2701 
2702  SetOn->getOperand(3).setIsUndef();
2703  }
2704 
2705  return true;
2706  }
2707 
2708  if (Offset == 0) {
2709  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2710  .add(*Idx);
2711  } else {
2712  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2713  .add(*Idx)
2714  .addImm(Offset);
2715  }
2716 
2717  return true;
2718 }
2719 
2720 // Control flow needs to be inserted if indexing with a VGPR.
2721 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
2722  MachineBasicBlock &MBB,
2723  const SISubtarget &ST) {
2724  const SIInstrInfo *TII = ST.getInstrInfo();
2725  const SIRegisterInfo &TRI = TII->getRegisterInfo();
2726  MachineFunction *MF = MBB.getParent();
2728 
2729  unsigned Dst = MI.getOperand(0).getReg();
2730  unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
2731  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
2732 
2733  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
2734 
2735  unsigned SubReg;
2736  std::tie(SubReg, Offset)
2737  = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
2738 
2739  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
2740 
2741  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
2743  const DebugLoc &DL = MI.getDebugLoc();
2744 
2745  if (UseGPRIdxMode) {
2746  // TODO: Look at the uses to avoid the copy. This may require rescheduling
2747  // to avoid interfering with other uses, so probably requires a new
2748  // optimization pass.
2749  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
2750  .addReg(SrcReg, RegState::Undef, SubReg)
2751  .addReg(SrcReg, RegState::Implicit)
2752  .addReg(AMDGPU::M0, RegState::Implicit);
2753  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2754  } else {
2755  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
2756  .addReg(SrcReg, RegState::Undef, SubReg)
2757  .addReg(SrcReg, RegState::Implicit);
2758  }
2759 
2760  MI.eraseFromParent();
2761 
2762  return &MBB;
2763  }
2764 
2765  const DebugLoc &DL = MI.getDebugLoc();
2767 
2768  unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2769  unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2770 
2771  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
2772 
2773  if (UseGPRIdxMode) {
2774  MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2775  .addImm(0) // Reset inside loop.
2777  SetOn->getOperand(3).setIsUndef();
2778 
2779  // Disable again after the loop.
2780  BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2781  }
2782 
2783  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
2784  MachineBasicBlock *LoopBB = InsPt->getParent();
2785 
2786  if (UseGPRIdxMode) {
2787  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
2788  .addReg(SrcReg, RegState::Undef, SubReg)
2789  .addReg(SrcReg, RegState::Implicit)
2790  .addReg(AMDGPU::M0, RegState::Implicit);
2791  } else {
2792  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
2793  .addReg(SrcReg, RegState::Undef, SubReg)
2794  .addReg(SrcReg, RegState::Implicit);
2795  }
2796 
2797  MI.eraseFromParent();
2798 
2799  return LoopBB;
2800 }
2801 
2802 static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
2803  const TargetRegisterClass *VecRC) {
2804  switch (TRI.getRegSizeInBits(*VecRC)) {
2805  case 32: // 4 bytes
2806  return AMDGPU::V_MOVRELD_B32_V1;
2807  case 64: // 8 bytes
2808  return AMDGPU::V_MOVRELD_B32_V2;
2809  case 128: // 16 bytes
2810  return AMDGPU::V_MOVRELD_B32_V4;
2811  case 256: // 32 bytes
2812  return AMDGPU::V_MOVRELD_B32_V8;
2813  case 512: // 64 bytes
2814  return AMDGPU::V_MOVRELD_B32_V16;
2815  default:
2816  llvm_unreachable("unsupported size for MOVRELD pseudos");
2817  }
2818 }
2819 
2820 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
2821  MachineBasicBlock &MBB,
2822  const SISubtarget &ST) {
2823  const SIInstrInfo *TII = ST.getInstrInfo();
2824  const SIRegisterInfo &TRI = TII->getRegisterInfo();
2825  MachineFunction *MF = MBB.getParent();
2827 
2828  unsigned Dst = MI.getOperand(0).getReg();
2829  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
2830  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2831  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
2832  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
2833  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
2834 
2835  // This can be an immediate, but will be folded later.
2836  assert(Val->getReg());
2837 
2838  unsigned SubReg;
2839  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
2840  SrcVec->getReg(),
2841  Offset);
2842  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
2843 
2844  if (Idx->getReg() == AMDGPU::NoRegister) {
2846  const DebugLoc &DL = MI.getDebugLoc();
2847 
2848  assert(Offset == 0);
2849 
2850  BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
2851  .add(*SrcVec)
2852  .add(*Val)
2853  .addImm(SubReg);
2854 
2855  MI.eraseFromParent();
2856  return &MBB;
2857  }
2858 
2859  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
2861  const DebugLoc &DL = MI.getDebugLoc();
2862 
2863  if (UseGPRIdxMode) {
2864  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
2865  .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
2866  .add(*Val)
2867  .addReg(Dst, RegState::ImplicitDefine)
2868  .addReg(SrcVec->getReg(), RegState::Implicit)
2869  .addReg(AMDGPU::M0, RegState::Implicit);
2870 
2871  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2872  } else {
2873  const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
2874 
2875  BuildMI(MBB, I, DL, MovRelDesc)
2876  .addReg(Dst, RegState::Define)
2877  .addReg(SrcVec->getReg())
2878  .add(*Val)
2879  .addImm(SubReg - AMDGPU::sub0);
2880  }
2881 
2882  MI.eraseFromParent();
2883  return &MBB;
2884  }
2885 
2886  if (Val->isReg())
2887  MRI.clearKillFlags(Val->getReg());
2888 
2889  const DebugLoc &DL = MI.getDebugLoc();
2890 
2891  if (UseGPRIdxMode) {
2893 
2894  MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2895  .addImm(0) // Reset inside loop.
2897  SetOn->getOperand(3).setIsUndef();
2898 
2899  // Disable again after the loop.
2900  BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2901  }
2902 
2903  unsigned PhiReg = MRI.createVirtualRegister(VecRC);
2904 
2905  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
2906  Offset, UseGPRIdxMode);
2907  MachineBasicBlock *LoopBB = InsPt->getParent();
2908 
2909  if (UseGPRIdxMode) {
2910  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
2911  .addReg(PhiReg, RegState::Undef, SubReg) // vdst
2912  .add(*Val) // src0
2914  .addReg(PhiReg, RegState::Implicit)
2915  .addReg(AMDGPU::M0, RegState::Implicit);
2916  } else {
2917  const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
2918 
2919  BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
2920  .addReg(Dst, RegState::Define)
2921  .addReg(PhiReg)
2922  .add(*Val)
2923  .addImm(SubReg - AMDGPU::sub0);
2924  }
2925 
2926  MI.eraseFromParent();
2927 
2928  return LoopBB;
2929 }
2930 
2931 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
2932  MachineInstr &MI, MachineBasicBlock *BB) const {
2933 
2934  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2935  MachineFunction *MF = BB->getParent();
2936  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2937 
2938  if (TII->isMIMG(MI)) {
2939  if (!MI.memoperands_empty())
2940  return BB;
2941  // Add a memoperand for mimg instructions so that they aren't assumed to
2942  // be ordered memory instructions.
2943 
2944  MachinePointerInfo PtrInfo(MFI->getImagePSV());
2946  if (MI.mayStore())
2947  Flags |= MachineMemOperand::MOStore;
2948 
2949  if (MI.mayLoad())
2950  Flags |= MachineMemOperand::MOLoad;
2951 
2952  auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
2953  MI.addMemOperand(*MF, MMO);
2954  return BB;
2955  }
2956 
2957  switch (MI.getOpcode()) {
2958  case AMDGPU::S_ADD_U64_PSEUDO:
2959  case AMDGPU::S_SUB_U64_PSEUDO: {
2961  const DebugLoc &DL = MI.getDebugLoc();
2962 
2963  MachineOperand &Dest = MI.getOperand(0);
2964  MachineOperand &Src0 = MI.getOperand(1);
2965  MachineOperand &Src1 = MI.getOperand(2);
2966 
2967  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2968  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2969 
2970  MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
2971  Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
2972  &AMDGPU::SReg_32_XM0RegClass);
2973  MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
2974  Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
2975  &AMDGPU::SReg_32_XM0RegClass);
2976 
2977  MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
2978  Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
2979  &AMDGPU::SReg_32_XM0RegClass);
2980  MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
2981  Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
2982  &AMDGPU::SReg_32_XM0RegClass);
2983 
2984  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
2985 
2986  unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
2987  unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
2988  BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
2989  .add(Src0Sub0)
2990  .add(Src1Sub0);
2991  BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
2992  .add(Src0Sub1)
2993  .add(Src1Sub1);
2994  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
2995  .addReg(DestSub0)
2996  .addImm(AMDGPU::sub0)
2997  .addReg(DestSub1)
2998  .addImm(AMDGPU::sub1);
2999  MI.eraseFromParent();
3000  return BB;
3001  }
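// Scalar model of the expansion above (illustrative only): the 64-bit add or
// subtract is split into a low 32-bit op that produces a carry/borrow in SCC
// and a high 32-bit op that consumes it, mirroring S_ADD_U32/S_ADDC_U32 and
// S_SUB_U32/S_SUBB_U32 plus the final REG_SEQUENCE of sub0/sub1.
static uint64_t addSub64ViaCarryModel(uint64_t A, uint64_t B, bool IsAdd) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
  uint32_t Lo, Hi;
  if (IsAdd) {
    Lo = ALo + BLo;
    uint32_t Carry = Lo < ALo;   // SCC from S_ADD_U32
    Hi = AHi + BHi + Carry;      // S_ADDC_U32
  } else {
    Lo = ALo - BLo;
    uint32_t Borrow = ALo < BLo; // SCC from S_SUB_U32
    Hi = AHi - BHi - Borrow;     // S_SUBB_U32
  }
  return (uint64_t(Hi) << 32) | Lo;
}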
3002  case AMDGPU::SI_INIT_M0: {
3003  BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
3004  TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3005  .add(MI.getOperand(0));
3006  MI.eraseFromParent();
3007  return BB;
3008  }
3009  case AMDGPU::SI_INIT_EXEC:
3010  // This should be before all vector instructions.
3011  BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3012  AMDGPU::EXEC)
3013  .addImm(MI.getOperand(0).getImm());
3014  MI.eraseFromParent();
3015  return BB;
3016 
3017  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3018  // Extract the thread count from an SGPR input and set EXEC accordingly.
3019  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3020  //
3021  // S_BFE_U32 count, input, {shift, 7}
3022  // S_BFM_B64 exec, count, 0
3023  // S_CMP_EQ_U32 count, 64
3024  // S_CMOV_B64 exec, -1
3025  MachineInstr *FirstMI = &*BB->begin();
3027  unsigned InputReg = MI.getOperand(0).getReg();
3028  unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3029  bool Found = false;
3030 
3031  // Move the COPY of the input reg to the beginning, so that we can use it.
3032  for (auto I = BB->begin(); I != &MI; I++) {
3033  if (I->getOpcode() != TargetOpcode::COPY ||
3034  I->getOperand(0).getReg() != InputReg)
3035  continue;
3036 
3037  if (I == FirstMI) {
3038  FirstMI = &*++BB->begin();
3039  } else {
3040  I->removeFromParent();
3041  BB->insert(FirstMI, &*I);
3042  }
3043  Found = true;
3044  break;
3045  }
3046  assert(Found);
3047  (void)Found;
3048 
3049  // This should be before all vector instructions.
3050  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3051  .addReg(InputReg)
3052  .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3053  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3054  AMDGPU::EXEC)
3055  .addReg(CountReg)
3056  .addImm(0);
3057  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3058  .addReg(CountReg, RegState::Kill)
3059  .addImm(64);
3060  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3061  AMDGPU::EXEC)
3062  .addImm(-1);
3063  MI.eraseFromParent();
3064  return BB;
3065  }
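// Scalar model of the EXEC initialization above (illustrative only): the
// bitfield-extracted thread count selects that many low bits, and the
// count == 64 case is handled separately because S_BFM_B64 cannot produce a
// full 64-bit mask (it cannot shift by 64).
static uint64_t execMaskFromThreadCountModel(unsigned Count) {
  assert(Count <= 64);
  if (Count == 64)                    // S_CMP_EQ_U32 count, 64 + S_CMOV_B64 exec, -1
    return ~uint64_t(0);
  return (uint64_t(1) << Count) - 1;  // S_BFM_B64 exec, count, 0
}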
3066 
3067  case AMDGPU::GET_GROUPSTATICSIZE: {
3068  DebugLoc DL = MI.getDebugLoc();
3069  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
3070  .add(MI.getOperand(0))
3071  .addImm(MFI->getLDSSize());
3072  MI.eraseFromParent();
3073  return BB;
3074  }
3075  case AMDGPU::SI_INDIRECT_SRC_V1:
3076  case AMDGPU::SI_INDIRECT_SRC_V2:
3077  case AMDGPU::SI_INDIRECT_SRC_V4:
3078  case AMDGPU::SI_INDIRECT_SRC_V8:
3079  case AMDGPU::SI_INDIRECT_SRC_V16:
3080  return emitIndirectSrc(MI, *BB, *getSubtarget());
3081  case AMDGPU::SI_INDIRECT_DST_V1:
3082  case AMDGPU::SI_INDIRECT_DST_V2:
3083  case AMDGPU::SI_INDIRECT_DST_V4:
3084  case AMDGPU::SI_INDIRECT_DST_V8:
3085  case AMDGPU::SI_INDIRECT_DST_V16:
3086  return emitIndirectDst(MI, *BB, *getSubtarget());
3087  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3088  case AMDGPU::SI_KILL_I1_PSEUDO:
3089  return splitKillBlock(MI, BB);
3090  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3092 
3093  unsigned Dst = MI.getOperand(0).getReg();
3094  unsigned Src0 = MI.getOperand(1).getReg();
3095  unsigned Src1 = MI.getOperand(2).getReg();
3096  const DebugLoc &DL = MI.getDebugLoc();
3097  unsigned SrcCond = MI.getOperand(3).getReg();
3098 
3099  unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3100  unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3101  unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3102 
3103  BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3104  .addReg(SrcCond);
3105  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3106  .addReg(Src0, 0, AMDGPU::sub0)
3107  .addReg(Src1, 0, AMDGPU::sub0)
3108  .addReg(SrcCondCopy);
3109  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3110  .addReg(Src0, 0, AMDGPU::sub1)
3111  .addReg(Src1, 0, AMDGPU::sub1)
3112  .addReg(SrcCondCopy);
3113 
3114  BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3115  .addReg(DstLo)
3116  .addImm(AMDGPU::sub0)
3117  .addReg(DstHi)
3118  .addImm(AMDGPU::sub1);
3119  MI.eraseFromParent();
3120  return BB;
3121  }
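// Scalar model of the pseudo expansion above (illustrative only): the 64-bit
// select is done as two independent 32-bit selects on the low and high halves
// and then recombined, mirroring the two V_CNDMASK_B32_e64 instructions plus
// the REG_SEQUENCE of sub0/sub1.
static uint64_t select64ViaHalvesModel(bool Cond, uint64_t Src0, uint64_t Src1) {
  uint32_t Lo = Cond ? uint32_t(Src1) : uint32_t(Src0);
  uint32_t Hi = Cond ? uint32_t(Src1 >> 32) : uint32_t(Src0 >> 32);
  return (uint64_t(Hi) << 32) | Lo;
}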
3122  case AMDGPU::SI_BR_UNDEF: {
3123  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3124  const DebugLoc &DL = MI.getDebugLoc();
3125  MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3126  .add(MI.getOperand(0));
3127  Br->getOperand(1).setIsUndef(true); // read undef SCC
3128  MI.eraseFromParent();
3129  return BB;
3130  }
3131  case AMDGPU::ADJCALLSTACKUP:
3132  case AMDGPU::ADJCALLSTACKDOWN: {
3134  MachineInstrBuilder MIB(*MF, &MI);
3135  MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
3136  .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
3137  return BB;
3138  }
3139  case AMDGPU::SI_CALL_ISEL:
3140  case AMDGPU::SI_TCRETURN_ISEL: {
3141  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3142  const DebugLoc &DL = MI.getDebugLoc();
3143  unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
3144 
3146  unsigned GlobalAddrReg = MI.getOperand(0).getReg();
3147  MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
3148  assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
3149 
3150  const GlobalValue *G = PCRel->getOperand(1).getGlobal();
3151 
3152  MachineInstrBuilder MIB;
3153  if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
3154  MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
3155  .add(MI.getOperand(0))
3156  .addGlobalAddress(G);
3157  } else {
3158  MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
3159  .add(MI.getOperand(0))
3160  .addGlobalAddress(G);
3161 
3162  // There is an additional imm operand for tcreturn, but it should be in the
3163  // right place already.
3164  }
3165 
3166  for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3167  MIB.add(MI.getOperand(I));
3168 
3170  MI.eraseFromParent();
3171  return BB;
3172  }
3173  default:
3174  return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
3175  }
3176 }
3177 
3178 bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
3179  return isTypeLegal(VT.getScalarType());
3180 }
3181 
3182 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
3183  // This currently forces unfolding various combinations of fsub into fma with
3184  // free fneg'd operands. As long as we have fast FMA (controlled by
3185  // isFMAFasterThanFMulAndFAdd), we should perform these.
3186 
3187  // When fma is quarter rate, for f64 where add / sub are at best half rate,
3188  // most of these combines appear to be cycle neutral but save on instruction
3189  // count / code size.
3190  return true;
3191 }
3192 
3193 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
3194  EVT VT) const {
3195  if (!VT.isVector()) {
3196  return MVT::i1;
3197  }
3198  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
3199 }
3200 
3201 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
3202  // TODO: Should i16 be used always if legal? For now it would force VALU
3203  // shifts.
3204  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
3205 }
3206 
3207 // Answering this is somewhat tricky and depends on the specific device, since
3208 // different devices have different rates for fma and for all f64 operations.
3209 //
3210 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3211 // regardless of which device (although the number of cycles differs between
3212 // devices), so it is always profitable for f64.
3213 //
3214 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3215 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
3216 // which we can always do even without fused FP ops since it returns the same
3217 // result as the separate operations and since it is always full
3218 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3219 // however does not support denormals, so we do report fma as faster if we have
3220 // a fast fma device and require denormals.
3221 //
3222 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3223  VT = VT.getScalarType();
3224 
3225  switch (VT.getSimpleVT().SimpleTy) {
3226  case MVT::f32:
3227  // This is as fast on some subtargets. However, we always have full rate f32
3228  // mad available which returns the same result as the separate operations
3229  // which we should prefer over fma. We can't use this if we want to support
3230  // denormals, so only report this in these cases.
3231  return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
3232  case MVT::f64:
3233  return true;
3234  case MVT::f16:
3235  return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
3236  default:
3237  break;
3238  }
3239 
3240  return false;
3241 }
3242 
3243 //===----------------------------------------------------------------------===//
3244 // Custom DAG Lowering Operations
3245 //===----------------------------------------------------------------------===//
3246 
3247 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3248  switch (Op.getOpcode()) {
3249  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
3250  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
3251  case ISD::LOAD: {
3252  SDValue Result = LowerLOAD(Op, DAG);
3253  assert((!Result.getNode() ||
3254  Result.getNode()->getNumValues() == 2) &&
3255  "Load should return a value and a chain");
3256  return Result;
3257  }
3258 
3259  case ISD::FSIN:
3260  case ISD::FCOS:
3261  return LowerTrig(Op, DAG);
3262  case ISD::SELECT: return LowerSELECT(Op, DAG);
3263  case ISD::FDIV: return LowerFDIV(Op, DAG);
3264  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
3265  case ISD::STORE: return LowerSTORE(Op, DAG);
3266  case ISD::GlobalAddress: {
3267  MachineFunction &MF = DAG.getMachineFunction();
3269  return LowerGlobalAddress(MFI, Op, DAG);
3270  }
3271  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3272  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
3273  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
3274  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
3276  return lowerINSERT_VECTOR_ELT(Op, DAG);
3278  return lowerEXTRACT_VECTOR_ELT(Op, DAG);
3279  case ISD::FP_ROUND:
3280  return lowerFP_ROUND(Op, DAG);
3281  case ISD::TRAP:
3282  case ISD::DEBUGTRAP:
3283  return lowerTRAP(Op, DAG);
3284  }
3285  return SDValue();
3286 }
3287 
3288 void SITargetLowering::ReplaceNodeResults(SDNode *N,
3289  SmallVectorImpl<SDValue> &Results,
3290  SelectionDAG &DAG) const {
3291  switch (N->getOpcode()) {
3292  case ISD::INSERT_VECTOR_ELT: {
3293  if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3294  Results.push_back(Res);
3295  return;
3296  }
3297  case ISD::EXTRACT_VECTOR_ELT: {
3298  if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3299  Results.push_back(Res);
3300  return;
3301  }
3302  case ISD::INTRINSIC_WO_CHAIN: {
3303  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3304  if (IID == Intrinsic::amdgcn_cvt_pkrtz) {
3305  SDValue Src0 = N->getOperand(1);
3306  SDValue Src1 = N->getOperand(2);
3307  SDLoc SL(N);
3308  SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3309  Src0, Src1);
3310  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3311  return;
3312  }
3313  break;
3314  }
3315  case ISD::SELECT: {
3316  SDLoc SL(N);
3317  EVT VT = N->getValueType(0);
3318  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3319  SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3320  SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3321 
3322  EVT SelectVT = NewVT;
3323  if (NewVT.bitsLT(MVT::i32)) {
3324  LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3325  RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3326  SelectVT = MVT::i32;
3327  }
3328 
3329  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3330  N->getOperand(0), LHS, RHS);
3331 
3332  if (NewVT != SelectVT)
3333  NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3334  Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3335  return;
3336  }
3337  default:
3338  break;
3339  }
3340 }
3341 
3342 /// \brief Helper function for LowerBRCOND
3343 static SDNode *findUser(SDValue Value, unsigned Opcode) {
3344 
3345  SDNode *Parent = Value.getNode();
3346  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3347  I != E; ++I) {
3348 
3349  if (I.getUse().get() != Value)
3350  continue;
3351 
3352  if (I->getOpcode() == Opcode)
3353  return *I;
3354  }
3355  return nullptr;
3356 }
3357 
3358 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
3359  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
3360  switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
3361  case Intrinsic::amdgcn_if:
3362  return AMDGPUISD::IF;
3363  case Intrinsic::amdgcn_else:
3364  return AMDGPUISD::ELSE;
3365  case Intrinsic::amdgcn_loop:
3366  return AMDGPUISD::LOOP;
3367  case Intrinsic::amdgcn_end_cf:
3368  llvm_unreachable("should not occur");
3369  default:
3370  return 0;
3371  }
3372  }
3373 
3374  // break, if_break, else_break are all only used as inputs to loop, not
3375  // directly as branch conditions.
3376  return 0;
3377 }
3378 
3379 void SITargetLowering::createDebuggerPrologueStackObjects(
3380  MachineFunction &MF) const {
3381  // Create stack objects that are used for emitting debugger prologue.
3382  //
3383  // Debugger prologue writes work group IDs and work item IDs to scratch memory
3384  // at fixed location in the following format:
3385  // offset 0: work group ID x
3386  // offset 4: work group ID y
3387  // offset 8: work group ID z
3388  // offset 16: work item ID x
3389  // offset 20: work item ID y
3390  // offset 24: work item ID z
3392  int ObjectIdx = 0;
3393 
3394  // For each dimension:
3395  for (unsigned i = 0; i < 3; ++i) {
3396  // Create fixed stack object for work group ID.
3397  ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
3398  Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
3399  // Create fixed stack object for work item ID.
3400  ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
3401  Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
3402  }
3403 }
3404 
3405 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
3406  const Triple &TT = getTargetMachine().getTargetTriple();
3407  return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
3409 }
3410 
3411 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
3412  return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
3414  !shouldEmitFixup(GV) &&
3416 }
3417 
3418 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
3419  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
3420 }
3421 
3422 /// This transforms the control flow intrinsics to get the branch destination as
3423 /// the last parameter; it also switches the branch target with BR if the need arises.
3424 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
3425  SelectionDAG &DAG) const {
3426  SDLoc DL(BRCOND);
3427 
3428  SDNode *Intr = BRCOND.getOperand(1).getNode();
3429  SDValue Target = BRCOND.getOperand(2);
3430  SDNode *BR = nullptr;
3431  SDNode *SetCC = nullptr;
3432 
3433  if (Intr->getOpcode() == ISD::SETCC) {
3434  // As long as we negate the condition everything is fine
3435  SetCC = Intr;
3436  Intr = SetCC->getOperand(0).getNode();
3437 
3438  } else {
3439  // Get the target from BR if we don't negate the condition
3440  BR = findUser(BRCOND, ISD::BR);
3441  Target = BR->getOperand(1);
3442  }
3443 
3444  // FIXME: This changes the types of the intrinsics instead of introducing new
3445  // nodes with the correct types.
3446  // e.g. llvm.amdgcn.loop
3447 
3448  // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
3449  // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
3450 
3451  unsigned CFNode = isCFIntrinsic(Intr);
3452  if (CFNode == 0) {
3453  // This is a uniform branch so we don't need to legalize.
3454  return BRCOND;
3455  }
3456 
3457  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
3458  Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
3459 
3460  assert(!SetCC ||
3461  (SetCC->getConstantOperandVal(1) == 1 &&
3462  cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
3463  ISD::SETNE));
3464 
3465  // operands of the new intrinsic call
3466  SmallVector<SDValue, 8> Ops;
3467  if (HaveChain)
3468  Ops.push_back(BRCOND.getOperand(0));
3469 
3470  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
3471  Ops.push_back(Target);
3472 
3473  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
3474 
3475  // build the new intrinsic call
3476  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
3477 
3478  if (!HaveChain) {
3479  SDValue Ops[] = {
3480  SDValue(Result, 0),
3481  BRCOND.getOperand(0)
3482  };
3483 
3484  Result = DAG.getMergeValues(Ops, DL).getNode();
3485  }
3486 
3487  if (BR) {
3488  // Give the branch instruction our target
3489  SDValue Ops[] = {
3490  BR->getOperand(0),
3491  BRCOND.getOperand(2)
3492  };
3493  SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
3494  DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
3495  BR = NewBR.getNode();
3496  }
3497 
3498  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
3499 
3500  // Copy the intrinsic results to registers
3501  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
3502  SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
3503  if (!CopyToReg)
3504  continue;
3505 
3506  Chain = DAG.getCopyToReg(
3507  Chain, DL,
3508  CopyToReg->getOperand(1),
3509  SDValue(Result, i - 1),
3510  SDValue());
3511 
3512  DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
3513  }
3514 
3515  // Remove the old intrinsic from the chain
3516  DAG.ReplaceAllUsesOfValueWith(
3517  SDValue(Intr, Intr->getNumValues() - 1),
3518  Intr->getOperand(0));
3519 
3520  return Chain;
3521 }
3522 
3523 SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
3524  SDValue Op,
3525  const SDLoc &DL,
3526  EVT VT) const {
3527  return Op.getValueType().bitsLE(VT) ?
3528  DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
3529  DAG.getNode(ISD::FTRUNC, DL, VT, Op);
3530 }
3531 
3532 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
3533  assert(Op.getValueType() == MVT::f16 &&
3534  "Do not know how to custom lower FP_ROUND for non-f16 type");
3535 
3536  SDValue Src = Op.getOperand(0);
3537  EVT SrcVT = Src.getValueType();
3538  if (SrcVT != MVT::f64)
3539  return Op;
3540 
3541  SDLoc DL(Op);
3542 
3543  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
3544  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
3545  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
3546 }
3547 
3548 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
3549  SDLoc SL(Op);
3550  MachineFunction &MF = DAG.getMachineFunction();
3551  SDValue Chain = Op.getOperand(0);
3552 
3553  unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ?
3555 
3559  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
3560  assert(UserSGPR != AMDGPU::NoRegister);
3561 
3562  SDValue QueuePtr = CreateLiveInRegister(
3563  DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
3564 
3565  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
3566 
3567  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
3568  QueuePtr, SDValue());
3569 
3570  SDValue Ops[] = {
3571  ToReg,
3572  DAG.getTargetConstant(TrapID, SL, MVT::i16),
3573  SGPR01,
3574  ToReg.getValue(1)
3575  };
3576 
3577  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
3578  }
3579 
3580  switch (TrapID) {
3582  return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
3585  "debugtrap handler not supported",
3586  Op.getDebugLoc(),
3587  DS_Warning);
3588  LLVMContext &Ctx = MF.getFunction()->getContext();
3589  Ctx.diagnose(NoTrap);
3590  return Chain;
3591  }
3592  default:
3593  llvm_unreachable("unsupported trap handler type!");
3594  }
3595 
3596  return Chain;
3597 }
3598 
3599 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
3600  SelectionDAG &DAG) const {
3601  // FIXME: Use inline constants (src_{shared, private}_base) instead.
3602  if (Subtarget->hasApertureRegs()) {
3603  unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ?
3606  unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ?
3609  unsigned Encoding =
3611  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
3612  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
3613 
3614  SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
3615  SDValue ApertureReg = SDValue(
3616  DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
3617  SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
3618  return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
3619  }
3620 
3621  MachineFunction &MF = DAG.getMachineFunction();
3623  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
3624  assert(UserSGPR != AMDGPU::NoRegister);
3625 
3626  SDValue QueuePtr = CreateLiveInRegister(
3627  DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
3628 
3629  // Offset into amd_queue_t for group_segment_aperture_base_hi /
3630  // private_segment_aperture_base_hi.
3631  uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;
3632 
3633  SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, QueuePtr,
3634  DAG.getConstant(StructOffset, DL, MVT::i64));
3635 
3636  // TODO: Use custom target PseudoSourceValue.
3637  // TODO: We should use the value from the IR intrinsic call, but it might not
3638  // be available and how do we get it?
3641 
3642  MachinePointerInfo PtrInfo(V, StructOffset);
3643  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
3644  MinAlign(64, StructOffset),
3645  MachineMemOperand::MODereferenceable |
3646  MachineMemOperand::MOInvariant);
3647 }
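// Illustrative scalar model of the aperture-register fast path above (not part
// of this file; the shift parameters stand in for the AMDGPU::Hwreg field
// positions rather than asserting their exact values). The S_GETREG_B32
// immediate packs {register id, field offset, field width - 1}, and the read
// field is shifted left by its width to reconstruct the 32-bit aperture base,
// matching the ISD::SHL by WidthM1 + 1 above.
static uint32_t hwregImmModel(uint32_t Id, uint32_t Offset, uint32_t WidthM1,
                              uint32_t IdShift, uint32_t OffsetShift,
                              uint32_t WidthM1Shift) {
  return (Id << IdShift) | (Offset << OffsetShift) | (WidthM1 << WidthM1Shift);
}

static uint32_t apertureBaseModel(uint32_t GetregField, uint32_t WidthM1) {
  return GetregField << (WidthM1 + 1);
}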
3648 
3649 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
3650  SelectionDAG &DAG) const {
3651  SDLoc SL(Op);
3652  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
3653 
3654  SDValue Src = ASC->getOperand(0);
3655  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
3656 
3657  const AMDGPUTargetMachine &TM =
3658  static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
3659 
3660  // flat -> local/private
3661  if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
3662  unsigned DestAS = ASC->getDestAddressSpace();
3663 
3664  if (DestAS == AMDGPUASI.LOCAL_ADDRESS ||
3665  DestAS == AMDGPUASI.PRIVATE_ADDRESS) {
3666  unsigned NullVal = TM.getNullPointerValue(DestAS);
3667  SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
3668  SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
3669  SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
3670 
3671  return DAG.getNode(ISD::SELECT, SL, MVT::i32,
3672  NonNull, Ptr, SegmentNullPtr);
3673  }
3674  }
3675 
3676  // local/private -> flat
3677  if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
3678  unsigned SrcAS = ASC->getSrcAddressSpace();
3679 
3680  if (SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
3681  SrcAS == AMDGPUASI.PRIVATE_ADDRESS) {
3682  unsigned NullVal = TM.getNullPointerValue(SrcAS);
3683  SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
3684 
3685  SDValue NonNull
3686  = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
3687 
3688  SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
3689  SDValue CvtPtr
3690  = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
3691 
3692  return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
3693  DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
3694  FlatNullPtr);
3695  }
3696  }
3697 
3698  // global <-> flat are no-ops and never emitted.
3699 
3700  const MachineFunction &MF = DAG.getMachineFunction();
3701  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
3702  *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
3703  DAG.getContext()->diagnose(InvalidAddrSpaceCast);
3704 
3705  return DAG.getUNDEF(ASC->getValueType(0));
3706 }
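// Plain-integer sketch of the two cast directions handled above (illustrative
// only): 'Aperture' is the 32-bit segment aperture base produced by
// getSegmentAperture, and 'SegmentNull'/'FlatNull' are the target's null
// pointer values for the respective address spaces.
static uint32_t flatToSegmentModel(uint64_t Flat, uint64_t FlatNull,
                                   uint32_t SegmentNull) {
  // Non-null flat pointers are simply truncated; null maps to the segment null.
  return Flat != FlatNull ? uint32_t(Flat) : SegmentNull;
}

static uint64_t segmentToFlatModel(uint32_t Seg, uint32_t SegmentNull,
                                   uint32_t Aperture, uint64_t FlatNull) {
  // Non-null segment pointers get the aperture base as their high 32 bits
  // (the BUILD_VECTOR + BITCAST above); null maps to the flat null value.
  return Seg != SegmentNull ? ((uint64_t(Aperture) << 32) | Seg) : FlatNull;
}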
3707 
3708 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
3709  SelectionDAG &DAG) const {
3710  SDValue Idx = Op.getOperand(2);
3711  if (isa<ConstantSDNode>(Idx))
3712  return SDValue();
3713 
3714  // Avoid stack access for dynamic indexing.
3715  SDLoc SL(Op);
3716  SDValue Vec = Op.getOperand(0);
3717  SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
3718 
3719  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
3720  SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);
3721 
3722  // Convert vector index to bit-index.
3723  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
3724  DAG.getConstant(16, SL, MVT::i32));
3725 
3726  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3727 
3728  SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
3729  DAG.getConstant(0xffff, SL, MVT::i32),
3730  ScaledIdx);
3731 
3732  SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal);
3733  SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
3734  DAG.getNOT(SL, BFM, MVT::i32), BCVec);
3735 
3736  SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
3737  return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
3738 }
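// Illustrative: IR such as
//   %r = insertelement <2 x i16> %vec, i16 %val, i32 %idx
// with a non-constant %idx takes the path above: the index is turned into a
// 16-bit-wide bitfield mask (the v_bfm/v_bfi pattern noted in the comment) and
// the value is merged into the i32 bitcast of the vector with AND/OR nodes,
// avoiding a stack temporary; constant indices return early and use the
// default expansion instead.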
3739 
3740 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
3741  SelectionDAG &DAG) const {
3742  SDLoc SL(Op);
3743 
3744  EVT ResultVT = Op.getValueType();
3745  SDValue Vec = Op.getOperand(0);
3746  SDValue Idx = Op.getOperand(1);
3747 
3748  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
3749 
3750  // Make sure we do any optimizations that will make it easier to fold
3751  // source modifiers before obscuring it with bit operations.
3752 
3753  // XXX - Why doesn't this get called when vector_shuffle is expanded?
3754  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
3755  return Combined;
3756 
3757  if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
3758  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3759 
3760  if (CIdx->getZExtValue() == 1) {
3761  Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
3762  DAG.getConstant(16, SL, MVT::i32));
3763  } else {
3764  assert(CIdx->getZExtValue() == 0);
3765  }
3766 
3767  if (ResultVT.bitsLT(MVT::i32))
3768  Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
3769  return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
3770  }
3771 
3772  SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32);
3773 
3774  // Convert vector index to bit-index.
3775  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen);
3776 
3777  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3778  SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);
3779 
3780  SDValue Result = Elt;
3781  if (ResultVT.bitsLT(MVT::i32))
3782  Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
3783 
3784  return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
3785 }
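// Illustrative: a constant-index extract such as
//   %e = extractelement <2 x i16> %v, i32 1
// becomes (trunc i16 (srl (bitcast %v to i32), 16)), i.e. one shift and a
// truncate with no memory access; index 0 skips the shift entirely.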
3786 
3787 bool
3788 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
3789  // We can fold offsets for anything that doesn't require a GOT relocation.
3790  return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
3791  GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
3792  !shouldEmitGOTReloc(GA->getGlobal());
3793 }
3794 
3795 static SDValue
3796 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
3797  const SDLoc &DL, unsigned Offset, EVT PtrVT,
3798  unsigned GAFlags = SIInstrInfo::MO_NONE) {
3799  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
3800  // lowered to the following code sequence:
3801  //
3802  // For constant address space:
3803  // s_getpc_b64 s[0:1]
3804  // s_add_u32 s0, s0, $symbol
3805  // s_addc_u32 s1, s1, 0
3806  //
3807  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3808  // a fixup or relocation is emitted to replace $symbol with a literal
3809  // constant, which is a pc-relative offset from the encoding of the $symbol
3810  // operand to the global variable.
3811  //
3812  // For global address space:
3813  // s_getpc_b64 s[0:1]
3814  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3815  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3816  //
3817  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3818  // fixups or relocations are emitted to replace $symbol@*@lo and
3819  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3820  // which is a 64-bit pc-relative offset from the encoding of the $symbol
3821  // operand to the global variable.
3822  //
3823  // What we want here is an offset from the value returned by s_getpc
3824  // (which is the address of the s_add_u32 instruction) to the global
3825  // variable, but since the encoding of $symbol starts 4 bytes after the start
3826  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
3827  // small. This requires us to add 4 to the global variable offset in order to
3828  // compute the correct address.
3829  SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
3830  GAFlags);
3831  SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
3832  GAFlags == SIInstrInfo::MO_NONE ?
3833  GAFlags : GAFlags + 1);
3834  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
3835 }
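// Worked example of the +4 above (illustrative): the fixup resolves $symbol to
// (GV + Offset + 4) - (address of $symbol) = (GV + Offset + 4) - (PC + 4),
// so s_add_u32/s_addc_u32 compute PC + (GV + Offset - PC) = GV + Offset.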
3836 
3837 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
3838  SDValue Op,
3839  SelectionDAG &DAG) const {
3840  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
3841  const GlobalValue *GV = GSD->getGlobal();
3842 
3843  if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
3844  GSD->getAddressSpace() != AMDGPUASI.FLAT_ADDRESS &&
3845  // FIXME: It isn't correct to rely on the type of the pointer. This should
3846  // be removed when address space 0 is 64-bit.
3847  !GV->getType()->getElementType()->isFunctionTy())
3848  return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
3849 
3850  SDLoc DL(GSD);
3851  EVT PtrVT = Op.getValueType();
3852 
3853  if (shouldEmitFixup(GV))
3854  return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
3855  else if (shouldEmitPCReloc(GV))
3856  return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
3857  SIInstrInfo::MO_REL32);
3858 
3859  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
3860  SIInstrInfo::MO_GOTPCREL32);
3861 
3862  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
3863  PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
3864  const DataLayout &DataLayout = DAG.getDataLayout();
3865  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
3866  // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
3867  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
3868 
3869  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
3870  MachineMemOperand::MODereferenceable |
3871  MachineMemOperand::MOInvariant);
3872 }
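// Illustrative: an external global that needs a GOT entry ends up as roughly
//   s_getpc_b64     s[0:1]
//   s_add_u32       s0, s0, gv@gotpcrel32@lo+4
//   s_addc_u32      s1, s1, gv@gotpcrel32@hi+4
//   s_load_dwordx2  s[0:1], s[0:1], 0x0
// while shouldEmitFixup/shouldEmitPCReloc globals skip the final GOT load and
// use an absolute or @rel32 fixup directly.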
3873 
3875  const SDLoc &DL, SDValue V) const {
3876  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
3877  // the destination register.
3878  //
3879  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
3880  // so we will end up with redundant moves to m0.
3881  //
3882  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
3883 
3884  // A Null SDValue creates a glue result.
3885  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
3886  V, Chain);
3887  return SDValue(M0, 0);
3888 }
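// Typical use (see the interp intrinsics below): the attribute/parameter word
// is copied into m0 through the SI_INIT_M0 pseudo, and the glue result is
// threaded into the v_interp_* node so the m0 write cannot be separated from
// its user.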
3889 
3890 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
3891  SDValue Op,
3892  MVT VT,
3893  unsigned Offset) const {
3894  SDLoc SL(Op);
3895  SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
3896  DAG.getEntryNode(), Offset, false);
3897  // The local size values will have the hi 16-bits as zero.
3898  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
3899  DAG.getValueType(VT));
3900 }
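// Example (illustrative): llvm.r600.read.local.size.x below loads a full i32
// kernel argument and wraps it in (AssertZext i16), so a later zero-extension
// of the 16-bit value folds away rather than emitting an extra mask.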
3901 
3902 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
3903  EVT VT) {
3904  DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
3905  "non-hsa intrinsic with hsa target",
3906  DL.getDebugLoc());
3907  DAG.getContext()->diagnose(BadIntrin);
3908  return DAG.getUNDEF(VT);
3909 }
3910 
3911 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
3912  EVT VT) {
3913  DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
3914  "intrinsic not supported on subtarget",
3915  DL.getDebugLoc());
3916  DAG.getContext()->diagnose(BadIntrin);
3917  return DAG.getUNDEF(VT);
3918 }
3919 
3920 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
3921  SelectionDAG &DAG) const {
3922  MachineFunction &MF = DAG.getMachineFunction();
3923  auto MFI = MF.getInfo<SIMachineFunctionInfo>();
3924 
3925  EVT VT = Op.getValueType();
3926  SDLoc DL(Op);
3927  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3928 
3929  // TODO: Should this propagate fast-math-flags?
3930 
3931  switch (IntrinsicID) {
3932  case Intrinsic::amdgcn_implicit_buffer_ptr: {
3933  if (getSubtarget()->isAmdCodeObjectV2(MF))
3934  return emitNonHSAIntrinsicError(DAG, DL, VT);
3935  return getPreloadedValue(DAG, *MFI, VT,
3936  AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3937  }
3938  case Intrinsic::amdgcn_dispatch_ptr:
3939  case Intrinsic::amdgcn_queue_ptr: {
3940  if (!Subtarget->isAmdCodeObjectV2(MF)) {
3941  DiagnosticInfoUnsupported BadIntrin(
3942  *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
3943  DL.getDebugLoc());
3944  DAG.getContext()->diagnose(BadIntrin);
3945  return DAG.getUNDEF(VT);
3946  }
3947 
3948  auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
3949  AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
3950  return getPreloadedValue(DAG, *MFI, VT, RegID);
3951  }
3952  case Intrinsic::amdgcn_implicitarg_ptr: {
3953  if (MFI->isEntryFunction())
3954  return getImplicitArgPtr(DAG, DL);
3955  return getPreloadedValue(DAG, *MFI, VT,
3956  AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3957  }
3958  case Intrinsic::amdgcn_kernarg_segment_ptr: {
3959  return getPreloadedValue(DAG, *MFI, VT,
3960  AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3961  }
3962  case Intrinsic::amdgcn_dispatch_id: {
3963  return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
3964  }
3965  case Intrinsic::amdgcn_rcp:
3966  return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
3967  case Intrinsic::amdgcn_rsq:
3968  return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
3969  case Intrinsic::amdgcn_rsq_legacy:
3970  if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
3971  return emitRemovedIntrinsicError(DAG, DL, VT);
3972 
3973  return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
3974  case Intrinsic::amdgcn_rcp_legacy:
3975  if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
3976  return emitRemovedIntrinsicError(DAG, DL, VT);
3977  return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
3978  case Intrinsic::amdgcn_rsq_clamp: {
3979  if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
3980  return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
3981 
3982  Type *Type = VT.getTypeForEVT(*DAG.getContext());
3983  APFloat Max = APFloat::getLargest(Type->getFltSemantics());
3984  APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
3985 
3986  SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
3987  SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
3988  DAG.getConstantFP(Max, DL, VT));
3989  return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
3990  DAG.getConstantFP(Min, DL, VT));
3991  }
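// Illustrative note: on subtargets without v_rsq_clamp the sequence above
// emulates it as fminnum/fmaxnum against +/-max_float, clamping an infinite
// rsq result to the largest finite value of the type.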
3992  case Intrinsic::r600_read_ngroups_x:
3993  if (Subtarget->isAmdHsaOS())
3994  return emitNonHSAIntrinsicError(DAG, DL, VT);
3995 
3996  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
3997  SI::KernelInputOffsets::NGROUPS_X, false);
3998  case Intrinsic::r600_read_ngroups_y:
3999  if (Subtarget->isAmdHsaOS())
4000  return emitNonHSAIntrinsicError(DAG, DL, VT);
4001 
4002  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4003  SI::KernelInputOffsets::NGROUPS_Y, false);
4004  case Intrinsic::r600_read_ngroups_z:
4005  if (Subtarget->isAmdHsaOS())
4006  return emitNonHSAIntrinsicError(DAG, DL, VT);
4007 
4008  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4009  SI::KernelInputOffsets::NGROUPS_Z, false);
4010  case Intrinsic::r600_read_global_size_x:
4011  if (Subtarget->isAmdHsaOS())
4012  return emitNonHSAIntrinsicError(DAG, DL, VT);
4013 
4014  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4015  SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
4016  case Intrinsic::r600_read_global_size_y:
4017  if (Subtarget->isAmdHsaOS())
4018  return emitNonHSAIntrinsicError(DAG, DL, VT);
4019 
4020  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4021  SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
4022  case Intrinsic::r600_read_global_size_z:
4023  if (Subtarget->isAmdHsaOS())
4024  return emitNonHSAIntrinsicError(DAG, DL, VT);
4025 
4026  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4027  SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
4028  case Intrinsic::r600_read_local_size_x:
4029  if (Subtarget->isAmdHsaOS())
4030  return emitNonHSAIntrinsicError(DAG, DL, VT);
4031 
4032  return lowerImplicitZextParam(DAG, Op, MVT::i16,
4033  SI::KernelInputOffsets::LOCAL_SIZE_X);
4034  case Intrinsic::r600_read_local_size_y:
4035  if (Subtarget->isAmdHsaOS())
4036  return emitNonHSAIntrinsicError(DAG, DL, VT);
4037 
4038  return lowerImplicitZextParam(DAG, Op, MVT::i16,
4039  SI::KernelInputOffsets::LOCAL_SIZE_Y);
4040  case Intrinsic::r600_read_local_size_z:
4041  if (Subtarget->isAmdHsaOS())
4042  return emitNonHSAIntrinsicError(DAG, DL, VT);
4043 
4044  return lowerImplicitZextParam(DAG, Op, MVT::i16,
4045  SI::KernelInputOffsets::LOCAL_SIZE_Z);
4046  case Intrinsic::amdgcn_workgroup_id_x:
4047  case Intrinsic::r600_read_tgid_x:
4048  return getPreloadedValue(DAG, *MFI, VT,
4049  AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4050  case Intrinsic::amdgcn_workgroup_id_y:
4051  case Intrinsic::r600_read_tgid_y:
4052  return getPreloadedValue(DAG, *MFI, VT,
4053  AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4054  case Intrinsic::amdgcn_workgroup_id_z:
4055  case Intrinsic::r600_read_tgid_z:
4056  return getPreloadedValue(DAG, *MFI, VT,
4057  AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4058  case Intrinsic::amdgcn_workitem_id_x: {
4059  case Intrinsic::r600_read_tidig_x:
4060  return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
4061  SDLoc(DAG.getEntryNode()),
4062  MFI->getArgInfo().WorkItemIDX);
4063  }
4064  case Intrinsic::amdgcn_workitem_id_y:
4065  case Intrinsic::r600_read_tidig_y:
4066  return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
4067  SDLoc(DAG.getEntryNode()),
4068  MFI->getArgInfo().WorkItemIDY);
4069  case Intrinsic::amdgcn_workitem_id_z:
4070  case Intrinsic::r600_read_tidig_z:
4071  return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
4072  SDLoc(DAG.getEntryNode()),
4073  MFI->getArgInfo().WorkItemIDZ);
4074  case AMDGPUIntrinsic::SI_load_const: {
4075  SDValue Ops[] = {
4076  Op.getOperand(1),
4077  Op.getOperand(2)
4078  };
4079 
4080  MachineMemOperand *MMO = MF.getMachineMemOperand(
4081  MachinePointerInfo(),
4082  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4083  MachineMemOperand::MOInvariant,
4084  VT.getStoreSize(), 4);
4085  return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
4086  Op->getVTList(), Ops, VT, MMO);
4087  }
4088  case Intrinsic::amdgcn_fdiv_fast:
4089  return lowerFDIV_FAST(Op, DAG);
4090  case Intrinsic::amdgcn_interp_mov: {
4091  SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
4092  SDValue Glue = M0.getValue(1);
4093  return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
4094  Op.getOperand(2), Op.getOperand(3), Glue);
4095  }
4096  case Intrinsic::amdgcn_interp_p1: {
4097  SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
4098  SDValue Glue = M0.getValue(1);
4099  return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
4100  Op.getOperand(2), Op.getOperand(3), Glue);
4101  }
4102  case Intrinsic::amdgcn_interp_p2: {
4103  SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
4104  SDValue Glue = SDValue(M0.getNode(), 1);
4105  return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
4106  Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
4107  Glue);
4108  }
4109  case Intrinsic::amdgcn_sin:
4110  return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
4111 
4112  case Intrinsic::amdgcn_cos:
4113  return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
4114 
4115  case Intrinsic::amdgcn_log_clamp: {
4116  if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
4117  return SDValue();
4118 
4119  DiagnosticInfoUnsupported BadIntrin(
4120  *MF.getFunction(), "intrinsic not supported on subtarget",
4121  DL.getDebugLoc());
4122  DAG.getContext()->diagnose(BadIntrin);
4123  return DAG.getUNDEF(VT);
4124  }
4125  case Intrinsic::amdgcn_ldexp:
4126  return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
4127  Op.getOperand(1), Op.getOperand(2));
4128 
4129  case Intrinsic::amdgcn_fract:
4130  return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
4131 
4132  case Intrinsic::amdgcn_class:
4133  return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
4134  Op.getOperand(1), Op.getOperand(2));
4135  case Intrinsic::amdgcn_div_fmas:
4136  return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
4137  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
4138  Op.getOperand(4));
4139 
4140  case Intrinsic::amdgcn_div_fixup:
4141  return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
4142  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4143 
4144  case Intrinsic::amdgcn_trig_preop:
4145  return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
4146  Op.getOperand(1), Op.getOperand(2));
4147  case Intrinsic::amdgcn_div_scale: {
4148  // 3rd parameter required to be a constant.
4149  const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
4150  if (!Param)
4151  return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
4152 
4153  // Translate to the operands expected by the machine instruction. The
4154  // first parameter must be the same as the first instruction.
4155  SDValue Numerator = Op.getOperand(1);
4156  SDValue Denominator = Op.getOperand(2);
4157 
4158  // Note this order is opposite of the machine instruction's operations,
4159  // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
4160  // intrinsic has the numerator as the first operand to match a normal
4161  // division operation.
4162 
4163  SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
4164 
4165  return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
4166  Denominator, Numerator);
4167  }
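// Illustrative: llvm.amdgcn.div.scale(%num, %den, i1 true) selects the
// numerator as src0, producing DIV_SCALE %num, %den, %num; passing i1 false
// makes src0 the denominator instead.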
4168  case Intrinsic::amdgcn_icmp: {
4169  const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
4170  if (!CD)
4171  return DAG.getUNDEF(VT);
4172 
4173  int CondCode = CD->getSExtValue();
4174  if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
4175  CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
4176  return DAG.getUNDEF(VT);
4177 
4178  ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
4179  ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
4180  return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
4181  Op.getOperand(2), DAG.getCondCode(CCOpcode));
4182  }
4183  case Intrinsic::amdgcn_fcmp: {
4184  const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
4185  if (!CD)
4186  return DAG.getUNDEF(VT);
4187 
4188  int CondCode = CD->getSExtValue();
4189  if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
4190  CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE)
4191  return DAG.getUNDEF(VT);
4192 
4193  FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
4194  ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
4195  return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
4196  Op.getOperand(2), DAG.getCondCode(CCOpcode));
4197  }
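// Illustrative: llvm.amdgcn.icmp(i32 %a, i32 %b, i32 32) carries predicate 32
// (ICmpInst::ICMP_EQ, the first integer predicate) and becomes
// AMDGPUISD::SETCC %a, %b, seteq, yielding the wave-wide comparison mask;
// out-of-range predicate constants fold to undef as shown above.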
4198  case Intrinsic::amdgcn_fmed3:
4199  return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
4200  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4201  case Intrinsic::amdgcn_fmul_legacy:
4202  return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
4203  Op.getOperand(1), Op.getOperand(2));
4204  case Intrinsic::amdgcn_sffbh:
4205  return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
4206  case Intrinsic::amdgcn_sbfe:
4207  return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
4208  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4209  case Intrinsic::amdgcn_ubfe:
4210  return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
4211  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4212  case Intrinsic::amdgcn_cvt_pkrtz: {
4213  // FIXME: Stop adding cast if v2f16 legal.
4214  EVT VT = Op.getValueType();
4215  SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
4216  Op.getOperand(1), Op.getOperand(2));
4217  return DAG.getNode(ISD::BITCAST, DL, VT, Node);
4218  }
4219  case Intrinsic::amdgcn_wqm: {
4220  SDValue Src = Op.getOperand(1);
4221  return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
4222  0);
4223  }
4224  case Intrinsic::amdgcn_wwm: {
4225  SDValue Src = Op.getOperand(1);
4226  return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
4227  0);
4228  }
4229  default:
4230  return Op;
4231  }
4232 }
4233 
4234 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
4235  SelectionDAG &DAG) const {
4236  unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4237  SDLoc DL(Op);
4238  MachineFunction &MF = DAG.getMachineFunction();
4239 
4240  switch (IntrID) {
4241  case Intrinsic::amdgcn_atomic_inc:
4242  case Intrinsic::amdgcn_atomic_dec: {
4243  MemSDNode *M = cast<MemSDNode>(Op);
4244  unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
4245  AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
4246  SDValue Ops[] = {
4247  M->getOperand(0), // Chain
4248  M->getOperand(2), // Ptr
4249  M->getOperand(3) // Value
4250  };
4251 
4252  return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
4253  M->getMemoryVT(), M->getMemOperand());
4254  }
4255  case Intrinsic::amdgcn_buffer_load:
4256  case Intrinsic::amdgcn_buffer_load_format: {
4257  SDValue Ops[] = {
4258  Op.getOperand(0), // Chain
4259  Op.getOperand(2), // rsrc
4260  Op.getOperand(3), // vindex
4261  Op.getOperand(4), // offset
4262  Op.getOperand(5), // glc
4263  Op.getOperand(6) // slc
4264  };
4266 
4267  unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
4268  AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
4269  EVT VT = Op.getValueType();
4270  EVT IntVT = VT.changeTypeToInteger();
4271 
4275  VT.getStoreSize(), VT.getStoreSize());
4276 
4277  return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
4278  }
4279  case Intrinsic::amdgcn_tbuffer_load: {
4280  SDValue Ops[] = {
4281  Op.getOperand(0), // Chain
4282  Op.getOperand(2), // rsrc
4283  Op.getOperand(3), // vindex
4284  Op.getOperand(4), // voffset
4285  Op.getOperand(5), // soffset
4286  Op.getOperand(6), // offset
4287  Op.getOperand(7), // dfmt
4288  Op.getOperand(8), // nfmt
4289  Op.getOperand(9), // glc
4290  Op.getOperand(10) // slc
4291  };
4292 
4293  EVT VT = Op.getOperand(2).getValueType();
4294 
4298  VT.getStoreSize(), VT.getStoreSize());
4299  return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
4300  Op->getVTList(), Ops, VT, MMO);
4301  }
4302  case Intrinsic::amdgcn_buffer_atomic_swap:
4303  case Intrinsic::amdgcn_buffer_atomic_add:
4304  case Intrinsic::amdgcn_buffer_atomic_sub:
4305  case Intrinsic::amdgcn_buffer_atomic_smin:
4306  case Intrinsic::amdgcn_buffer_atomic_umin:
4307  case Intrinsic::amdgcn_buffer_atomic_smax:
4308  case Intrinsic::amdgcn_buffer_atomic_umax:
4309  case Intrinsic::amdgcn_buffer_atomic_and:
4310  case Intrinsic::amdgcn_buffer_atomic_or:
4311  case Intrinsic::amdgcn_buffer_atomic_xor: {
4312  SDValue Ops[] = {
4313  Op.getOperand(0), // Chain
4314  Op.getOperand(2), // vdata
4315  Op.getOperand(3), // rsrc
4316  Op.getOperand(4), // vindex
4317  Op.getOperand(5), // offset
4318  Op.getOperand(6) // slc
4319  };
4320  EVT VT = Op.getOperand(3).getValueType();
4327  VT.getStoreSize(), 4);
4328  unsigned Opcode = 0;
4329 
4330  switch (IntrID) {
4331  case Intrinsic::amdgcn_buffer_atomic_swap:
4332  Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
4333  break;
4334  case Intrinsic::amdgcn_buffer_atomic_add:
4335  Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
4336  break;
4337  case Intrinsic::amdgcn_buffer_atomic_sub:
4338  Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
4339  break;
4340  case Intrinsic::amdgcn_buffer_atomic_smin:
4341  Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
4342  break;
4343  case Intrinsic::amdgcn_buffer_atomic_umin:
4344  Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
4345  break;
4346  case Intrinsic::amdgcn_buffer_atomic_smax:
4347  Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
4348  break;
4349  case Intrinsic::amdgcn_buffer_atomic_umax:
4350  Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
4351  break;
4352  case Intrinsic::amdgcn_buffer_atomic_and:
4353  Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
4354  break;
4355  case Intrinsic::amdgcn_buffer_atomic_or:
4356  Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
4357  break;
4358  case Intrinsic::amdgcn_buffer_atomic_xor:
4359  Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
4360  break;
4361  default:
4362  llvm_unreachable("unhandled atomic opcode");
4363  }
4364 
4365  return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO);
4366  }
4367 
4368  case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
4369  SDValue Ops[] = {
4370  Op.getOperand(0), // Chain
4371  Op.getOperand(2), // src
4372  Op.getOperand(3), // cmp
4373  Op.getOperand(4), // rsrc
4374  Op.getOperand(5), // vindex
4375  Op.getOperand(6), // offset
4376  Op.getOperand(7) // slc
4377  };
4378  EVT VT = Op.getOperand(4).getValueType();
4385  VT.getStoreSize(), 4);
4386 
4387  return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
4388  Op->getVTList(), Ops, VT, MMO);
4389  }
4390 
4391  // Basic sample.
4392  case Intrinsic::amdgcn_image_sample:
4393  case Intrinsic::amdgcn_image_sample_cl:
4394  case Intrinsic::amdgcn_image_sample_d:
4395  case Intrinsic::amdgcn_image_sample_d_cl:
4396  case Intrinsic::amdgcn_image_sample_l:
4397  case Intrinsic::amdgcn_image_sample_b:
4398  case Intrinsic::amdgcn_image_sample_b_cl:
4399  case Intrinsic::amdgcn_image_sample_lz:
4400  case Intrinsic::amdgcn_image_sample_cd:
4401  case Intrinsic::amdgcn_image_sample_cd_cl:
4402 
4403  // Sample with comparison.
4404  case Intrinsic::amdgcn_image_sample_c:
4405  case Intrinsic::amdgcn_image_sample_c_cl:
4406  case Intrinsic::amdgcn_image_sample_c_d:
4407  case Intrinsic::amdgcn_image_sample_c_d_cl:
4408  case Intrinsic::amdgcn_image_sample_c_l:
4409  case Intrinsic::amdgcn_image_sample_c_b:
4410  case Intrinsic::amdgcn_image_sample_c_b_cl:
4411  case Intrinsic::amdgcn_image_sample_c_lz:
4412  case Intrinsic::amdgcn_image_sample_c_cd:
4413  case Intrinsic::amdgcn_image_sample_c_cd_cl:
4414 
4415  // Sample with offsets.
4416  case Intrinsic::amdgcn_image_sample_o:
4417  case Intrinsic::amdgcn_image_sample_cl_o:
4418  case Intrinsic::amdgcn_image_sample_d_o:
4419  case Intrinsic::amdgcn_image_sample_d_cl_o:
4420  case Intrinsic::amdgcn_image_sample_l_o:
4421  case Intrinsic::amdgcn_image_sample_b_o:
4422  case Intrinsic::amdgcn_image_sample_b_cl_o:
4423  case Intrinsic::amdgcn_image_sample_lz_o:
4424  case Intrinsic::amdgcn_image_sample_cd_o:
4425  case Intrinsic::amdgcn_image_sample_cd_cl_o:
4426 
4427  // Sample with comparison and offsets.
4428  case Intrinsic::amdgcn_image_sample_c_o:
4429  case Intrinsic::amdgcn_image_sample_c_cl_o:
4430  case Intrinsic::amdgcn_image_sample_c_d_o:
4431  case Intrinsic::amdgcn_image_sample_c_d_cl_o:
4432  case Intrinsic::amdgcn_image_sample_c_l_o:
4433  case Intrinsic::amdgcn_image_sample_c_b_o:
4434  case Intrinsic::amdgcn_image_sample_c_b_cl_o:
4435  case Intrinsic::amdgcn_image_sample_c_lz_o:
4436  case Intrinsic::amdgcn_image_sample_c_cd_o:
4437  case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
4438 
4439  case Intrinsic::amdgcn_image_getlod: {
4440  // Replace dmask with everything disabled with undef.
4441  const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
4442  if (!DMask || DMask->isNullValue()) {
4443  SDValue Undef = DAG.getUNDEF(Op.getValueType());
4444  return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
4445  }
4446 
4447  return SDValue();
4448  }
4449  default:
4450  return SDValue();
4451  }
4452 }
4453 
4454 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
4455  SelectionDAG &DAG) const {
4456  SDLoc DL(Op);
4457  SDValue Chain = Op.getOperand(0);
4458  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4459  MachineFunction &MF = DAG.getMachineFunction();
4460 
4461  switch (IntrinsicID) {
4462  case Intrinsic::amdgcn_exp: {
4463  const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
4464  const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
4465  const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
4466  const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
4467 
4468  const SDValue Ops[] = {
4469  Chain,
4470  DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
4471  DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
4472  Op.getOperand(4), // src0
4473  Op.getOperand(5), // src1
4474  Op.getOperand(6), // src2
4475  Op.getOperand(7), // src3
4476  DAG.getTargetConstant(0, DL, MVT::i1), // compr
4477  DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
4478  };
4479 
4480  unsigned Opc = Done->isNullValue() ?
4481  AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
4482  return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
4483  }
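// Illustrative: llvm.amdgcn.exp with done=false builds an EXPORT node and
// done=true an EXPORT_DONE node, with compr=0 and all four sources passed
// through; amdgcn.exp.compr below sets compr=1, bitcasts its two packed
// sources to f32, and leaves src2/src3 undef.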
4484  case Intrinsic::amdgcn_exp_compr: {
4485  const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
4486  const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
4487  SDValue Src0 = Op.getOperand(4);
4488  SDValue Src1 = Op.getOperand(5);
4489  const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
4490  const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
4491 
4492  SDValue Undef = DAG.getUNDEF(MVT::f32);
4493  const SDValue Ops[] = {
4494  Chain,
4495  DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
4496  DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
4497  DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
4498  DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
4499  Undef, // src2
4500  Undef, // src3
4501  DAG.getTargetConstant(1, DL, MVT::i1), // compr
4502  DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
4503  };
4504 
4505  unsigned Opc = Done->isNullValue() ?
4506  AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
4507  return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
4508  }
4509  case Intrinsic::amdgcn_s_sendmsg:
4510  case Intrinsic::amdgcn_s_sendmsghalt: {
4511  unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
4512  AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
4513  Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
4514  SDValue Glue = Chain.getValue(1);
4515  return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
4516  Op.getOperand(2), Glue);
4517  }
4518  case Intrinsic::amdgcn_init_exec: {
4519  return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
4520  Op.getOperand(2));
4521  }
4522  case Intrinsic::amdgcn_init_exec_from_input: {
4523  return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
4524  Op.getOperand(2), Op.getOperand(3));
4525  }
4526  case AMDGPUIntrinsic::AMDGPU_kill: {
4527  SDValue Src = Op.getOperand(2);
4528  if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
4529  if (!K->isNegative())
4530  return Chain;
4531 
4532  SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
4533  return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
4534  }
4535 
4536  SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
4537  return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
4538  }
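// Illustrative: AMDGPU_kill with a constant, non-negative operand is dropped
// and just returns the chain; a constant negative operand becomes an
// unconditional KILL using the -1.0 literal, and a non-constant operand is
// bitcast to i32 and tested by the KILL node itself.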
4539  case Intrinsic::amdgcn_s_barrier: {
4540  if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
4541  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
4542  unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second;
4543  if (WGSize <= ST.getWavefrontSize())
4544  return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
4545  Op.getOperand(0)), 0);
4546  }
4547  return SDValue();
4548  };
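// Illustrative: at optimization levels above -O0, when the maximum flat
// workgroup size fits in a single wavefront the s_barrier is redundant and is
// replaced by the WAVE_BARRIER pseudo, which emits no instruction but still
// acts as a barrier to code motion.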
4549  case AMDGPUIntrinsic::SI_tbuffer_store: {
4550 
4551  // Extract vindex and voffset from vaddr as appropriate
4552  const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
4553  const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
4554  SDValue VAddr = Op.getOperand(5);
4555 
4556  SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
4557 
4558  assert(!(OffEn->isOne() && IdxEn->isOne()) &&
4559  "Legacy intrinsic doesn't support both offset and index - use new version");
4560 
4561  SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
4562  SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
4563 
4564  // Deal with the vec-3 case
4565  const