LLVM 7.0.0svn
SIISelLowering.cpp
1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// Custom DAG lowering for SI
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #ifdef _MSC_VER
16 // Provide M_PI.
17 #define _USE_MATH_DEFINES
18 #endif
19 
20 #include "SIISelLowering.h"
21 #include "AMDGPU.h"
22 #include "AMDGPUIntrinsicInfo.h"
23 #include "AMDGPUSubtarget.h"
24 #include "AMDGPUTargetMachine.h"
25 #include "SIDefines.h"
26 #include "SIInstrInfo.h"
27 #include "SIMachineFunctionInfo.h"
28 #include "SIRegisterInfo.h"
30 #include "Utils/AMDGPUBaseInfo.h"
31 #include "llvm/ADT/APFloat.h"
32 #include "llvm/ADT/APInt.h"
33 #include "llvm/ADT/ArrayRef.h"
34 #include "llvm/ADT/BitVector.h"
35 #include "llvm/ADT/SmallVector.h"
36 #include "llvm/ADT/Statistic.h"
37 #include "llvm/ADT/StringRef.h"
38 #include "llvm/ADT/StringSwitch.h"
39 #include "llvm/ADT/Twine.h"
40 #include "llvm/CodeGen/Analysis.h"
58 #include "llvm/IR/Constants.h"
59 #include "llvm/IR/DataLayout.h"
60 #include "llvm/IR/DebugLoc.h"
61 #include "llvm/IR/DerivedTypes.h"
62 #include "llvm/IR/DiagnosticInfo.h"
63 #include "llvm/IR/Function.h"
64 #include "llvm/IR/GlobalValue.h"
65 #include "llvm/IR/InstrTypes.h"
66 #include "llvm/IR/Instruction.h"
67 #include "llvm/IR/Instructions.h"
68 #include "llvm/IR/IntrinsicInst.h"
69 #include "llvm/IR/Type.h"
70 #include "llvm/Support/Casting.h"
71 #include "llvm/Support/CodeGen.h"
73 #include "llvm/Support/Compiler.h"
75 #include "llvm/Support/KnownBits.h"
79 #include <cassert>
80 #include <cmath>
81 #include <cstdint>
82 #include <iterator>
83 #include <tuple>
84 #include <utility>
85 #include <vector>
86 
87 using namespace llvm;
88 
89 #define DEBUG_TYPE "si-lower"
90 
91 STATISTIC(NumTailCalls, "Number of tail calls");
92 
94  "amdgpu-vgpr-index-mode",
95  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
96  cl::init(false));
97 
99  "amdgpu-frame-index-zero-bits",
100  cl::desc("High bits of frame index assumed to be zero"),
101  cl::init(5),
103 
104 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
105  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
106  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
107  if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
108  return AMDGPU::SGPR0 + Reg;
109  }
110  }
111  llvm_unreachable("Cannot allocate sgpr");
112 }
113 
115  const SISubtarget &STI)
116  : AMDGPUTargetLowering(TM, STI) {
117  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
118  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
119 
120  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
121  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
122 
123  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
124  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
125  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
126 
127  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
128  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
129 
130  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
131  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
132 
133  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
134  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
135 
136  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
137  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
138 
139  if (Subtarget->has16BitInsts()) {
140  addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
141  addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
142 
143  // Unless there are also VOP3P operations, no operations are really legal.
144  addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
145  addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
146  addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
147  addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
148  }
149 
151 
152  // We need to custom lower vector stores from local memory
158 
164 
175 
178 
183 
189 
194 
197 
205 
211 
215 
220 
227 
230 
233 
234 #if 0
237 #endif
238 
239  // We only support LOAD/STORE and vector manipulation ops for vectors
240  // with > 4 elements.
243  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
244  switch (Op) {
245  case ISD::LOAD:
246  case ISD::STORE:
247  case ISD::BUILD_VECTOR:
248  case ISD::BITCAST:
254  break;
255  case ISD::CONCAT_VECTORS:
257  break;
258  default:
260  break;
261  }
262  }
263  }
264 
266 
267  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
268  // is expanded to avoid having two separate loops in case the index is a VGPR.
269 
270  // Most operations are naturally 32-bit vector operations. We only support
271  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
272  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
275 
278 
281 
284  }
285 
290 
293 
294  // Avoid stack access for these.
295  // TODO: Generalize to more vector types.
300 
306 
310 
315 
316  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
317  // and output demarshalling
320 
321  // We can't return success/failure, only the old value,
322  // so let LLVM add the comparison.
325 
326  if (getSubtarget()->hasFlatAddressSpace()) {
329  }
330 
333 
334  // This is s_memtime on SI and s_memrealtime on VI.
338 
341 
346  }
347 
349 
354 
355  if (Subtarget->has16BitInsts()) {
357 
360 
363 
366 
369 
374 
377 
383 
385 
387 
389 
391 
396 
401 
402  // F16 - Constant Actions.
404 
405  // F16 - Load/Store Actions.
410 
411  // F16 - VOP1 Actions.
420 
421  // F16 - VOP2 Actions.
427 
428  // F16 - VOP3 Actions.
430  if (!Subtarget->hasFP16Denormals())
432 
433  for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
434  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
435  switch (Op) {
436  case ISD::LOAD:
437  case ISD::STORE:
438  case ISD::BUILD_VECTOR:
439  case ISD::BITCAST:
445  break;
446  case ISD::CONCAT_VECTORS:
448  break;
449  default:
451  break;
452  }
453  }
454  }
455 
456  // XXX - Do these do anything? Vector constants turn into build_vector.
459 
462 
467 
472 
479 
484 
489 
494 
498 
499  if (!Subtarget->hasVOP3PInsts()) {
502  }
503 
505  // This isn't really legal, but this avoids the legalizer unrolling it (and
506  // allows matching fneg (fabs x) patterns)
508  }
509 
510  if (Subtarget->hasVOP3PInsts()) {
521 
528 
531 
538 
543 
548 
551  }
552 
555 
556  if (Subtarget->has16BitInsts()) {
561  } else {
562  // Legalization hack.
565 
568  }
569 
572  }
573 
597 
598  // All memory operations. Some folding on the pointer operand is done to help
599 // match the constant offsets in the addressing modes.
617 
619 }
620 
622  return static_cast<const SISubtarget *>(Subtarget);
623 }
624 
625 //===----------------------------------------------------------------------===//
626 // TargetLowering queries
627 //===----------------------------------------------------------------------===//
628 
629 // v_mad_mix* support a conversion from f16 to f32.
630 //
631 // The only special case we do not currently handle is when denormals are
632 // enabled, where this would also be OK to use.
634  EVT DestVT, EVT SrcVT) const {
635  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
636  (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
637  DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
638  SrcVT.getScalarType() == MVT::f16;
639 }
640 
642  // SI has some legal vector types, but no legal vector operations. Say no
643  // shuffles are legal in order to prefer scalarizing some vector operations.
644  return false;
645 }
646 
648  const CallInst &CI,
649  MachineFunction &MF,
650  unsigned IntrID) const {
651  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
652  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
654  (Intrinsic::ID)IntrID);
655  if (Attr.hasFnAttribute(Attribute::ReadNone))
656  return false;
657 
659 
660  if (RsrcIntr->IsImage) {
661  Info.ptrVal = MFI->getImagePSV(
663  CI.getArgOperand(RsrcIntr->RsrcArg));
664  Info.align = 0;
665  } else {
666  Info.ptrVal = MFI->getBufferPSV(
668  CI.getArgOperand(RsrcIntr->RsrcArg));
669  }
670 
672  if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
674  Info.memVT = MVT::getVT(CI.getType());
676  } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
677  Info.opc = ISD::INTRINSIC_VOID;
678  Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
680  } else {
681  // Atomic
683  Info.memVT = MVT::getVT(CI.getType());
687 
688  // XXX - Should this be volatile without known ordering?
690  }
691  return true;
692  }
693 
694  switch (IntrID) {
695  case Intrinsic::amdgcn_atomic_inc:
696  case Intrinsic::amdgcn_atomic_dec:
697  case Intrinsic::amdgcn_ds_fadd:
698  case Intrinsic::amdgcn_ds_fmin:
699  case Intrinsic::amdgcn_ds_fmax: {
701  Info.memVT = MVT::getVT(CI.getType());
702  Info.ptrVal = CI.getOperand(0);
703  Info.align = 0;
705 
706  const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
707  if (!Vol || !Vol->isZero())
709 
710  return true;
711  }
712 
713  default:
714  return false;
715  }
716 }
717 
720  Type *&AccessTy) const {
721  switch (II->getIntrinsicID()) {
722  case Intrinsic::amdgcn_atomic_inc:
723  case Intrinsic::amdgcn_atomic_dec:
724  case Intrinsic::amdgcn_ds_fadd:
725  case Intrinsic::amdgcn_ds_fmin:
726  case Intrinsic::amdgcn_ds_fmax: {
727  Value *Ptr = II->getArgOperand(0);
728  AccessTy = II->getType();
729  Ops.push_back(Ptr);
730  return true;
731  }
732  default:
733  return false;
734  }
735 }
736 
737 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
738  if (!Subtarget->hasFlatInstOffsets()) {
739  // Flat instructions do not have offsets, and only have the register
740  // address.
741  return AM.BaseOffs == 0 && AM.Scale == 0;
742  }
743 
744  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
745  // the sign bit is ignored and is treated as a 12-bit unsigned offset.
746 
747  // Just r + i
748  return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
749 }
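
// A minimal standalone sketch of the flat-offset rule implemented above. The
// helper name and the plain bool/int64_t interface are illustrative only and
// not part of this file; it just restates "no immediate offset without
// flat-instruction offsets, otherwise r + a 12-bit unsigned offset".
static bool isLegalFlatOffsetSketch(int64_t BaseOffs, int64_t Scale,
                                    bool HasFlatInstOffsets) {
  if (!HasFlatInstOffsets)
    return BaseOffs == 0 && Scale == 0;   // register-only addressing
  // Regular flat instructions treat the GFX9 offset as 12-bit unsigned.
  return BaseOffs >= 0 && BaseOffs < (1 << 12) && Scale == 0;
}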
750 
751 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
753  return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
754 
756  // Assume that we will use FLAT for all global memory accesses
757  // on VI.
758  // FIXME: This assumption is currently wrong. On VI we still use
759  // MUBUF instructions for the r + i addressing mode. As currently
760  // implemented, the MUBUF instructions only work on buffer < 4GB.
761  // It may be possible to support > 4GB buffers with MUBUF instructions,
762  // by setting the stride value in the resource descriptor which would
763  // increase the size limit to (stride * 4GB). However, this is risky,
764  // because it has never been validated.
765  return isLegalFlatAddressingMode(AM);
766  }
767 
768  return isLegalMUBUFAddressingMode(AM);
769 }
770 
771 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
772  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
773  // additionally can do r + r + i with addr64. 32-bit has more addressing
774  // mode options. Depending on the resource constant, it can also do
775  // (i64 r0) + (i32 r1) * (i14 i).
776  //
777  // Private arrays end up using a scratch buffer most of the time, so also
778  // assume those use MUBUF instructions. Scratch loads / stores are currently
779  // implemented as MUBUF instructions with the offen bit set, so they are
780  // slightly different from the normal addr64.
781  if (!isUInt<12>(AM.BaseOffs))
782  return false;
783 
784  // FIXME: Since we can split immediate into soffset and immediate offset,
785  // would it make sense to allow any immediate?
786 
787  switch (AM.Scale) {
788  case 0: // r + i or just i, depending on HasBaseReg.
789  return true;
790  case 1:
791  return true; // We have r + r or r + i.
792  case 2:
793  if (AM.HasBaseReg) {
794  // Reject 2 * r + r.
795  return false;
796  }
797 
798  // Allow 2 * r as r + r,
799  // and 2 * r + i as r + r + i.
800  return true;
801  default: // Don't allow n * r
802  return false;
803  }
804 }
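
// Illustrative restatement of the scale handling above as a hypothetical
// helper (not used by the lowering itself): only scale 0, scale 1, and
// scale 2 without a base register are accepted, on top of the 12-bit
// unsigned byte-offset check.
static bool isLegalMUBUFScaleSketch(int64_t Scale, bool HasBaseReg) {
  return Scale == 0 || Scale == 1 || (Scale == 2 && !HasBaseReg);
}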
805 
807  const AddrMode &AM, Type *Ty,
808  unsigned AS, Instruction *I) const {
809  // No global is ever allowed as a base.
810  if (AM.BaseGV)
811  return false;
812 
813  if (AS == AMDGPUASI.GLOBAL_ADDRESS)
814  return isLegalGlobalAddressingMode(AM);
815 
816  if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
818  // If the offset isn't a multiple of 4, it probably isn't going to be
819  // correctly aligned.
820  // FIXME: Can we get the real alignment here?
821  if (AM.BaseOffs % 4 != 0)
822  return isLegalMUBUFAddressingMode(AM);
823 
824  // There are no SMRD extloads, so if we have to do a small type access we
825  // will use a MUBUF load.
826  // FIXME?: We also need to do this if unaligned, but we don't know the
827  // alignment here.
828  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
829  return isLegalGlobalAddressingMode(AM);
830 
832  // SMRD instructions have an 8-bit, dword offset on SI.
833  if (!isUInt<8>(AM.BaseOffs / 4))
834  return false;
836  // On CI+, this can also be a 32-bit literal constant offset. If it fits
837  // in 8-bits, it can use a smaller encoding.
838  if (!isUInt<32>(AM.BaseOffs / 4))
839  return false;
841  // On VI, these use the SMEM format and the offset is 20-bit in bytes.
842  if (!isUInt<20>(AM.BaseOffs))
843  return false;
844  } else
845  llvm_unreachable("unhandled generation");
846 
847  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
848  return true;
849 
850  if (AM.Scale == 1 && AM.HasBaseReg)
851  return true;
852 
853  return false;
854 
855  } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
856  return isLegalMUBUFAddressingMode(AM);
857  } else if (AS == AMDGPUASI.LOCAL_ADDRESS ||
858  AS == AMDGPUASI.REGION_ADDRESS) {
859  // Basic, single offset DS instructions allow a 16-bit unsigned immediate
860  // field.
861  // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
862  // an 8-bit dword offset but we don't know the alignment here.
863  if (!isUInt<16>(AM.BaseOffs))
864  return false;
865 
866  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
867  return true;
868 
869  if (AM.Scale == 1 && AM.HasBaseReg)
870  return true;
871 
872  return false;
873  } else if (AS == AMDGPUASI.FLAT_ADDRESS ||
875  // For an unknown address space, this usually means that this is for some
876  // reason being used for pure arithmetic, and not based on some addressing
877  // computation. We don't have instructions that compute pointers with any
878  // addressing modes, so treat them as having no offset like flat
879  // instructions.
880  return isLegalFlatAddressingMode(AM);
881  } else {
882  llvm_unreachable("unhandled address space");
883  }
884 }
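
// Sketch of the per-generation SMRD offset limits spelled out above. The enum
// and helper are hypothetical stand-ins for the real subtarget checks, and the
// sketch assumes the byte offset is already a non-negative multiple of four
// (the earlier checks fall back to MUBUF otherwise).
enum class SMRDGenSketch { SI, CI, VI };

static bool isLegalSMRDOffsetSketch(SMRDGenSketch Gen, int64_t ByteOffset) {
  int64_t DWordOffset = ByteOffset / 4;
  switch (Gen) {
  case SMRDGenSketch::SI:
    return DWordOffset < (1 << 8);            // 8-bit dword offset
  case SMRDGenSketch::CI:
    return DWordOffset < (int64_t(1) << 32);  // 32-bit literal dword offset
  case SMRDGenSketch::VI:
    return ByteOffset < (1 << 20);            // 20-bit byte offset (SMEM)
  }
  return false;
}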
885 
887  const SelectionDAG &DAG) const {
888  if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
889  return (MemVT.getSizeInBits() <= 4 * 32);
890  } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
891  unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
892  return (MemVT.getSizeInBits() <= MaxPrivateBits);
893  } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
894  return (MemVT.getSizeInBits() <= 2 * 32);
895  }
896  return true;
897 }
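
// The per-address-space size caps above, restated over plain bit widths. The
// enum and helper are illustrative only; MaxPrivateElementSizeBytes stands in
// for getMaxPrivateElementSize().
enum class AddrSpaceSketch { Global, Flat, Private, Local, Other };

static bool fitsSizeCapSketch(AddrSpaceSketch AS, unsigned SizeInBits,
                              unsigned MaxPrivateElementSizeBytes) {
  switch (AS) {
  case AddrSpaceSketch::Global:
  case AddrSpaceSketch::Flat:
    return SizeInBits <= 4 * 32;                         // up to a dwordx4
  case AddrSpaceSketch::Private:
    return SizeInBits <= 8 * MaxPrivateElementSizeBytes;
  case AddrSpaceSketch::Local:
    return SizeInBits <= 2 * 32;                         // up to a dwordx2
  default:
    return true;
  }
}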
898 
900  unsigned AddrSpace,
901  unsigned Align,
902  bool *IsFast) const {
903  if (IsFast)
904  *IsFast = false;
905 
906  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
907  // which isn't a simple VT.
908  // Until MVT is extended to handle this, simply check for the size and
909  // rely on the condition below: allow accesses if the size is a multiple of 4.
910  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
911  VT.getStoreSize() > 16)) {
912  return false;
913  }
914 
915  if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
916  AddrSpace == AMDGPUASI.REGION_ADDRESS) {
917  // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
918  // aligned, 8 byte access in a single operation using ds_read2/write2_b32
919  // with adjacent offsets.
920  bool AlignedBy4 = (Align % 4 == 0);
921  if (IsFast)
922  *IsFast = AlignedBy4;
923 
924  return AlignedBy4;
925  }
926 
927  // FIXME: We have to be conservative here and assume that flat operations
928  // will access scratch. If we had access to the IR function, then we
929  // could determine if any private memory was used in the function.
931  (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
932  AddrSpace == AMDGPUASI.FLAT_ADDRESS)) {
933  return false;
934  }
935 
937  // If we have a uniform constant load, it still requires using a slow
938  // buffer instruction if unaligned.
939  if (IsFast) {
940  *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS ||
941  AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ?
942  (Align % 4 == 0) : true;
943  }
944 
945  return true;
946  }
947 
948  // Values smaller than a dword must be aligned.
949  if (VT.bitsLT(MVT::i32))
950  return false;
951 
952  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
953  // byte-address are ignored, thus forcing Dword alignment.
954  // This applies to private, global, and constant memory.
955  if (IsFast)
956  *IsFast = true;
957 
958  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
959 }
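
// A compact sketch of the alignment rules above for the simple cases, using
// plain integers (bits and bytes) instead of MVTs; the helper is illustrative
// and ignores the address-space special cases handled earlier in the hook.
static bool isFastUnalignedAccessSketch(unsigned SizeInBits,
                                        unsigned AlignInBytes) {
  if (SizeInBits < 32)
    return false;        // smaller-than-dword values must be naturally aligned
  // For dword or larger accesses the two LSBs of the byte address are
  // ignored, so dword alignment is what matters.
  return SizeInBits > 32 && AlignInBytes % 4 == 0;
}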
960 
961 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
962  unsigned SrcAlign, bool IsMemset,
963  bool ZeroMemset,
964  bool MemcpyStrSrc,
965  MachineFunction &MF) const {
966  // FIXME: Should account for address space here.
967 
968  // The default fallback uses the private pointer size as a guess for a type to
969  // use. Make sure we switch these to 64-bit accesses.
970 
971  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
972  return MVT::v4i32;
973 
974  if (Size >= 8 && DstAlign >= 4)
975  return MVT::v2i32;
976 
977  // Use the default.
978  return MVT::Other;
979 }
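
// The size/alignment buckets above, with a plain enum standing in for the
// MVTs; the names here are illustrative only.
enum class MemOpWidthSketch { DWordX4, DWordX2, Default };

static MemOpWidthSketch pickMemOpWidthSketch(uint64_t Size, unsigned DstAlign) {
  if (Size >= 16 && DstAlign >= 4)
    return MemOpWidthSketch::DWordX4;   // lower as v4i32 accesses
  if (Size >= 8 && DstAlign >= 4)
    return MemOpWidthSketch::DWordX2;   // lower as v2i32 accesses
  return MemOpWidthSketch::Default;     // fall back to the generic choice
}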
980 
981 static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
982  return AS == AMDGPUASI.GLOBAL_ADDRESS ||
983  AS == AMDGPUASI.FLAT_ADDRESS ||
984  AS == AMDGPUASI.CONSTANT_ADDRESS ||
985  AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
986 }
987 
989  unsigned DestAS) const {
990  return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) &&
992 }
993 
995  const MemSDNode *MemNode = cast<MemSDNode>(N);
996  const Value *Ptr = MemNode->getMemOperand()->getValue();
997  const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
998  return I && I->getMetadata("amdgpu.noclobber");
999 }
1000 
1002  unsigned DestAS) const {
1003  // Flat -> private/local is a simple truncate.
1004  // Flat -> global is a no-op.
1005  if (SrcAS == AMDGPUASI.FLAT_ADDRESS)
1006  return true;
1007 
1008  return isNoopAddrSpaceCast(SrcAS, DestAS);
1009 }
1010 
1012  const MemSDNode *MemNode = cast<MemSDNode>(N);
1013 
1014  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1015 }
1016 
1019  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
1020  return TypeSplitVector;
1021 
1023 }
1024 
1026  Type *Ty) const {
1027  // FIXME: Could be smarter if called for vector constants.
1028  return true;
1029 }
1030 
1032  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1033  switch (Op) {
1034  case ISD::LOAD:
1035  case ISD::STORE:
1036 
1037  // These operations are done with 32-bit instructions anyway.
1038  case ISD::AND:
1039  case ISD::OR:
1040  case ISD::XOR:
1041  case ISD::SELECT:
1042  // TODO: Extensions?
1043  return true;
1044  default:
1045  return false;
1046  }
1047  }
1048 
1049  // SimplifySetCC uses this function to determine whether or not it should
1050  // create setcc with i1 operands. We don't have instructions for i1 setcc.
1051  if (VT == MVT::i1 && Op == ISD::SETCC)
1052  return false;
1053 
1054  return TargetLowering::isTypeDesirableForOp(Op, VT);
1055 }
1056 
1057 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1058  const SDLoc &SL,
1059  SDValue Chain,
1060  uint64_t Offset) const {
1061  const DataLayout &DL = DAG.getDataLayout();
1062  MachineFunction &MF = DAG.getMachineFunction();
1064 
1065  const ArgDescriptor *InputPtrReg;
1066  const TargetRegisterClass *RC;
1067 
1068  std::tie(InputPtrReg, RC)
1070 
1073  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1074  MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1075 
1076  return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
1077 }
1078 
1079 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1080  const SDLoc &SL) const {
1081  auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
1082  uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
1083  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1084 }
1085 
1086 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1087  const SDLoc &SL, SDValue Val,
1088  bool Signed,
1089  const ISD::InputArg *Arg) const {
1090  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1091  VT.bitsLT(MemVT)) {
1092  unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1093  Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1094  }
1095 
1096  if (MemVT.isFloatingPoint())
1097  Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
1098  else if (Signed)
1099  Val = DAG.getSExtOrTrunc(Val, SL, VT);
1100  else
1101  Val = DAG.getZExtOrTrunc(Val, SL, VT);
1102 
1103  return Val;
1104 }
1105 
1106 SDValue SITargetLowering::lowerKernargMemParameter(
1107  SelectionDAG &DAG, EVT VT, EVT MemVT,
1108  const SDLoc &SL, SDValue Chain,
1109  uint64_t Offset, unsigned Align, bool Signed,
1110  const ISD::InputArg *Arg) const {
1111  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
1113  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1114 
1115 
1116  // Try to avoid using an extload by loading earlier than the argument address,
1117  // and extracting the relevant bits. The load should hopefully be merged with
1118  // the previous argument.
1119  if (Align < 4) {
1120  //if (MemVT.getStoreSize() < 4) {
1121  assert(MemVT.getStoreSize() < 4);
1122  int64_t AlignDownOffset = alignDown(Offset, 4);
1123  int64_t OffsetDiff = Offset - AlignDownOffset;
1124 
1125  EVT IntVT = MemVT.changeTypeToInteger();
1126 
1127  // TODO: If we passed in the base kernel offset we could have a better
1128  // alignment than 4, but we don't really need it.
1129  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1130  SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1133 
1134  SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1135  SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1136 
1137  SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1138  ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1139  ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1140 
1141 
1142  return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1143  }
1144 
1145  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1146  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
1149 
1150  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1151  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1152 }
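
// The under-aligned path above loads the containing dword and shifts the
// argument bits into place. A scalar sketch of that extraction, assuming a
// little-endian dword and an argument narrower than four bytes (the helper
// and its names are illustrative only):
static uint32_t extractSubDwordKernArgSketch(uint32_t Dword, uint64_t ArgOffset,
                                             unsigned ArgSizeInBytes) {
  uint64_t AlignDownOffset = ArgOffset & ~uint64_t(3); // alignDown(Offset, 4)
  unsigned ShiftAmt = unsigned(ArgOffset - AlignDownOffset) * 8;
  uint32_t Mask = (1u << (ArgSizeInBytes * 8)) - 1;    // 0xff or 0xffff
  return (Dword >> ShiftAmt) & Mask;
}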
1153 
1154 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1155  const SDLoc &SL, SDValue Chain,
1156  const ISD::InputArg &Arg) const {
1157  MachineFunction &MF = DAG.getMachineFunction();
1158  MachineFrameInfo &MFI = MF.getFrameInfo();
1159 
1160  if (Arg.Flags.isByVal()) {
1161  unsigned Size = Arg.Flags.getByValSize();
1162  int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1163  return DAG.getFrameIndex(FrameIdx, MVT::i32);
1164  }
1165 
1166  unsigned ArgOffset = VA.getLocMemOffset();
1167  unsigned ArgSize = VA.getValVT().getStoreSize();
1168 
1169  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1170 
1171  // Create load nodes to retrieve arguments from the stack.
1172  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1173  SDValue ArgValue;
1174 
1175  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
1177  MVT MemVT = VA.getValVT();
1178 
1179  switch (VA.getLocInfo()) {
1180  default:
1181  break;
1182  case CCValAssign::BCvt:
1183  MemVT = VA.getLocVT();
1184  break;
1185  case CCValAssign::SExt:
1186  ExtType = ISD::SEXTLOAD;
1187  break;
1188  case CCValAssign::ZExt:
1189  ExtType = ISD::ZEXTLOAD;
1190  break;
1191  case CCValAssign::AExt:
1192  ExtType = ISD::EXTLOAD;
1193  break;
1194  }
1195 
1196  ArgValue = DAG.getExtLoad(
1197  ExtType, SL, VA.getLocVT(), Chain, FIN,
1199  MemVT);
1200  return ArgValue;
1201 }
1202 
1203 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1204  const SIMachineFunctionInfo &MFI,
1205  EVT VT,
1207  const ArgDescriptor *Reg;
1208  const TargetRegisterClass *RC;
1209 
1210  std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1211  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1212 }
1213 
1215  CallingConv::ID CallConv,
1217  BitVector &Skipped,
1218  FunctionType *FType,
1219  SIMachineFunctionInfo *Info) {
1220  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1221  const ISD::InputArg &Arg = Ins[I];
1222 
1223  // First check if it's a PS input addr.
1224  if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
1225  !Arg.Flags.isByVal() && PSInputNum <= 15) {
1226 
1227  if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
1228  // We can safely skip PS inputs.
1229  Skipped.set(I);
1230  ++PSInputNum;
1231  continue;
1232  }
1233 
1234  Info->markPSInputAllocated(PSInputNum);
1235  if (Arg.Used)
1236  Info->markPSInputEnabled(PSInputNum);
1237 
1238  ++PSInputNum;
1239  }
1240 
1241  // Second, split vertices into their elements.
1242  if (Arg.VT.isVector()) {
1243  ISD::InputArg NewArg = Arg;
1244  NewArg.Flags.setSplit();
1245  NewArg.VT = Arg.VT.getVectorElementType();
1246 
1247  // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
1248  // three or five element vertex only needs three or five registers,
1249  // NOT four or eight.
1250  Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
1251  unsigned NumElements = ParamType->getVectorNumElements();
1252 
1253  for (unsigned J = 0; J != NumElements; ++J) {
1254  Splits.push_back(NewArg);
1255  NewArg.PartOffset += NewArg.VT.getStoreSize();
1256  }
1257  } else {
1258  Splits.push_back(Arg);
1259  }
1260  }
1261 }
1262 
1263 // Allocate special inputs passed in VGPRs.
1265  MachineFunction &MF,
1266  const SIRegisterInfo &TRI,
1267  SIMachineFunctionInfo &Info) {
1268  if (Info.hasWorkItemIDX()) {
1269  unsigned Reg = AMDGPU::VGPR0;
1270  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1271 
1272  CCInfo.AllocateReg(Reg);
1274  }
1275 
1276  if (Info.hasWorkItemIDY()) {
1277  unsigned Reg = AMDGPU::VGPR1;
1278  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1279 
1280  CCInfo.AllocateReg(Reg);
1282  }
1283 
1284  if (Info.hasWorkItemIDZ()) {
1285  unsigned Reg = AMDGPU::VGPR2;
1286  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1287 
1288  CCInfo.AllocateReg(Reg);
1290  }
1291 }
1292 
1293 // Try to allocate a VGPR at the end of the argument list, or if no argument
1294 // VGPRs are left, allocate a stack slot.
1296  ArrayRef<MCPhysReg> ArgVGPRs
1297  = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1298  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1299  if (RegIdx == ArgVGPRs.size()) {
1300  // Spill to stack required.
1301  int64_t Offset = CCInfo.AllocateStack(4, 4);
1302 
1303  return ArgDescriptor::createStack(Offset);
1304  }
1305 
1306  unsigned Reg = ArgVGPRs[RegIdx];
1307  Reg = CCInfo.AllocateReg(Reg);
1308  assert(Reg != AMDGPU::NoRegister);
1309 
1310  MachineFunction &MF = CCInfo.getMachineFunction();
1311  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1312  return ArgDescriptor::createRegister(Reg);
1313 }
1314 
1316  const TargetRegisterClass *RC,
1317  unsigned NumArgRegs) {
1318  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1319  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1320  if (RegIdx == ArgSGPRs.size())
1321  report_fatal_error("ran out of SGPRs for arguments");
1322 
1323  unsigned Reg = ArgSGPRs[RegIdx];
1324  Reg = CCInfo.AllocateReg(Reg);
1325  assert(Reg != AMDGPU::NoRegister);
1326 
1327  MachineFunction &MF = CCInfo.getMachineFunction();
1328  MF.addLiveIn(Reg, RC);
1329  return ArgDescriptor::createRegister(Reg);
1330 }
1331 
1333  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1334 }
1335 
1337  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1338 }
1339 
1341  MachineFunction &MF,
1342  const SIRegisterInfo &TRI,
1343  SIMachineFunctionInfo &Info) {
1344  if (Info.hasWorkItemIDX())
1345  Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
1346 
1347  if (Info.hasWorkItemIDY())
1348  Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
1349 
1350  if (Info.hasWorkItemIDZ())
1351  Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1352 }
1353 
1355  MachineFunction &MF,
1356  const SIRegisterInfo &TRI,
1357  SIMachineFunctionInfo &Info) {
1358  auto &ArgInfo = Info.getArgInfo();
1359 
1360  // TODO: Unify handling with private memory pointers.
1361 
1362  if (Info.hasDispatchPtr())
1363  ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1364 
1365  if (Info.hasQueuePtr())
1366  ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1367 
1368  if (Info.hasKernargSegmentPtr())
1369  ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1370 
1371  if (Info.hasDispatchID())
1372  ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1373 
1374  // flat_scratch_init is not applicable for non-kernel functions.
1375 
1376  if (Info.hasWorkGroupIDX())
1377  ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1378 
1379  if (Info.hasWorkGroupIDY())
1380  ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1381 
1382  if (Info.hasWorkGroupIDZ())
1383  ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
1384 
1385  if (Info.hasImplicitArgPtr())
1386  ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
1387 }
1388 
1389 // Allocate special inputs passed in user SGPRs.
1390 static void allocateHSAUserSGPRs(CCState &CCInfo,
1391  MachineFunction &MF,
1392  const SIRegisterInfo &TRI,
1393  SIMachineFunctionInfo &Info) {
1394  if (Info.hasImplicitBufferPtr()) {
1395  unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1396  MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1397  CCInfo.AllocateReg(ImplicitBufferPtrReg);
1398  }
1399 
1400  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1401  if (Info.hasPrivateSegmentBuffer()) {
1402  unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1403  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1404  CCInfo.AllocateReg(PrivateSegmentBufferReg);
1405  }
1406 
1407  if (Info.hasDispatchPtr()) {
1408  unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1409  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1410  CCInfo.AllocateReg(DispatchPtrReg);
1411  }
1412 
1413  if (Info.hasQueuePtr()) {
1414  unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1415  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1416  CCInfo.AllocateReg(QueuePtrReg);
1417  }
1418 
1419  if (Info.hasKernargSegmentPtr()) {
1420  unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1421  MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1422  CCInfo.AllocateReg(InputPtrReg);
1423  }
1424 
1425  if (Info.hasDispatchID()) {
1426  unsigned DispatchIDReg = Info.addDispatchID(TRI);
1427  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1428  CCInfo.AllocateReg(DispatchIDReg);
1429  }
1430 
1431  if (Info.hasFlatScratchInit()) {
1432  unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1433  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1434  CCInfo.AllocateReg(FlatScratchInitReg);
1435  }
1436 
1437  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1438  // these from the dispatch pointer.
1439 }
1440 
1441 // Allocate special input registers that are initialized per-wave.
1442 static void allocateSystemSGPRs(CCState &CCInfo,
1443  MachineFunction &MF,
1444  SIMachineFunctionInfo &Info,
1445  CallingConv::ID CallConv,
1446  bool IsShader) {
1447  if (Info.hasWorkGroupIDX()) {
1448  unsigned Reg = Info.addWorkGroupIDX();
1449  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1450  CCInfo.AllocateReg(Reg);
1451  }
1452 
1453  if (Info.hasWorkGroupIDY()) {
1454  unsigned Reg = Info.addWorkGroupIDY();
1455  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1456  CCInfo.AllocateReg(Reg);
1457  }
1458 
1459  if (Info.hasWorkGroupIDZ()) {
1460  unsigned Reg = Info.addWorkGroupIDZ();
1461  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1462  CCInfo.AllocateReg(Reg);
1463  }
1464 
1465  if (Info.hasWorkGroupInfo()) {
1466  unsigned Reg = Info.addWorkGroupInfo();
1467  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1468  CCInfo.AllocateReg(Reg);
1469  }
1470 
1471  if (Info.hasPrivateSegmentWaveByteOffset()) {
1472  // Scratch wave offset passed in system SGPR.
1473  unsigned PrivateSegmentWaveByteOffsetReg;
1474 
1475  if (IsShader) {
1476  PrivateSegmentWaveByteOffsetReg =
1478 
1479  // This is true if the scratch wave byte offset doesn't have a fixed
1480  // location.
1481  if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1482  PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1483  Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1484  }
1485  } else
1486  PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1487 
1488  MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1489  CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1490  }
1491 }
1492 
1494  MachineFunction &MF,
1495  const SIRegisterInfo &TRI,
1496  SIMachineFunctionInfo &Info) {
1497  // Now that we've figured out where the scratch register inputs are, see if
1498  // we should reserve the arguments and use them directly.
1499  MachineFrameInfo &MFI = MF.getFrameInfo();
1500  bool HasStackObjects = MFI.hasStackObjects();
1501 
1502  // Record that we know we have non-spill stack objects so we don't need to
1503  // check all stack objects later.
1504  if (HasStackObjects)
1505  Info.setHasNonSpillStackObjects(true);
1506 
1507  // Everything live out of a block is spilled with fast regalloc, so it's
1508  // almost certain that spilling will be required.
1509  if (TM.getOptLevel() == CodeGenOpt::None)
1510  HasStackObjects = true;
1511 
1512  // For now assume stack access is needed in any callee functions, so we need
1513  // the scratch registers to pass in.
1514  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1515 
1516  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
1517  if (ST.isAmdCodeObjectV2(MF.getFunction())) {
1518  if (RequiresStackAccess) {
1519  // If we have stack objects, we unquestionably need the private buffer
1520  // resource. For the Code Object V2 ABI, this will be the first 4 user
1521  // SGPR inputs. We can reserve those and use them directly.
1522 
1523  unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1525  Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1526 
1527  if (MFI.hasCalls()) {
1528  // If we have calls, we need to keep the frame register in a register
1529  // that won't be clobbered by a call, so ensure it is copied somewhere.
1530 
1531  // This is not a problem for the scratch wave offset, because the same
1532  // registers are reserved in all functions.
1533 
1534  // FIXME: Nothing is really ensuring this is a call preserved register,
1535  // it's just selected from the end so it happens to be.
1536  unsigned ReservedOffsetReg
1538  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1539  } else {
1540  unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1542  Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1543  }
1544  } else {
1545  unsigned ReservedBufferReg
1547  unsigned ReservedOffsetReg
1549 
1550  // We tentatively reserve the last registers (skipping the last two
1551  // which may contain VCC). After register allocation, we'll replace
1552  // these with the ones immediately after those which were really
1553  // allocated. In the prologue copies will be inserted from the argument
1554  // to these reserved registers.
1555  Info.setScratchRSrcReg(ReservedBufferReg);
1556  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1557  }
1558  } else {
1559  unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1560 
1561  // Without HSA, relocations are used for the scratch pointer and the
1562  // buffer resource setup is always inserted in the prologue. Scratch wave
1563  // offset is still in an input SGPR.
1564  Info.setScratchRSrcReg(ReservedBufferReg);
1565 
1566  if (HasStackObjects && !MFI.hasCalls()) {
1567  unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1569  Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1570  } else {
1571  unsigned ReservedOffsetReg
1573  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1574  }
1575  }
1576 }
1577 
1580  return !Info->isEntryFunction();
1581 }
1582 
1584 
1585 }
1586 
1588  MachineBasicBlock *Entry,
1589  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1591 
1592  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1593  if (!IStart)
1594  return;
1595 
1597  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1598  MachineBasicBlock::iterator MBBI = Entry->begin();
1599  for (const MCPhysReg *I = IStart; *I; ++I) {
1600  const TargetRegisterClass *RC = nullptr;
1601  if (AMDGPU::SReg_64RegClass.contains(*I))
1602  RC = &AMDGPU::SGPR_64RegClass;
1603  else if (AMDGPU::SReg_32RegClass.contains(*I))
1604  RC = &AMDGPU::SGPR_32RegClass;
1605  else
1606  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1607 
1608  unsigned NewVR = MRI->createVirtualRegister(RC);
1609  // Create copy from CSR to a virtual register.
1610  Entry->addLiveIn(*I);
1611  BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1612  .addReg(*I);
1613 
1614  // Insert the copy-back instructions right before the terminator.
1615  for (auto *Exit : Exits)
1616  BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1617  TII->get(TargetOpcode::COPY), *I)
1618  .addReg(NewVR);
1619  }
1620 }
1621 
1623  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1624  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1625  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1627 
1628  MachineFunction &MF = DAG.getMachineFunction();
1629  const Function &Fn = MF.getFunction();
1630  FunctionType *FType = MF.getFunction().getFunctionType();
1632  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
1633 
1634  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
1635  DiagnosticInfoUnsupported NoGraphicsHSA(
1636  Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
1637  DAG.getContext()->diagnose(NoGraphicsHSA);
1638  return DAG.getEntryNode();
1639  }
1640 
1641  // Create stack objects that are used for emitting debugger prologue if
1642  // the "amdgpu-debugger-emit-prologue" attribute was specified.
1643  if (ST.debuggerEmitPrologue())
1644  createDebuggerPrologueStackObjects(MF);
1645 
1648  BitVector Skipped(Ins.size());
1649  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1650  *DAG.getContext());
1651 
1652  bool IsShader = AMDGPU::isShader(CallConv);
1653  bool IsKernel = AMDGPU::isKernel(CallConv);
1654  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
1655 
1656  if (!IsEntryFunc) {
1657  // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1658  // this when allocating argument fixed offsets.
1659  CCInfo.AllocateStack(4, 4);
1660  }
1661 
1662  if (IsShader) {
1663  processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1664 
1665  // At least one interpolation mode must be enabled or else the GPU will
1666  // hang.
1667  //
1668  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1669  // set PSInputAddr, the user wants to enable some bits after the compilation
1670  // based on run-time states. Since we can't know what the final PSInputEna
1671  // will look like, we shouldn't do anything here, and the user should take
1672  // responsibility for the correct programming.
1673  //
1674  // Otherwise, the following restrictions apply:
1675  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1676  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1677  // enabled too.
1678  if (CallConv == CallingConv::AMDGPU_PS) {
1679  if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1680  ((Info->getPSInputAddr() & 0xF) == 0 &&
1681  Info->isPSInputAllocated(11))) {
1682  CCInfo.AllocateReg(AMDGPU::VGPR0);
1683  CCInfo.AllocateReg(AMDGPU::VGPR1);
1684  Info->markPSInputAllocated(0);
1685  Info->markPSInputEnabled(0);
1686  }
1687  if (Subtarget->isAmdPalOS()) {
1688  // For isAmdPalOS, the user does not enable some bits after compilation
1689  // based on run-time states; the register values being generated here are
1690  // the final ones set in hardware. Therefore we need to apply the
1691  // workaround to PSInputAddr and PSInputEnable together. (The case where
1692  // a bit is set in PSInputAddr but not PSInputEnable is where the
1693  // frontend set up an input arg for a particular interpolation mode, but
1694  // nothing uses that input arg. Really we should have an earlier pass
1695  // that removes such an arg.)
1696  unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1697  if ((PsInputBits & 0x7F) == 0 ||
1698  ((PsInputBits & 0xF) == 0 &&
1699  (PsInputBits >> 11 & 1)))
1700  Info->markPSInputEnabled(
1702  }
1703  }
1704 
1705  assert(!Info->hasDispatchPtr() &&
1706  !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1707  !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1708  !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1709  !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1710  !Info->hasWorkItemIDZ());
1711  } else if (IsKernel) {
1712  assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
1713  } else {
1714  Splits.append(Ins.begin(), Ins.end());
1715  }
1716 
1717  if (IsEntryFunc) {
1718  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
1719  allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
1720  }
1721 
1722  if (IsKernel) {
1723  analyzeFormalArgumentsCompute(CCInfo, Ins);
1724  } else {
1725  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1726  CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1727  }
1728 
1729  SmallVector<SDValue, 16> Chains;
1730 
1731  // FIXME: This is the minimum kernel argument alignment. We should improve
1732  // this to the maximum alignment of the arguments.
1733  //
1734  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
1735  // kern arg offset.
1736  const unsigned KernelArgBaseAlign = 16;
1737  const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset(Fn);
1738 
1739  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
1740  const ISD::InputArg &Arg = Ins[i];
1741  if (Skipped[i]) {
1742  InVals.push_back(DAG.getUNDEF(Arg.VT));
1743  continue;
1744  }
1745 
1746  CCValAssign &VA = ArgLocs[ArgIdx++];
1747  MVT VT = VA.getLocVT();
1748 
1749  if (IsEntryFunc && VA.isMemLoc()) {
1750  VT = Ins[i].VT;
1751  EVT MemVT = VA.getLocVT();
1752 
1753  const uint64_t Offset = ExplicitOffset + VA.getLocMemOffset();
1754  Info->setABIArgOffset(Offset + MemVT.getStoreSize());
1755  unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
1756 
1757  // The first 36 bytes of the input buffer contain information about the
1758  // thread group and global sizes for Clover.
1759  SDValue Arg = lowerKernargMemParameter(
1760  DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
1761  Chains.push_back(Arg.getValue(1));
1762 
1763  auto *ParamTy =
1764  dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
1766  ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1767  // On SI local pointers are just offsets into LDS, so they are always
1768  // less than 16-bits. On CI and newer they could potentially be
1769  // real pointers, so we can't guarantee their size.
1770  Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
1771  DAG.getValueType(MVT::i16));
1772  }
1773 
1774  InVals.push_back(Arg);
1775  continue;
1776  } else if (!IsEntryFunc && VA.isMemLoc()) {
1777  SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
1778  InVals.push_back(Val);
1779  if (!Arg.Flags.isByVal())
1780  Chains.push_back(Val.getValue(1));
1781  continue;
1782  }
1783 
1784  assert(VA.isRegLoc() && "Parameter must be in a register!");
1785 
1786  unsigned Reg = VA.getLocReg();
1787  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
1788  EVT ValVT = VA.getValVT();
1789 
1790  Reg = MF.addLiveIn(Reg, RC);
1791  SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1792 
1793  if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
1794  // The return object should be reasonably addressable.
1795 
1796  // FIXME: This helps when the return is a real sret. If it is an
1797  // automatically inserted sret (i.e. CanLowerReturn returns false), an
1798  // extra copy is inserted in SelectionDAGBuilder which obscures this.
1799  unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
1800  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1801  DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
1802  }
1803 
1804  // If this is an 8 or 16-bit value, it is really passed promoted
1805  // to 32 bits. Insert an assert[sz]ext to capture this, then
1806  // truncate to the right size.
1807  switch (VA.getLocInfo()) {
1808  case CCValAssign::Full:
1809  break;
1810  case CCValAssign::BCvt:
1811  Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
1812  break;
1813  case CCValAssign::SExt:
1814  Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
1815  DAG.getValueType(ValVT));
1816  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1817  break;
1818  case CCValAssign::ZExt:
1819  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1820  DAG.getValueType(ValVT));
1821  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1822  break;
1823  case CCValAssign::AExt:
1824  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1825  break;
1826  default:
1827  llvm_unreachable("Unknown loc info!");
1828  }
1829 
1830  if (IsShader && Arg.VT.isVector()) {
1831  // Build a vector from the registers
1832  Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
1833  unsigned NumElements = ParamType->getVectorNumElements();
1834 
1836  Regs.push_back(Val);
1837  for (unsigned j = 1; j != NumElements; ++j) {
1838  Reg = ArgLocs[ArgIdx++].getLocReg();
1839  Reg = MF.addLiveIn(Reg, RC);
1840 
1841  SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1842  Regs.push_back(Copy);
1843  }
1844 
1845  // Fill up the missing vector elements
1846  NumElements = Arg.VT.getVectorNumElements() - NumElements;
1847  Regs.append(NumElements, DAG.getUNDEF(VT));
1848 
1849  InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
1850  continue;
1851  }
1852 
1853  InVals.push_back(Val);
1854  }
1855 
1856  if (!IsEntryFunc) {
1857  // Special inputs come after user arguments.
1858  allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
1859  }
1860 
1861  // Start adding system SGPRs.
1862  if (IsEntryFunc) {
1863  allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
1864  } else {
1865  CCInfo.AllocateReg(Info->getScratchRSrcReg());
1866  CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
1867  CCInfo.AllocateReg(Info->getFrameOffsetReg());
1868  allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
1869  }
1870 
1871  auto &ArgUsageInfo =
1873  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
1874 
1875  unsigned StackArgSize = CCInfo.getNextStackOffset();
1876  Info->setBytesInStackArgArea(StackArgSize);
1877 
1878  return Chains.empty() ? Chain :
1879  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
1880 }
1881 
1882 // TODO: If return values can't fit in registers, we should return as many as
1883 // possible in registers before passing on stack.
1885  CallingConv::ID CallConv,
1886  MachineFunction &MF, bool IsVarArg,
1887  const SmallVectorImpl<ISD::OutputArg> &Outs,
1888  LLVMContext &Context) const {
1889  // Replacing returns with sret/stack usage doesn't make sense for shaders.
1890  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
1891  // for shaders. Vector types should be explicitly handled by CC.
1892  if (AMDGPU::isEntryFunctionCC(CallConv))
1893  return true;
1894 
1896  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
1897  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
1898 }
1899 
1900 SDValue
1902  bool isVarArg,
1903  const SmallVectorImpl<ISD::OutputArg> &Outs,
1904  const SmallVectorImpl<SDValue> &OutVals,
1905  const SDLoc &DL, SelectionDAG &DAG) const {
1906  MachineFunction &MF = DAG.getMachineFunction();
1908 
1909  if (AMDGPU::isKernel(CallConv)) {
1910  return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
1911  OutVals, DL, DAG);
1912  }
1913 
1914  bool IsShader = AMDGPU::isShader(CallConv);
1915 
1916  Info->setIfReturnsVoid(Outs.size() == 0);
1917  bool IsWaveEnd = Info->returnsVoid() && IsShader;
1918 
1920  SmallVector<SDValue, 48> SplitVals;
1921 
1922  // Split vectors into their elements.
1923  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
1924  const ISD::OutputArg &Out = Outs[i];
1925 
1926  if (IsShader && Out.VT.isVector()) {
1927  MVT VT = Out.VT.getVectorElementType();
1928  ISD::OutputArg NewOut = Out;
1929  NewOut.Flags.setSplit();
1930  NewOut.VT = VT;
1931 
1932  // We want the original number of vector elements here, e.g.
1933  // three or five, not four or eight.
1934  unsigned NumElements = Out.ArgVT.getVectorNumElements();
1935 
1936  for (unsigned j = 0; j != NumElements; ++j) {
1937  SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
1938  DAG.getConstant(j, DL, MVT::i32));
1939  SplitVals.push_back(Elem);
1940  Splits.push_back(NewOut);
1941  NewOut.PartOffset += NewOut.VT.getStoreSize();
1942  }
1943  } else {
1944  SplitVals.push_back(OutVals[i]);
1945  Splits.push_back(Out);
1946  }
1947  }
1948 
1949  // CCValAssign - represents the assignment of the return value to a location.
1951 
1952  // CCState - Info about the registers and stack slots.
1953  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1954  *DAG.getContext());
1955 
1956  // Analyze outgoing return values.
1957  CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
1958 
1959  SDValue Flag;
1960  SmallVector<SDValue, 48> RetOps;
1961  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1962 
1963  // Add return address for callable functions.
1964  if (!Info->isEntryFunction()) {
1966  SDValue ReturnAddrReg = CreateLiveInRegister(
1967  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
1968 
1969  // FIXME: Should be able to use a vreg here, but need a way to prevent it
1970  // from being allocated to a CSR.
1971 
1972  SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
1973  MVT::i64);
1974 
1975  Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
1976  Flag = Chain.getValue(1);
1977 
1978  RetOps.push_back(PhysReturnAddrReg);
1979  }
1980 
1981  // Copy the result values into the output registers.
1982  for (unsigned i = 0, realRVLocIdx = 0;
1983  i != RVLocs.size();
1984  ++i, ++realRVLocIdx) {
1985  CCValAssign &VA = RVLocs[i];
1986  assert(VA.isRegLoc() && "Can only return in registers!");
1987  // TODO: Partially return in registers if return values don't fit.
1988 
1989  SDValue Arg = SplitVals[realRVLocIdx];
1990 
1991  // Copied from other backends.
1992  switch (VA.getLocInfo()) {
1993  case CCValAssign::Full:
1994  break;
1995  case CCValAssign::BCvt:
1996  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
1997  break;
1998  case CCValAssign::SExt:
1999  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2000  break;
2001  case CCValAssign::ZExt:
2002  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2003  break;
2004  case CCValAssign::AExt:
2005  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2006  break;
2007  default:
2008  llvm_unreachable("Unknown loc info!");
2009  }
2010 
2011  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2012  Flag = Chain.getValue(1);
2013  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2014  }
2015 
2016  // FIXME: Does sret work properly?
2017  if (!Info->isEntryFunction()) {
2018  const SIRegisterInfo *TRI
2019  = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
2020  const MCPhysReg *I =
2022  if (I) {
2023  for (; *I; ++I) {
2024  if (AMDGPU::SReg_64RegClass.contains(*I))
2025  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2026  else if (AMDGPU::SReg_32RegClass.contains(*I))
2027  RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2028  else
2029  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2030  }
2031  }
2032  }
2033 
2034  // Update chain and glue.
2035  RetOps[0] = Chain;
2036  if (Flag.getNode())
2037  RetOps.push_back(Flag);
2038 
2039  unsigned Opc = AMDGPUISD::ENDPGM;
2040  if (!IsWaveEnd)
2042  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2043 }
2044 
2046  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2047  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2048  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2049  SDValue ThisVal) const {
2050  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2051 
2052  // Assign locations to each value returned by this call.
2054  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2055  *DAG.getContext());
2056  CCInfo.AnalyzeCallResult(Ins, RetCC);
2057 
2058  // Copy all of the result registers out of their specified physreg.
2059  for (unsigned i = 0; i != RVLocs.size(); ++i) {
2060  CCValAssign VA = RVLocs[i];
2061  SDValue Val;
2062 
2063  if (VA.isRegLoc()) {
2064  Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2065  Chain = Val.getValue(1);
2066  InFlag = Val.getValue(2);
2067  } else if (VA.isMemLoc()) {
2068  report_fatal_error("TODO: return values in memory");
2069  } else
2070  llvm_unreachable("unknown argument location type");
2071 
2072  switch (VA.getLocInfo()) {
2073  case CCValAssign::Full:
2074  break;
2075  case CCValAssign::BCvt:
2076  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2077  break;
2078  case CCValAssign::ZExt:
2079  Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2080  DAG.getValueType(VA.getValVT()));
2081  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2082  break;
2083  case CCValAssign::SExt:
2084  Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2085  DAG.getValueType(VA.getValVT()));
2086  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2087  break;
2088  case CCValAssign::AExt:
2089  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2090  break;
2091  default:
2092  llvm_unreachable("Unknown loc info!");
2093  }
2094 
2095  InVals.push_back(Val);
2096  }
2097 
2098  return Chain;
2099 }
2100 
2101 // Add code to pass the special inputs required by the features in use, separate
2102 // from the explicit user arguments present in the IR.
2104  CallLoweringInfo &CLI,
2105  const SIMachineFunctionInfo &Info,
2106  SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2107  SmallVectorImpl<SDValue> &MemOpChains,
2108  SDValue Chain,
2109  SDValue StackPtr) const {
2110  // If we don't have a call site, this was a call inserted by
2111  // legalization. These can never use special inputs.
2112  if (!CLI.CS)
2113  return;
2114 
2115  const Function *CalleeFunc = CLI.CS.getCalledFunction();
2116  assert(CalleeFunc);
2117 
2118  SelectionDAG &DAG = CLI.DAG;
2119  const SDLoc &DL = CLI.DL;
2120 
2121  const SISubtarget *ST = getSubtarget();
2122  const SIRegisterInfo *TRI = ST->getRegisterInfo();
2123 
2124  auto &ArgUsageInfo =
2126  const AMDGPUFunctionArgInfo &CalleeArgInfo
2127  = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2128 
2129  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2130 
2131  // TODO: Unify with private memory register handling. This is complicated by
2132  // the fact that at least in kernels, the input argument is not necessarily
2133  // in the same location as the input.
2146  };
2147 
2148  for (auto InputID : InputRegs) {
2149  const ArgDescriptor *OutgoingArg;
2150  const TargetRegisterClass *ArgRC;
2151 
2152  std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2153  if (!OutgoingArg)
2154  continue;
2155 
2156  const ArgDescriptor *IncomingArg;
2157  const TargetRegisterClass *IncomingArgRC;
2158  std::tie(IncomingArg, IncomingArgRC)
2159  = CallerArgInfo.getPreloadedValue(InputID);
2160  assert(IncomingArgRC == ArgRC);
2161 
2162  // All special arguments are ints for now.
2163  EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2164  SDValue InputReg;
2165 
2166  if (IncomingArg) {
2167  InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2168  } else {
2169  // The implicit arg ptr is special because it doesn't have a corresponding
2170  // input for kernels, and is computed from the kernarg segment pointer.
2171  assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2172  InputReg = getImplicitArgPtr(DAG, DL);
2173  }
2174 
2175  if (OutgoingArg->isRegister()) {
2176  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2177  } else {
2178  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
2179  InputReg,
2180  OutgoingArg->getStackOffset());
2181  MemOpChains.push_back(ArgStore);
2182  }
2183  }
2184 }
2185 
2187  return CC == CallingConv::Fast;
2188 }
2189 
2190 /// Return true if we might ever do TCO for calls with this calling convention.
2192  switch (CC) {
2193  case CallingConv::C:
2194  return true;
2195  default:
2196  return canGuaranteeTCO(CC);
2197  }
2198 }
2199 
2201  SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2202  const SmallVectorImpl<ISD::OutputArg> &Outs,
2203  const SmallVectorImpl<SDValue> &OutVals,
2204  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2205  if (!mayTailCallThisCC(CalleeCC))
2206  return false;
2207 
2208  MachineFunction &MF = DAG.getMachineFunction();
2209  const Function &CallerF = MF.getFunction();
2210  CallingConv::ID CallerCC = CallerF.getCallingConv();
2212  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2213 
2214  // Kernels aren't callable and don't have a live-in return address, so it
2215  // doesn't make sense to do a tail call with entry functions.
2216  if (!CallerPreserved)
2217  return false;
2218 
2219  bool CCMatch = CallerCC == CalleeCC;
2220 
2222  if (canGuaranteeTCO(CalleeCC) && CCMatch)
2223  return true;
2224  return false;
2225  }
2226 
2227  // TODO: Can we handle var args?
2228  if (IsVarArg)
2229  return false;
2230 
2231  for (const Argument &Arg : CallerF.args()) {
2232  if (Arg.hasByValAttr())
2233  return false;
2234  }
2235 
2236  LLVMContext &Ctx = *DAG.getContext();
2237 
2238  // Check that the call results are passed in the same way.
2239  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2240  CCAssignFnForCall(CalleeCC, IsVarArg),
2241  CCAssignFnForCall(CallerCC, IsVarArg)))
2242  return false;
2243 
2244  // The callee has to preserve all registers the caller needs to preserve.
2245  if (!CCMatch) {
2246  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2247  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2248  return false;
2249  }
2250 
2251  // Nothing more to check if the callee is taking no arguments.
2252  if (Outs.empty())
2253  return true;
2254 
2255  SmallVector<CCValAssign, 16> ArgLocs;
2256  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2257 
2258  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2259 
2260  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2261  // If the stack arguments for this call do not fit into our own save area then
2262  // the call cannot be a tail call.
2263  // TODO: Is this really necessary?
2264  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2265  return false;
2266 
2267  const MachineRegisterInfo &MRI = MF.getRegInfo();
2268  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2269 }
2270 
2272  if (!CI->isTailCall())
2273  return false;
2274 
2275  const Function *ParentFn = CI->getParent()->getParent();
2276  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2277  return false;
2278 
2279  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2280  return (Attr.getValueAsString() != "true");
2281 }
2282 
2283 // The wave scratch offset register is used as the global base pointer.
2285  SmallVectorImpl<SDValue> &InVals) const {
2286  SelectionDAG &DAG = CLI.DAG;
2287  const SDLoc &DL = CLI.DL;
2289  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2291  SDValue Chain = CLI.Chain;
2292  SDValue Callee = CLI.Callee;
2293  bool &IsTailCall = CLI.IsTailCall;
2294  CallingConv::ID CallConv = CLI.CallConv;
2295  bool IsVarArg = CLI.IsVarArg;
2296  bool IsSibCall = false;
2297  bool IsThisReturn = false;
2298  MachineFunction &MF = DAG.getMachineFunction();
2299 
2300  if (IsVarArg) {
2301  return lowerUnhandledCall(CLI, InVals,
2302  "unsupported call to variadic function ");
2303  }
2304 
2305  if (!CLI.CS.getCalledFunction()) {
2306  return lowerUnhandledCall(CLI, InVals,
2307  "unsupported indirect call to function ");
2308  }
2309 
2310  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2311  return lowerUnhandledCall(CLI, InVals,
2312  "unsupported required tail call to function ");
2313  }
2314 
2315  // The first 4 bytes are reserved for the callee's emergency stack slot.
2316  const unsigned CalleeUsableStackOffset = 4;
2317 
2318  if (IsTailCall) {
2319  IsTailCall = isEligibleForTailCallOptimization(
2320  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2321  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2322  report_fatal_error("failed to perform tail call elimination on a call "
2323  "site marked musttail");
2324  }
2325 
2326  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2327 
2328  // A sibling call is one where we're under the usual C ABI and not planning
2329  // to change that, but can still do a tail call.
2330  if (!TailCallOpt && IsTailCall)
2331  IsSibCall = true;
2332 
2333  if (IsTailCall)
2334  ++NumTailCalls;
2335  }
2336 
2337  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
2338  // FIXME: Remove this hack for function pointer types after removing
2339  // support of old address space mapping. In the new address space
2340  // mapping the pointer in the default address space is 64 bit, so this
2341  // hack is not needed.
2342  if (Callee.getValueType() == MVT::i32) {
2343  const GlobalValue *GV = GA->getGlobal();
2344  Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false,
2345  GA->getTargetFlags());
2346  }
2347  }
2348  assert(Callee.getValueType() == MVT::i64);
2349 
2351 
2352  // Analyze operands of the call, assigning locations to each operand.
2353  SmallVector<CCValAssign, 16> ArgLocs;
2354  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2355  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
2356  CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2357 
2358  // Get a count of how many bytes are to be pushed on the stack.
2359  unsigned NumBytes = CCInfo.getNextStackOffset();
2360 
2361  if (IsSibCall) {
2362  // Since we're not changing the ABI to make this a tail call, the memory
2363  // operands are already available in the caller's incoming argument space.
2364  NumBytes = 0;
2365  }
2366 
2367  // FPDiff is the byte offset of the call's argument area from the callee's.
2368  // Stores to callee stack arguments will be placed in FixedStackSlots offset
2369  // by this amount for a tail call. In a sibling call it must be 0 because the
2370  // caller will deallocate the entire stack and the callee still expects its
2371  // arguments to begin at SP+0. Completely unused for non-tail calls.
2372  int32_t FPDiff = 0;
2373  MachineFrameInfo &MFI = MF.getFrameInfo();
2375 
2376  SDValue CallerSavedFP;
2377 
2378  // Adjust the stack pointer for the new arguments...
2379  // These operations are automatically eliminated by the prolog/epilog pass
2380  if (!IsSibCall) {
2381  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
2382 
2383  unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2384 
2385  // In the HSA case, this should be an identity copy.
2386  SDValue ScratchRSrcReg
2387  = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2388  RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2389 
2390  // TODO: Don't hardcode these registers; get them from the callee function.
2391  SDValue ScratchWaveOffsetReg
2392  = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2393  RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
2394 
2395  if (!Info->isEntryFunction()) {
2396  // Avoid clobbering this function's FP value. In the current convention the
2397  // callee will overwrite it, so save/restore around the call site.
2398  CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2399  Info->getFrameOffsetReg(), MVT::i32);
2400  }
2401  }
2402 
2403  // Stack pointer relative accesses are done by changing the offset SGPR. This
2404  // is just the VGPR offset component.
2405  SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
2406 
2407  SmallVector<SDValue, 8> MemOpChains;
2408  MVT PtrVT = MVT::i32;
2409 
2410  // Walk the register/memloc assignments, inserting copies/loads.
2411  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2412  ++i, ++realArgIdx) {
2413  CCValAssign &VA = ArgLocs[i];
2414  SDValue Arg = OutVals[realArgIdx];
2415 
2416  // Promote the value if needed.
2417  switch (VA.getLocInfo()) {
2418  case CCValAssign::Full:
2419  break;
2420  case CCValAssign::BCvt:
2421  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2422  break;
2423  case CCValAssign::ZExt:
2424  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2425  break;
2426  case CCValAssign::SExt:
2427  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2428  break;
2429  case CCValAssign::AExt:
2430  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2431  break;
2432  case CCValAssign::FPExt:
2433  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2434  break;
2435  default:
2436  llvm_unreachable("Unknown loc info!");
2437  }
2438 
2439  if (VA.isRegLoc()) {
2440  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2441  } else {
2442  assert(VA.isMemLoc());
2443 
2444  SDValue DstAddr;
2445  MachinePointerInfo DstInfo;
2446 
2447  unsigned LocMemOffset = VA.getLocMemOffset();
2448  int32_t Offset = LocMemOffset;
2449 
2450  SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset);
2451 
2452  if (IsTailCall) {
2453  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2454  unsigned OpSize = Flags.isByVal() ?
2455  Flags.getByValSize() : VA.getValVT().getStoreSize();
2456 
2457  Offset = Offset + FPDiff;
2458  int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2459 
2460  DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT),
2461  StackPtr);
2462  DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2463 
2464  // Make sure any stack arguments overlapping with where we're storing
2465  // are loaded before this eventual operation. Otherwise they'll be
2466  // clobbered.
2467 
2468  // FIXME: Why is this really necessary? This seems to just result in a
2469  // lot of code to copy the stack arguments and write them back to the same
2470  // locations, which are supposed to be immutable?
2471  Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2472  } else {
2473  DstAddr = PtrOff;
2474  DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
2475  }
2476 
2477  if (Outs[i].Flags.isByVal()) {
2478  SDValue SizeNode =
2479  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2480  SDValue Cpy = DAG.getMemcpy(
2481  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2482  /*isVol = */ false, /*AlwaysInline = */ true,
2483  /*isTailCall = */ false, DstInfo,
2485  *DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))));
2486 
2487  MemOpChains.push_back(Cpy);
2488  } else {
2489  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
2490  MemOpChains.push_back(Store);
2491  }
2492  }
2493  }
2494 
2495  // Copy special input registers after user input arguments.
2496  passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
2497 
2498  if (!MemOpChains.empty())
2499  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2500 
2501  // Build a sequence of copy-to-reg nodes chained together with token chain
2502  // and flag operands which copy the outgoing args into the appropriate regs.
2503  SDValue InFlag;
2504  for (auto &RegToPass : RegsToPass) {
2505  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2506  RegToPass.second, InFlag);
2507  InFlag = Chain.getValue(1);
2508  }
2509 
2510 
2511  SDValue PhysReturnAddrReg;
2512  if (IsTailCall) {
2513  // Since the return is being combined with the call, we need to pass on the
2514  // return address.
2515 
2517  SDValue ReturnAddrReg = CreateLiveInRegister(
2518  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2519 
2520  PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2521  MVT::i64);
2522  Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2523  InFlag = Chain.getValue(1);
2524  }
2525 
2526  // We don't usually want to end the call-sequence here because we would tidy
2527  // the frame up *after* the call; however, in the ABI-changing tail-call case
2528  // we've carefully laid out the parameters so that when sp is reset they'll be
2529  // in the correct location.
2530  if (IsTailCall && !IsSibCall) {
2531  Chain = DAG.getCALLSEQ_END(Chain,
2532  DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2533  DAG.getTargetConstant(0, DL, MVT::i32),
2534  InFlag, DL);
2535  InFlag = Chain.getValue(1);
2536  }
2537 
2538  std::vector<SDValue> Ops;
2539  Ops.push_back(Chain);
2540  Ops.push_back(Callee);
2541 
2542  if (IsTailCall) {
2543  // Each tail call may have to adjust the stack by a different amount, so
2544  // this information must travel along with the operation for eventual
2545  // consumption by emitEpilogue.
2546  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
2547 
2548  Ops.push_back(PhysReturnAddrReg);
2549  }
2550 
2551  // Add argument registers to the end of the list so that they are known live
2552  // into the call.
2553  for (auto &RegToPass : RegsToPass) {
2554  Ops.push_back(DAG.getRegister(RegToPass.first,
2555  RegToPass.second.getValueType()));
2556  }
2557 
2558  // Add a register mask operand representing the call-preserved registers.
2559 
2561  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2562  assert(Mask && "Missing call preserved mask for calling convention");
2563  Ops.push_back(DAG.getRegisterMask(Mask));
2564 
2565  if (InFlag.getNode())
2566  Ops.push_back(InFlag);
2567 
2568  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2569 
2570  // If we're doing a tail call, use a TC_RETURN here rather than an
2571  // actual call instruction.
2572  if (IsTailCall) {
2573  MFI.setHasTailCall();
2574  return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
2575  }
2576 
2577  // Returns a chain and a flag for retval copy to use.
2578  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2579  Chain = Call.getValue(0);
2580  InFlag = Call.getValue(1);
2581 
2582  if (CallerSavedFP) {
2583  SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2584  Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2585  InFlag = Chain.getValue(1);
2586  }
2587 
2588  uint64_t CalleePopBytes = NumBytes;
2589  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
2590  DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2591  InFlag, DL);
2592  if (!Ins.empty())
2593  InFlag = Chain.getValue(1);
2594 
2595  // Handle result values, copying them out of physregs into vregs that we
2596  // return.
2597  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2598  InVals, IsThisReturn,
2599  IsThisReturn ? OutVals[0] : SDValue());
2600 }
2601 
2602 unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2603  SelectionDAG &DAG) const {
2604  unsigned Reg = StringSwitch<unsigned>(RegName)
2605  .Case("m0", AMDGPU::M0)
2606  .Case("exec", AMDGPU::EXEC)
2607  .Case("exec_lo", AMDGPU::EXEC_LO)
2608  .Case("exec_hi", AMDGPU::EXEC_HI)
2609  .Case("flat_scratch", AMDGPU::FLAT_SCR)
2610  .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2611  .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2612  .Default(AMDGPU::NoRegister);
2613 
2614  if (Reg == AMDGPU::NoRegister) {
2615  report_fatal_error(Twine("invalid register name \""
2616  + StringRef(RegName) + "\"."));
2617 
2618  }
2619 
2621  Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2622  report_fatal_error(Twine("invalid register \""
2623  + StringRef(RegName) + "\" for subtarget."));
2624  }
2625 
2626  switch (Reg) {
2627  case AMDGPU::M0:
2628  case AMDGPU::EXEC_LO:
2629  case AMDGPU::EXEC_HI:
2630  case AMDGPU::FLAT_SCR_LO:
2631  case AMDGPU::FLAT_SCR_HI:
2632  if (VT.getSizeInBits() == 32)
2633  return Reg;
2634  break;
2635  case AMDGPU::EXEC:
2636  case AMDGPU::FLAT_SCR:
2637  if (VT.getSizeInBits() == 64)
2638  return Reg;
2639  break;
2640  default:
2641  llvm_unreachable("missing register type checking");
2642  }
2643 
2644  report_fatal_error(Twine("invalid type for register \""
2645  + StringRef(RegName) + "\"."));
2646 }
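// Illustrative note (editor's addition, not part of the original source): this
// hook is what backs the llvm.read_register / llvm.write_register intrinsics.
// A named-register read such as
//
//   %exec = call i64 @llvm.read_register.i64(metadata !0)   ; !0 = !{!"exec"}
//
// is resolved to the 64-bit EXEC register by the StringSwitch and size checks
// above; unknown names or mismatched widths hit the report_fatal_error paths.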
2647 
2648 // If kill is not the last instruction, split the block so kill is always a
2649 // proper terminator.
2651  MachineBasicBlock *BB) const {
2652  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2653 
2654  MachineBasicBlock::iterator SplitPoint(&MI);
2655  ++SplitPoint;
2656 
2657  if (SplitPoint == BB->end()) {
2658  // Don't bother with a new block.
2660  return BB;
2661  }
2662 
2663  MachineFunction *MF = BB->getParent();
2664  MachineBasicBlock *SplitBB
2666 
2667  MF->insert(++MachineFunction::iterator(BB), SplitBB);
2668  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2669 
2670  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
2671  BB->addSuccessor(SplitBB);
2672 
2674  return SplitBB;
2675 }
2676 
2677 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2678 // wavefront. If the value is uniform and just happens to be in a VGPR, this
2679 // will only do one iteration. In the worst case, this will loop 64 times.
2680 //
2681 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
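// Illustrative sketch (editor's addition, not part of the original source): in
// rough pseudo-MIR, the waterfall loop built below looks like this for the
// movrel (non-GPR-index-mode) path:
//
//   loop:
//     %cur  = V_READFIRSTLANE_B32 %idx    ; pick one active lane's index
//     %cond = V_CMP_EQ_U32 %cur, %idx     ; select all lanes with that index
//     %save = S_AND_SAVEEXEC_B64 %cond    ; save EXEC, then mask it to them
//     S_MOV_B32 m0, %cur                  ; (or S_ADD_I32 m0, %cur, Offset)
//     ... indirect access using m0, inserted by the caller ...
//     S_XOR_B64 exec, exec, %save         ; turn off the lanes just handled
//     S_CBRANCH_EXECNZ loop               ; repeat while any lane remains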
2683  const SIInstrInfo *TII,
2685  MachineBasicBlock &OrigBB,
2686  MachineBasicBlock &LoopBB,
2687  const DebugLoc &DL,
2688  const MachineOperand &IdxReg,
2689  unsigned InitReg,
2690  unsigned ResultReg,
2691  unsigned PhiReg,
2692  unsigned InitSaveExecReg,
2693  int Offset,
2694  bool UseGPRIdxMode,
2695  bool IsIndirectSrc) {
2696  MachineBasicBlock::iterator I = LoopBB.begin();
2697 
2698  unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2699  unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2700  unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2701  unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2702 
2703  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2704  .addReg(InitReg)
2705  .addMBB(&OrigBB)
2706  .addReg(ResultReg)
2707  .addMBB(&LoopBB);
2708 
2709  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2710  .addReg(InitSaveExecReg)
2711  .addMBB(&OrigBB)
2712  .addReg(NewExec)
2713  .addMBB(&LoopBB);
2714 
2715  // Read the next variant; this is also the loop target.
2716  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2717  .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2718 
2719  // Compare the just read M0 value to all possible Idx values.
2720  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2721  .addReg(CurrentIdxReg)
2722  .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
2723 
2724  // Update EXEC, saving the original EXEC value.
2725  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2726  .addReg(CondReg, RegState::Kill);
2727 
2728  MRI.setSimpleHint(NewExec, CondReg);
2729 
2730  if (UseGPRIdxMode) {
2731  unsigned IdxReg;
2732  if (Offset == 0) {
2733  IdxReg = CurrentIdxReg;
2734  } else {
2735  IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2736  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2737  .addReg(CurrentIdxReg, RegState::Kill)
2738  .addImm(Offset);
2739  }
2740  unsigned IdxMode = IsIndirectSrc ?
2742  MachineInstr *SetOn =
2743  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2744  .addReg(IdxReg, RegState::Kill)
2745  .addImm(IdxMode);
2746  SetOn->getOperand(3).setIsUndef();
2747  } else {
2748  // Move index from VCC into M0
2749  if (Offset == 0) {
2750  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2751  .addReg(CurrentIdxReg, RegState::Kill);
2752  } else {
2753  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2754  .addReg(CurrentIdxReg, RegState::Kill)
2755  .addImm(Offset);
2756  }
2757  }
2758 
2759  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
2760  MachineInstr *InsertPt =
2761  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
2762  .addReg(AMDGPU::EXEC)
2763  .addReg(NewExec);
2764 
2765  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2766  // s_cbranch_scc0?
2767 
2768  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2769  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2770  .addMBB(&LoopBB);
2771 
2772  return InsertPt->getIterator();
2773 }
2774 
2775 // This has slightly sub-optimal regalloc when the source vector is killed by
2776 // the read. The register allocator does not understand that the kill is
2777 // per-workitem, so the register is kept alive for the whole loop and we end up
2778 // not reusing a subregister from it, using one more VGPR than necessary. That
2779 // VGPR was saved when this was expanded after register allocation.
2781  MachineBasicBlock &MBB,
2782  MachineInstr &MI,
2783  unsigned InitResultReg,
2784  unsigned PhiReg,
2785  int Offset,
2786  bool UseGPRIdxMode,
2787  bool IsIndirectSrc) {
2788  MachineFunction *MF = MBB.getParent();
2790  const DebugLoc &DL = MI.getDebugLoc();
2792 
2793  unsigned DstReg = MI.getOperand(0).getReg();
2794  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2795  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2796 
2797  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
2798 
2799  // Save the EXEC mask
2800  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
2801  .addReg(AMDGPU::EXEC);
2802 
2803  // To insert the loop we need to split the block. Move everything after this
2804  // point to a new block, and insert a new empty block between the two.
2806  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
2807  MachineFunction::iterator MBBI(MBB);
2808  ++MBBI;
2809 
2810  MF->insert(MBBI, LoopBB);
2811  MF->insert(MBBI, RemainderBB);
2812 
2813  LoopBB->addSuccessor(LoopBB);
2814  LoopBB->addSuccessor(RemainderBB);
2815 
2816  // Move the rest of the block into a new block.
2817  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
2818  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
2819 
2820  MBB.addSuccessor(LoopBB);
2821 
2822  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2823 
2824  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
2825  InitResultReg, DstReg, PhiReg, TmpExec,
2826  Offset, UseGPRIdxMode, IsIndirectSrc);
2827 
2828  MachineBasicBlock::iterator First = RemainderBB->begin();
2829  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
2830  .addReg(SaveExec);
2831 
2832  return InsPt;
2833 }
2834 
2835 // Returns subreg index, offset
2836 static std::pair<unsigned, int>
2838  const TargetRegisterClass *SuperRC,
2839  unsigned VecReg,
2840  int Offset) {
2841  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
2842 
2843  // Skip out of bounds offsets, or else we would end up using an undefined
2844  // register.
2845  if (Offset >= NumElts || Offset < 0)
2846  return std::make_pair(AMDGPU::sub0, Offset);
2847 
2848  return std::make_pair(AMDGPU::sub0 + Offset, 0);
2849 }
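// Worked example (editor's addition): for a 128-bit vector register class,
// NumElts is 4, so a constant Offset of 2 yields (sub2, 0) and the access
// folds into a static subregister index. An Offset of 5 is out of bounds and
// yields (sub0, 5); the callers then apply the remaining offset dynamically
// through M0 or the GPR indexing mode.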
2850 
2851 // Return true if the index is an SGPR and was set.
2854  MachineInstr &MI,
2855  int Offset,
2856  bool UseGPRIdxMode,
2857  bool IsIndirectSrc) {
2858  MachineBasicBlock *MBB = MI.getParent();
2859  const DebugLoc &DL = MI.getDebugLoc();
2861 
2862  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2863  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
2864 
2865  assert(Idx->getReg() != AMDGPU::NoRegister);
2866 
2867  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
2868  return false;
2869 
2870  if (UseGPRIdxMode) {
2871  unsigned IdxMode = IsIndirectSrc ?
2873  if (Offset == 0) {
2874  MachineInstr *SetOn =
2875  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2876  .add(*Idx)
2877  .addImm(IdxMode);
2878 
2879  SetOn->getOperand(3).setIsUndef();
2880  } else {
2881  unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2882  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
2883  .add(*Idx)
2884  .addImm(Offset);
2885  MachineInstr *SetOn =
2886  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2887  .addReg(Tmp, RegState::Kill)
2888  .addImm(IdxMode);
2889 
2890  SetOn->getOperand(3).setIsUndef();
2891  }
2892 
2893  return true;
2894  }
2895 
2896  if (Offset == 0) {
2897  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2898  .add(*Idx);
2899  } else {
2900  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2901  .add(*Idx)
2902  .addImm(Offset);
2903  }
2904 
2905  return true;
2906 }
2907 
2908 // Control flow needs to be inserted if indexing with a VGPR.
2910  MachineBasicBlock &MBB,
2911  const SISubtarget &ST) {
2912  const SIInstrInfo *TII = ST.getInstrInfo();
2913  const SIRegisterInfo &TRI = TII->getRegisterInfo();
2914  MachineFunction *MF = MBB.getParent();
2916 
2917  unsigned Dst = MI.getOperand(0).getReg();
2918  unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
2919  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
2920 
2921  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
2922 
2923  unsigned SubReg;
2924  std::tie(SubReg, Offset)
2925  = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
2926 
2927  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
2928 
2929  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
2931  const DebugLoc &DL = MI.getDebugLoc();
2932 
2933  if (UseGPRIdxMode) {
2934  // TODO: Look at the uses to avoid the copy. This may require rescheduling
2935  // to avoid interfering with other uses, so probably requires a new
2936  // optimization pass.
2937  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
2938  .addReg(SrcReg, RegState::Undef, SubReg)
2939  .addReg(SrcReg, RegState::Implicit)
2940  .addReg(AMDGPU::M0, RegState::Implicit);
2941  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2942  } else {
2943  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
2944  .addReg(SrcReg, RegState::Undef, SubReg)
2945  .addReg(SrcReg, RegState::Implicit);
2946  }
2947 
2948  MI.eraseFromParent();
2949 
2950  return &MBB;
2951  }
2952 
2953  const DebugLoc &DL = MI.getDebugLoc();
2955 
2956  unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2957  unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2958 
2959  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
2960 
2961  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
2962  Offset, UseGPRIdxMode, true);
2963  MachineBasicBlock *LoopBB = InsPt->getParent();
2964 
2965  if (UseGPRIdxMode) {
2966  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
2967  .addReg(SrcReg, RegState::Undef, SubReg)
2968  .addReg(SrcReg, RegState::Implicit)
2969  .addReg(AMDGPU::M0, RegState::Implicit);
2970  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2971  } else {
2972  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
2973  .addReg(SrcReg, RegState::Undef, SubReg)
2974  .addReg(SrcReg, RegState::Implicit);
2975  }
2976 
2977  MI.eraseFromParent();
2978 
2979  return LoopBB;
2980 }
2981 
2982 static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
2983  const TargetRegisterClass *VecRC) {
2984  switch (TRI.getRegSizeInBits(*VecRC)) {
2985  case 32: // 4 bytes
2986  return AMDGPU::V_MOVRELD_B32_V1;
2987  case 64: // 8 bytes
2988  return AMDGPU::V_MOVRELD_B32_V2;
2989  case 128: // 16 bytes
2990  return AMDGPU::V_MOVRELD_B32_V4;
2991  case 256: // 32 bytes
2992  return AMDGPU::V_MOVRELD_B32_V8;
2993  case 512: // 64 bytes
2994  return AMDGPU::V_MOVRELD_B32_V16;
2995  default:
2996  llvm_unreachable("unsupported size for MOVRELD pseudos");
2997  }
2998 }
2999 
3001  MachineBasicBlock &MBB,
3002  const SISubtarget &ST) {
3003  const SIInstrInfo *TII = ST.getInstrInfo();
3004  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3005  MachineFunction *MF = MBB.getParent();
3007 
3008  unsigned Dst = MI.getOperand(0).getReg();
3009  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3010  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3011  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3012  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3013  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3014 
3015  // This can be an immediate, but will be folded later.
3016  assert(Val->getReg());
3017 
3018  unsigned SubReg;
3019  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3020  SrcVec->getReg(),
3021  Offset);
3022  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3023 
3024  if (Idx->getReg() == AMDGPU::NoRegister) {
3026  const DebugLoc &DL = MI.getDebugLoc();
3027 
3028  assert(Offset == 0);
3029 
3030  BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3031  .add(*SrcVec)
3032  .add(*Val)
3033  .addImm(SubReg);
3034 
3035  MI.eraseFromParent();
3036  return &MBB;
3037  }
3038 
3039  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
3041  const DebugLoc &DL = MI.getDebugLoc();
3042 
3043  if (UseGPRIdxMode) {
3044  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3045  .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
3046  .add(*Val)
3047  .addReg(Dst, RegState::ImplicitDefine)
3048  .addReg(SrcVec->getReg(), RegState::Implicit)
3049  .addReg(AMDGPU::M0, RegState::Implicit);
3050 
3051  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3052  } else {
3053  const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3054 
3055  BuildMI(MBB, I, DL, MovRelDesc)
3056  .addReg(Dst, RegState::Define)
3057  .addReg(SrcVec->getReg())
3058  .add(*Val)
3059  .addImm(SubReg - AMDGPU::sub0);
3060  }
3061 
3062  MI.eraseFromParent();
3063  return &MBB;
3064  }
3065 
3066  if (Val->isReg())
3067  MRI.clearKillFlags(Val->getReg());
3068 
3069  const DebugLoc &DL = MI.getDebugLoc();
3070 
3071  unsigned PhiReg = MRI.createVirtualRegister(VecRC);
3072 
3073  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
3074  Offset, UseGPRIdxMode, false);
3075  MachineBasicBlock *LoopBB = InsPt->getParent();
3076 
3077  if (UseGPRIdxMode) {
3078  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3079  .addReg(PhiReg, RegState::Undef, SubReg) // vdst
3080  .add(*Val) // src0
3082  .addReg(PhiReg, RegState::Implicit)
3083  .addReg(AMDGPU::M0, RegState::Implicit);
3084  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3085  } else {
3086  const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3087 
3088  BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
3089  .addReg(Dst, RegState::Define)
3090  .addReg(PhiReg)
3091  .add(*Val)
3092  .addImm(SubReg - AMDGPU::sub0);
3093  }
3094 
3095  MI.eraseFromParent();
3096 
3097  return LoopBB;
3098 }
3099 
3101  MachineInstr &MI, MachineBasicBlock *BB) const {
3102 
3103  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3104  MachineFunction *MF = BB->getParent();
3106 
3107  if (TII->isMIMG(MI)) {
3108  if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
3109  report_fatal_error("missing mem operand from MIMG instruction");
3110  }
3111  // Add a memoperand for mimg instructions so that they aren't assumed to
3112  // be ordered memory instructions.
3113 
3114  return BB;
3115  }
3116 
3117  switch (MI.getOpcode()) {
3118  case AMDGPU::S_ADD_U64_PSEUDO:
3119  case AMDGPU::S_SUB_U64_PSEUDO: {
3121  const DebugLoc &DL = MI.getDebugLoc();
3122 
3123  MachineOperand &Dest = MI.getOperand(0);
3124  MachineOperand &Src0 = MI.getOperand(1);
3125  MachineOperand &Src1 = MI.getOperand(2);
3126 
3127  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3128  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3129 
3130  MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3131  Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3132  &AMDGPU::SReg_32_XM0RegClass);
3133  MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3134  Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3135  &AMDGPU::SReg_32_XM0RegClass);
3136 
3137  MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3138  Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3139  &AMDGPU::SReg_32_XM0RegClass);
3140  MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3141  Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3142  &AMDGPU::SReg_32_XM0RegClass);
3143 
3144  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3145 
3146  unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3147  unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3148  BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3149  .add(Src0Sub0)
3150  .add(Src1Sub0);
3151  BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3152  .add(Src0Sub1)
3153  .add(Src1Sub1);
3154  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3155  .addReg(DestSub0)
3156  .addImm(AMDGPU::sub0)
3157  .addReg(DestSub1)
3158  .addImm(AMDGPU::sub1);
3159  MI.eraseFromParent();
3160  return BB;
3161  }
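  // Illustrative expansion (editor's addition): the case above rewrites, e.g.,
  //   %dst:sreg_64 = S_ADD_U64_PSEUDO %a, %b
  // into
  //   %lo  = S_ADD_U32  %a.sub0, %b.sub0   ; sets SCC to the carry-out
  //   %hi  = S_ADDC_U32 %a.sub1, %b.sub1   ; consumes SCC as the carry-in
  //   %dst = REG_SEQUENCE %lo, sub0, %hi, sub1
  // and the SUB pseudo uses S_SUB_U32 / S_SUBB_U32 in the same pattern.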
3162  case AMDGPU::SI_INIT_M0: {
3163  BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
3164  TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3165  .add(MI.getOperand(0));
3166  MI.eraseFromParent();
3167  return BB;
3168  }
3169  case AMDGPU::SI_INIT_EXEC:
3170  // This should be before all vector instructions.
3171  BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3172  AMDGPU::EXEC)
3173  .addImm(MI.getOperand(0).getImm());
3174  MI.eraseFromParent();
3175  return BB;
3176 
3177  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3178  // Extract the thread count from an SGPR input and set EXEC accordingly.
3179  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3180  //
3181  // S_BFE_U32 count, input, {shift, 7}
3182  // S_BFM_B64 exec, count, 0
3183  // S_CMP_EQ_U32 count, 64
3184  // S_CMOV_B64 exec, -1
3185  MachineInstr *FirstMI = &*BB->begin();
3187  unsigned InputReg = MI.getOperand(0).getReg();
3188  unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3189  bool Found = false;
3190 
3191  // Move the COPY of the input reg to the beginning, so that we can use it.
3192  for (auto I = BB->begin(); I != &MI; I++) {
3193  if (I->getOpcode() != TargetOpcode::COPY ||
3194  I->getOperand(0).getReg() != InputReg)
3195  continue;
3196 
3197  if (I == FirstMI) {
3198  FirstMI = &*++BB->begin();
3199  } else {
3200  I->removeFromParent();
3201  BB->insert(FirstMI, &*I);
3202  }
3203  Found = true;
3204  break;
3205  }
3206  assert(Found);
3207  (void)Found;
3208 
3209  // This should be before all vector instructions.
3210  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3211  .addReg(InputReg)
3212  .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3213  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3214  AMDGPU::EXEC)
3215  .addReg(CountReg)
3216  .addImm(0);
3217  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3218  .addReg(CountReg, RegState::Kill)
3219  .addImm(64);
3220  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3221  AMDGPU::EXEC)
3222  .addImm(-1);
3223  MI.eraseFromParent();
3224  return BB;
3225  }
3226 
3227  case AMDGPU::GET_GROUPSTATICSIZE: {
3228  DebugLoc DL = MI.getDebugLoc();
3229  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
3230  .add(MI.getOperand(0))
3231  .addImm(MFI->getLDSSize());
3232  MI.eraseFromParent();
3233  return BB;
3234  }
3235  case AMDGPU::SI_INDIRECT_SRC_V1:
3236  case AMDGPU::SI_INDIRECT_SRC_V2:
3237  case AMDGPU::SI_INDIRECT_SRC_V4:
3238  case AMDGPU::SI_INDIRECT_SRC_V8:
3239  case AMDGPU::SI_INDIRECT_SRC_V16:
3240  return emitIndirectSrc(MI, *BB, *getSubtarget());
3241  case AMDGPU::SI_INDIRECT_DST_V1:
3242  case AMDGPU::SI_INDIRECT_DST_V2:
3243  case AMDGPU::SI_INDIRECT_DST_V4:
3244  case AMDGPU::SI_INDIRECT_DST_V8:
3245  case AMDGPU::SI_INDIRECT_DST_V16:
3246  return emitIndirectDst(MI, *BB, *getSubtarget());
3247  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3248  case AMDGPU::SI_KILL_I1_PSEUDO:
3249  return splitKillBlock(MI, BB);
3250  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3252 
3253  unsigned Dst = MI.getOperand(0).getReg();
3254  unsigned Src0 = MI.getOperand(1).getReg();
3255  unsigned Src1 = MI.getOperand(2).getReg();
3256  const DebugLoc &DL = MI.getDebugLoc();
3257  unsigned SrcCond = MI.getOperand(3).getReg();
3258 
3259  unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3260  unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3261  unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3262 
3263  BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3264  .addReg(SrcCond);
3265  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3266  .addReg(Src0, 0, AMDGPU::sub0)
3267  .addReg(Src1, 0, AMDGPU::sub0)
3268  .addReg(SrcCondCopy);
3269  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3270  .addReg(Src0, 0, AMDGPU::sub1)
3271  .addReg(Src1, 0, AMDGPU::sub1)
3272  .addReg(SrcCondCopy);
3273 
3274  BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3275  .addReg(DstLo)
3276  .addImm(AMDGPU::sub0)
3277  .addReg(DstHi)
3278  .addImm(AMDGPU::sub1);
3279  MI.eraseFromParent();
3280  return BB;
3281  }
3282  case AMDGPU::SI_BR_UNDEF: {
3283  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3284  const DebugLoc &DL = MI.getDebugLoc();
3285  MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3286  .add(MI.getOperand(0));
3287  Br->getOperand(1).setIsUndef(true); // read undef SCC
3288  MI.eraseFromParent();
3289  return BB;
3290  }
3291  case AMDGPU::ADJCALLSTACKUP:
3292  case AMDGPU::ADJCALLSTACKDOWN: {
3294  MachineInstrBuilder MIB(*MF, &MI);
3295 
3296  // Add an implicit use of the frame offset reg to prevent the restore copy
3297  // inserted after the call from being reordered after stack operations in
3298  // the caller's frame.
3299  MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
3300  .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3301  .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
3302  return BB;
3303  }
3304  case AMDGPU::SI_CALL_ISEL:
3305  case AMDGPU::SI_TCRETURN_ISEL: {
3306  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3307  const DebugLoc &DL = MI.getDebugLoc();
3308  unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
3309 
3311  unsigned GlobalAddrReg = MI.getOperand(0).getReg();
3312  MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
3313  assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
3314 
3315  const GlobalValue *G = PCRel->getOperand(1).getGlobal();
3316 
3317  MachineInstrBuilder MIB;
3318  if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
3319  MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
3320  .add(MI.getOperand(0))
3321  .addGlobalAddress(G);
3322  } else {
3323  MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
3324  .add(MI.getOperand(0))
3325  .addGlobalAddress(G);
3326 
3327  // There is an additional imm operand for tcreturn, but it should be in the
3328  // right place already.
3329  }
3330 
3331  for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3332  MIB.add(MI.getOperand(I));
3333 
3335  MI.eraseFromParent();
3336  return BB;
3337  }
3338  default:
3340  }
3341 }
3342 
3344  return isTypeLegal(VT.getScalarType());
3345 }
3346 
3348  // This currently forces unfolding various combinations of fsub into fma with
3349  // free fneg'd operands. As long as we have fast FMA (controlled by
3350  // isFMAFasterThanFMulAndFAdd), we should perform these.
3351 
3352  // When fma is quarter rate, for f64 where add / sub are at best half rate,
3353  // most of these combines appear to be cycle neutral but save on instruction
3354  // count / code size.
3355  return true;
3356 }
3357 
3359  EVT VT) const {
3360  if (!VT.isVector()) {
3361  return MVT::i1;
3362  }
3363  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
3364 }
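// Editor's note (illustrative): a scalar setcc therefore produces an i1, while
// a setcc on e.g. v2f32 produces v2i1, one condition bit per vector lane.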
3365 
3367  // TODO: Should i16 be used always if legal? For now it would force VALU
3368  // shifts.
3369  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
3370 }
3371 
3372 // Answering this is somewhat tricky and depends on the specific device, since
3373 // devices have different rates for fma or for all f64 operations.
3374 //
3375 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3376 // regardless of which device (although the number of cycles differs between
3377 // devices), so it is always profitable for f64.
3378 //
3379 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3380 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
3381 // which we can always do even without fused FP ops since it returns the same
3382 // result as the separate operations and since it is always full
3383 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3384 // however does not support denormals, so we do report fma as faster if we have
3385 // a fast fma device and require denormals.
3386 //
3388  VT = VT.getScalarType();
3389 
3390  switch (VT.getSimpleVT().SimpleTy) {
3391  case MVT::f32: {
3392  // This is as fast on some subtargets. However, we always have full rate f32
3393 // mad available, which returns the same result as the separate operations and
3394 // which we should prefer over fma. We can't use this if we want to support
3395  // denormals, so only report this in these cases.
3396  if (Subtarget->hasFP32Denormals())
3397  return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
3398 
3399  // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3400  return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
3401  }
3402  case MVT::f64:
3403  return true;
3404  case MVT::f16:
3406  default:
3407  break;
3408  }
3409 
3410  return false;
3411 }
3412 
3413 //===----------------------------------------------------------------------===//
3414 // Custom DAG Lowering Operations
3415 //===----------------------------------------------------------------------===//
3416 
3417 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3418 // wider vector type is legal.
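// Illustrative example (editor's addition): an fneg of v4f16 is lowered here
// as concat_vectors (fneg v2f16 lo), (fneg v2f16 hi); splitBinaryVectorOp
// below applies the same halving to both operands of two-operand ops.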
3420  SelectionDAG &DAG) const {
3421  unsigned Opc = Op.getOpcode();
3422  EVT VT = Op.getValueType();
3423  assert(VT == MVT::v4f16);
3424 
3425  SDValue Lo, Hi;
3426  std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
3427 
3428  SDLoc SL(Op);
3429  SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
3430  Op->getFlags());
3431  SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
3432  Op->getFlags());
3433 
3434  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3435 }
3436 
3437 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3438 // wider vector type is legal.
3440  SelectionDAG &DAG) const {
3441  unsigned Opc = Op.getOpcode();
3442  EVT VT = Op.getValueType();
3443  assert(VT == MVT::v4i16 || VT == MVT::v4f16);
3444 
3445  SDValue Lo0, Hi0;
3446  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
3447  SDValue Lo1, Hi1;
3448  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
3449 
3450  SDLoc SL(Op);
3451 
3452  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
3453  Op->getFlags());
3454  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
3455  Op->getFlags());
3456 
3457  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3458 }
3459 
3461  switch (Op.getOpcode()) {
3462  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
3463  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
3464  case ISD::LOAD: {
3465  SDValue Result = LowerLOAD(Op, DAG);
3466  assert((!Result.getNode() ||
3467  Result.getNode()->getNumValues() == 2) &&
3468  "Load should return a value and a chain");
3469  return Result;
3470  }
3471 
3472  case ISD::FSIN:
3473  case ISD::FCOS:
3474  return LowerTrig(Op, DAG);
3475  case ISD::SELECT: return LowerSELECT(Op, DAG);
3476  case ISD::FDIV: return LowerFDIV(Op, DAG);
3477  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
3478  case ISD::STORE: return LowerSTORE(Op, DAG);
3479  case ISD::GlobalAddress: {
3480  MachineFunction &MF = DAG.getMachineFunction();
3482  return LowerGlobalAddress(MFI, Op, DAG);
3483  }
3484  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3485  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
3486  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
3487  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
3489  return lowerINSERT_VECTOR_ELT(Op, DAG);
3491  return lowerEXTRACT_VECTOR_ELT(Op, DAG);
3492  case ISD::BUILD_VECTOR:
3493  return lowerBUILD_VECTOR(Op, DAG);
3494  case ISD::FP_ROUND:
3495  return lowerFP_ROUND(Op, DAG);
3496  case ISD::TRAP:
3497  return lowerTRAP(Op, DAG);
3498  case ISD::DEBUGTRAP:
3499  return lowerDEBUGTRAP(Op, DAG);
3500  case ISD::FABS:
3501  case ISD::FNEG:
3502  return splitUnaryVectorOp(Op, DAG);
3503  case ISD::SHL:
3504  case ISD::SRA:
3505  case ISD::SRL:
3506  case ISD::ADD:
3507  case ISD::SUB:
3508  case ISD::MUL:
3509  case ISD::SMIN:
3510  case ISD::SMAX:
3511  case ISD::UMIN:
3512  case ISD::UMAX:
3513  case ISD::FMINNUM:
3514  case ISD::FMAXNUM:
3515  case ISD::FADD:
3516  case ISD::FMUL:
3517  return splitBinaryVectorOp(Op, DAG);
3518  }
3519  return SDValue();
3520 }
3521 
3523  const SDLoc &DL,
3524  SelectionDAG &DAG, bool Unpacked) {
3525  if (!LoadVT.isVector())
3526  return Result;
3527 
3528  if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3529  // Truncate to v2i16/v4i16.
3530  EVT IntLoadVT = LoadVT.changeTypeToInteger();
3531 
3532  // Workaround legalizer not scalarizing truncate after vector op
3533  // legalization by not creating an intermediate vector trunc.
3535  DAG.ExtractVectorElements(Result, Elts);
3536  for (SDValue &Elt : Elts)
3537  Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3538 
3539  Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3540 
3541  // Bitcast to original type (v2f16/v4f16).
3542  return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3543  }
3544 
3545  // Cast back to the original packed type.
3546  return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3547 }
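// Editor's note (illustrative): on subtargets with unpacked D16 memory
// instructions a v2f16 load comes back as v2i32, one 16-bit value per 32-bit
// element. The helper above truncates each element to i16, rebuilds the
// integer vector, and bitcasts it to v2f16; packed-D16 subtargets only need
// the final bitcast.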
3548 
3549 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3550  MemSDNode *M,
3551  SelectionDAG &DAG,
3552  bool IsIntrinsic) const {
3553  SDLoc DL(M);
3555  Ops.reserve(M->getNumOperands());
3556 
3557  Ops.push_back(M->getOperand(0));
3558  if (IsIntrinsic)
3559  Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32));
3560 
3561  // Skip 1, as it is the intrinsic ID.
3562  for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I)
3563  Ops.push_back(M->getOperand(I));
3564 
3565  bool Unpacked = Subtarget->hasUnpackedD16VMem();
3566  EVT LoadVT = M->getValueType(0);
3567 
3568  EVT EquivLoadVT = LoadVT;
3569  if (Unpacked && LoadVT.isVector()) {
3570  EquivLoadVT = LoadVT.isVector() ?
3572  LoadVT.getVectorNumElements()) : LoadVT;
3573  }
3574 
3575  // Change from v4f16/v2f16 to EquivLoadVT.
3576  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3577 
3578  SDValue Load
3579  = DAG.getMemIntrinsicNode(
3580  IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3581  VTList, Ops, M->getMemoryVT(),
3582  M->getMemOperand());
3583  if (!Unpacked) // Just adjusted the opcode.
3584  return Load;
3585 
3586  SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
3587 
3588  return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
3589 }
3590 
3593  SelectionDAG &DAG) const {
3594  switch (N->getOpcode()) {
3595  case ISD::INSERT_VECTOR_ELT: {
3596  if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3597  Results.push_back(Res);
3598  return;
3599  }
3600  case ISD::EXTRACT_VECTOR_ELT: {
3601  if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3602  Results.push_back(Res);
3603  return;
3604  }
3605  case ISD::INTRINSIC_WO_CHAIN: {
3606  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3607  switch (IID) {
3608  case Intrinsic::amdgcn_cvt_pkrtz: {
3609  SDValue Src0 = N->getOperand(1);
3610  SDValue Src1 = N->getOperand(2);
3611  SDLoc SL(N);
3613  Src0, Src1);
3614  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3615  return;
3616  }
3617  case Intrinsic::amdgcn_cvt_pknorm_i16:
3618  case Intrinsic::amdgcn_cvt_pknorm_u16:
3619  case Intrinsic::amdgcn_cvt_pk_i16:
3620  case Intrinsic::amdgcn_cvt_pk_u16: {
3621  SDValue Src0 = N->getOperand(1);
3622  SDValue Src1 = N->getOperand(2);
3623  SDLoc SL(N);
3624  unsigned Opcode;
3625 
3626  if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3628  else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3630  else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3631  Opcode = AMDGPUISD::CVT_PK_I16_I32;
3632  else
3633  Opcode = AMDGPUISD::CVT_PK_U16_U32;
3634 
3635  SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3636  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3637  return;
3638  }
3639  }
3640  break;
3641  }
3642  case ISD::INTRINSIC_W_CHAIN: {
3643  if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
3644  Results.push_back(Res);
3645  Results.push_back(Res.getValue(1));
3646  return;
3647  }
3648 
3649  break;
3650  }
3651  case ISD::SELECT: {
3652  SDLoc SL(N);
3653  EVT VT = N->getValueType(0);
3654  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3655  SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3656  SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3657 
3658  EVT SelectVT = NewVT;
3659  if (NewVT.bitsLT(MVT::i32)) {
3660  LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3661  RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3662  SelectVT = MVT::i32;
3663  }
3664 
3665  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3666  N->getOperand(0), LHS, RHS);
3667 
3668  if (NewVT != SelectVT)
3669  NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3670  Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3671  return;
3672  }
3673  case ISD::FNEG: {
3674  if (N->getValueType(0) != MVT::v2f16)
3675  break;
3676 
3677  SDLoc SL(N);
3678  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3679 
3680  SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
3681  BC,
3682  DAG.getConstant(0x80008000, SL, MVT::i32));
3683  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3684  return;
3685  }
3686  case ISD::FABS: {
3687  if (N->getValueType(0) != MVT::v2f16)
3688  break;
3689 
3690  SDLoc SL(N);
3691  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3692 
3693  SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
3694  BC,
3695  DAG.getConstant(0x7fff7fff, SL, MVT::i32));
3696  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3697  return;
3698  }
3699  default:
3700  break;
3701  }
3702 }
3703 
3704 /// Helper function for LowerBRCOND
3705 static SDNode *findUser(SDValue Value, unsigned Opcode) {
3706 
3707  SDNode *Parent = Value.getNode();
3708  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3709  I != E; ++I) {
3710 
3711  if (I.getUse().get() != Value)
3712  continue;
3713 
3714  if (I->getOpcode() == Opcode)
3715  return *I;
3716  }
3717  return nullptr;
3718 }
3719 
3720 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
3721  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
3722  switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
3723  case Intrinsic::amdgcn_if:
3724  return AMDGPUISD::IF;
3725  case Intrinsic::amdgcn_else:
3726  return AMDGPUISD::ELSE;
3727  case Intrinsic::amdgcn_loop:
3728  return AMDGPUISD::LOOP;
3729  case Intrinsic::amdgcn_end_cf:
3730  llvm_unreachable("should not occur");
3731  default:
3732  return 0;
3733  }
3734  }
3735 
3736  // break, if_break, else_break are all only used as inputs to loop, not
3737  // directly as branch conditions.
3738  return 0;
3739 }
3740 
3741 void SITargetLowering::createDebuggerPrologueStackObjects(
3742  MachineFunction &MF) const {
3743  // Create stack objects that are used for emitting debugger prologue.
3744  //
3745  // Debugger prologue writes work group IDs and work item IDs to scratch memory
3746  // at a fixed location in the following format:
3747  // offset 0: work group ID x
3748  // offset 4: work group ID y
3749  // offset 8: work group ID z
3750  // offset 16: work item ID x
3751  // offset 20: work item ID y
3752  // offset 24: work item ID z
3754  int ObjectIdx = 0;
3755 
3756  // For each dimension:
3757  for (unsigned i = 0; i < 3; ++i) {
3758  // Create fixed stack object for work group ID.
3759  ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
3760  Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
3761  // Create fixed stack object for work item ID.
3762  ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
3763  Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
3764  }
3765 }
3766 
3767 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
3768  const Triple &TT = getTargetMachine().getTargetTriple();
3769  return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
3772 }
3773 
3774 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
3775  return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
3778  !shouldEmitFixup(GV) &&
3780 }
3781 
3782 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
3783  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
3784 }
3785 
3786 /// This transforms the control flow intrinsics to get the branch destination as
3787 /// the last parameter, and also switches the branch target with BR if the need arises.
3788 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
3789  SelectionDAG &DAG) const {
3790  SDLoc DL(BRCOND);
3791 
3792  SDNode *Intr = BRCOND.getOperand(1).getNode();
3793  SDValue Target = BRCOND.getOperand(2);
3794  SDNode *BR = nullptr;
3795  SDNode *SetCC = nullptr;
3796 
3797  if (Intr->getOpcode() == ISD::SETCC) {
3798  // As long as we negate the condition everything is fine
3799  SetCC = Intr;
3800  Intr = SetCC->getOperand(0).getNode();
3801 
3802  } else {
3803  // Get the target from BR if we don't negate the condition
3804  BR = findUser(BRCOND, ISD::BR);
3805  Target = BR->getOperand(1);
3806  }
3807 
3808  // FIXME: This changes the types of the intrinsics instead of introducing new
3809  // nodes with the correct types.
3810  // e.g. llvm.amdgcn.loop
3811 
3812  // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
3813  // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
3814 
3815  unsigned CFNode = isCFIntrinsic(Intr);
3816  if (CFNode == 0) {
3817  // This is a uniform branch so we don't need to legalize.
3818  return BRCOND;
3819  }
3820 
3821  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
3822  Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
3823 
3824  assert(!SetCC ||
3825  (SetCC->getConstantOperandVal(1) == 1 &&
3826  cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
3827  ISD::SETNE));
3828 
3829  // operands of the new intrinsic call
3830  SmallVector<SDValue, 8> Ops;
3831  if (HaveChain)
3832  Ops.push_back(BRCOND.getOperand(0));
3833 
3834  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
3835  Ops.push_back(Target);
3836 
3837  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
3838 
3839  // build the new intrinsic call
3840  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
3841 
3842  if (!HaveChain) {
3843  SDValue Ops[] = {
3844  SDValue(Result, 0),
3845  BRCOND.getOperand(0)
3846  };
3847 
3848  Result = DAG.getMergeValues(Ops, DL).getNode();
3849  }
3850 
3851  if (BR) {
3852  // Give the branch instruction our target
3853  SDValue Ops[] = {
3854  BR->getOperand(0),
3855  BRCOND.getOperand(2)
3856  };
3857  SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
3858  DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
3859  BR = NewBR.getNode();
3860  }
3861 
3862  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
3863 
3864  // Copy the intrinsic results to registers
3865  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
3866  SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
3867  if (!CopyToReg)
3868  continue;
3869 
3870  Chain = DAG.getCopyToReg(
3871  Chain, DL,
3872  CopyToReg->getOperand(1),
3873  SDValue(Result, i - 1),
3874  SDValue());
3875 
3876  DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
3877  }
3878 
3879  // Remove the old intrinsic from the chain
3880  DAG.ReplaceAllUsesOfValueWith(
3881  SDValue(Intr, Intr->getNumValues() - 1),
3882  Intr->getOperand(0));
3883 
3884  return Chain;
3885 }
3886 
3887 SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
3888  SDValue Op,
3889  const SDLoc &DL,
3890  EVT VT) const {
3891  return Op.getValueType().bitsLE(VT) ?
3892  DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
3893  DAG.getNode(ISD::FTRUNC, DL, VT, Op);
3894 }
3895 
3896 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
3897  assert(Op.getValueType() == MVT::f16 &&
3898  "Do not know how to custom lower FP_ROUND for non-f16 type");
3899 
3900  SDValue Src = Op.getOperand(0);
3901  EVT SrcVT = Src.getValueType();
3902  if (SrcVT != MVT::f64)
3903  return Op;
3904 
3905  SDLoc DL(Op);
3906 
3907  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
3908  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
3909  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
3910 }
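// [Editorial illustration — not in the original source] The f64 -> f16 round
// is expressed as three nodes because there is no direct conversion here:
//   i32 Bits  = FP_TO_FP16(f64 Src)   // f16 bits in the low 16 bits of an i32
//   i16 Trunc = TRUNCATE(Bits)
//   f16 Res   = BITCAST(Trunc)
// e.g. the f64 value 1.0 becomes the i32 0x00003c00, truncated to the i16
// 0x3c00, which is the f16 bit pattern of 1.0.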
3911 
3912 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
3913  SDLoc SL(Op);
3914  SDValue Chain = Op.getOperand(0);
3915 
3916  if (Subtarget->getTrapHandlerAbi() != SISubtarget::TrapHandlerAbiHsa ||
3917  !Subtarget->isTrapHandlerEnabled())
3918  return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
3919 
3920  MachineFunction &MF = DAG.getMachineFunction();
3921  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3922  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
3923  assert(UserSGPR != AMDGPU::NoRegister);
3924  SDValue QueuePtr = CreateLiveInRegister(
3925  DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
3926  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
3927  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
3928  QueuePtr, SDValue());
3929  SDValue Ops[] = {
3930  ToReg,
3931  DAG.getTargetConstant(SISubtarget::TrapIDLLVMTrap, SL, MVT::i16),
3932  SGPR01,
3933  ToReg.getValue(1)
3934  };
3935  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
3936 }
3937 
3938 SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
3939  SDLoc SL(Op);
3940  SDValue Chain = Op.getOperand(0);
3941  MachineFunction &MF = DAG.getMachineFunction();
3942 
3946  "debugtrap handler not supported",
3947  Op.getDebugLoc(),
3948  DS_Warning);
3949  LLVMContext &Ctx = MF.getFunction().getContext();
3950  Ctx.diagnose(NoTrap);
3951  return Chain;
3952  }
3953 
3954  SDValue Ops[] = {
3955  Chain,
3956  DAG.getTargetConstant(SISubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
3957  };
3958  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
3959 }
3960 
3961 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
3962  SelectionDAG &DAG) const {
3963  // FIXME: Use inline constants (src_{shared, private}_base) instead.
3964  if (Subtarget->hasApertureRegs()) {
3965  unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ?
3966  AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
3967  AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
3968  unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ?
3969  AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
3970  AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
3971  unsigned Encoding =
3972  AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
3973  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
3974  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
3975 
3976  SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
3977  SDValue ApertureReg = SDValue(
3978  DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
3979  SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
3980  return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
3981  }
3982 
3983  MachineFunction &MF = DAG.getMachineFunction();
3984  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3985  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
3986  assert(UserSGPR != AMDGPU::NoRegister);
3987 
3988  SDValue QueuePtr = CreateLiveInRegister(
3989  DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
3990 
3991  // Offset into amd_queue_t for group_segment_aperture_base_hi /
3992  // private_segment_aperture_base_hi.
3993  uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;
3994 
3995  SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
3996 
3997  // TODO: Use custom target PseudoSourceValue.
3998  // TODO: We should use the value from the IR intrinsic call, but it might not
3999  // be available and how do we get it?
4000  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
4001  AMDGPUASI.CONSTANT_ADDRESS));
4002 
4003  MachinePointerInfo PtrInfo(V, StructOffset);
4004  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
4005  MinAlign(64, StructOffset),
4006  MachineMemOperand::MODereferenceable |
4007  MachineMemOperand::MOInvariant);
4008 }
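// [Editorial illustration — not in the original source] With aperture
// registers, the 16-bit immediate of S_GETREG_B32 packs a hardware register
// id, a starting bit offset and a field width (the *_SHIFT_ terms above). The
// field read back holds the high bits of the aperture base, so shifting it
// left by WidthM1 + 1 reconstructs the 32-bit aperture base that later forms
// the high half of a flat pointer. Without aperture registers, the same value
// is instead loaded from the amd_queue_t pointed to by the queue pointer, at
// offset 0x40 (group segment) or 0x44 (private segment).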
4009 
4010 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4011  SelectionDAG &DAG) const {
4012  SDLoc SL(Op);
4013  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4014 
4015  SDValue Src = ASC->getOperand(0);
4016  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4017 
4018  const AMDGPUTargetMachine &TM =
4019  static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4020 
4021  // flat -> local/private
4022  if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
4023  unsigned DestAS = ASC->getDestAddressSpace();
4024 
4025  if (DestAS == AMDGPUASI.LOCAL_ADDRESS ||
4026  DestAS == AMDGPUASI.PRIVATE_ADDRESS) {
4027  unsigned NullVal = TM.getNullPointerValue(DestAS);
4028  SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4029  SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4030  SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4031 
4032  return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4033  NonNull, Ptr, SegmentNullPtr);
4034  }
4035  }
4036 
4037  // local/private -> flat
4038  if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
4039  unsigned SrcAS = ASC->getSrcAddressSpace();
4040 
4041  if (SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
4042  SrcAS == AMDGPUASI.PRIVATE_ADDRESS) {
4043  unsigned NullVal = TM.getNullPointerValue(SrcAS);
4044  SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4045 
4046  SDValue NonNull
4047  = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4048 
4049  SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
4050  SDValue CvtPtr
4051  = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4052 
4053  return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4054  DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4055  FlatNullPtr);
4056  }
4057  }
4058 
4059  // global <-> flat are no-ops and never emitted.
4060 
4061  const MachineFunction &MF = DAG.getMachineFunction();
4062  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
4063  MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
4064  DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4065 
4066  return DAG.getUNDEF(ASC->getValueType(0));
4067 }
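// [Editorial illustration — not in the original source] For a local/private ->
// flat cast the 32-bit segment pointer becomes the low half and the aperture
// the high half of the 64-bit flat pointer, roughly:
//   flat = nonnull ? ((i64)aperture << 32) | zext(src) : 0
// where 0 is the flat null pointer and "nonnull" compares src against the
// segment's null value. The flat -> local/private direction truncates to 32
// bits and selects the segment null value when the source equals flat null.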
4068 
4069 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4070  SelectionDAG &DAG) const {
4071  SDValue Vec = Op.getOperand(0);
4072  SDValue InsVal = Op.getOperand(1);
4073  SDValue Idx = Op.getOperand(2);
4074  EVT VecVT = Vec.getValueType();
4075  EVT EltVT = VecVT.getVectorElementType();
4076  unsigned VecSize = VecVT.getSizeInBits();
4077  unsigned EltSize = EltVT.getSizeInBits();
4078 
4079 
4080  assert(VecSize <= 64);
4081 
4082  unsigned NumElts = VecVT.getVectorNumElements();
4083  SDLoc SL(Op);
4084  auto KIdx = dyn_cast<ConstantSDNode>(Idx);
4085 
4086  if (NumElts == 4 && EltSize == 16 && KIdx) {
4087  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
4088 
4089  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4090  DAG.getConstant(0, SL, MVT::i32));
4091  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4092  DAG.getConstant(1, SL, MVT::i32));
4093 
4094  SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
4095  SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
4096 
4097  unsigned Idx = KIdx->getZExtValue();
4098  bool InsertLo = Idx < 2;
4099  SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
4100  InsertLo ? LoVec : HiVec,
4101  DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
4102  DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
4103 
4104  InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
4105 
4106  SDValue Concat = InsertLo ?
4107  DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
4108  DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
4109 
4110  return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
4111  }
4112 
4113  if (isa<ConstantSDNode>(Idx))
4114  return SDValue();
4115 
4116  MVT IntVT = MVT::getIntegerVT(VecSize);
4117 
4118  // Avoid stack access for dynamic indexing.
4119  SDValue Val = InsVal;
4120  if (InsVal.getValueType() == MVT::f16)
4121  Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
4122 
4123  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4124  SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
4125 
4126  assert(isPowerOf2_32(EltSize));
4127  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4128 
4129  // Convert vector index to bit-index.
4130  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4131 
4132  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4133  SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
4134  DAG.getConstant(0xffff, SL, IntVT),
4135  ScaledIdx);
4136 
4137  SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
4138  SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
4139  DAG.getNOT(SL, BFM, IntVT), BCVec);
4140 
4141  SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
4142  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
4143 }
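// [Editorial illustration — not in the original source] In the dynamic-index
// path above, the element index is rescaled to a bit index (Idx * EltSize) and
// the write mask is 0xffff shifted by it; e.g. for v4i16 and Idx = 2 the mask
// BFM is 0xffff << 32 = 0x0000ffff00000000. The AND / AND-NOT / OR triple is
// the pattern that later selects to v_bfm_b32/v_bfi_b32, as the comment above
// notes.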
4144 
4145 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4146  SelectionDAG &DAG) const {
4147  SDLoc SL(Op);
4148 
4149  EVT ResultVT = Op.getValueType();
4150  SDValue Vec = Op.getOperand(0);
4151  SDValue Idx = Op.getOperand(1);
4152  EVT VecVT = Vec.getValueType();
4153  unsigned VecSize = VecVT.getSizeInBits();
4154  EVT EltVT = VecVT.getVectorElementType();
4155  assert(VecSize <= 64);
4156 
4157  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4158 
4159  // Make sure we do any optimizations that will make it easier to fold
4160  // source modifiers before obscuring it with bit operations.
4161 
4162  // XXX - Why doesn't this get called when vector_shuffle is expanded?
4163  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4164  return Combined;
4165 
4166  unsigned EltSize = EltVT.getSizeInBits();
4167  assert(isPowerOf2_32(EltSize));
4168 
4169  MVT IntVT = MVT::getIntegerVT(VecSize);
4170  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4171 
4172  // Convert vector index to bit-index (* EltSize)
4173  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4174 
4175  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4176  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
4177 
4178  if (ResultVT == MVT::f16) {
4179  SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
4180  return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4181  }
4182 
4183  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
4184 }
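// [Editorial illustration — not in the original source] The extract is done on
// the scalar integer form of the vector: bitcast to an iN integer, shift right
// by the bit index (Idx * EltSize), then truncate. Extracting element 1 of a
// v2f16, for example, becomes (bitcast f16 (trunc i16 (srl i32 (bitcast v),
// 16))).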
4185 
4186 SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
4187  SelectionDAG &DAG) const {
4188  SDLoc SL(Op);
4189  EVT VT = Op.getValueType();
4190 
4191  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4192  EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
4193 
4194  // Turn into pair of packed build_vectors.
4195  // TODO: Special case for constants that can be materialized with s_mov_b64.
4196  SDValue Lo = DAG.getBuildVector(HalfVT, SL,
4197  { Op.getOperand(0), Op.getOperand(1) });
4198  SDValue Hi = DAG.getBuildVector(HalfVT, SL,
4199  { Op.getOperand(2), Op.getOperand(3) });
4200 
4201  SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
4202  SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
4203 
4204  SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
4205  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
4206  }
4207 
4208  assert(VT == MVT::v2f16 || VT == MVT::v2i16);
4209 
4210  SDValue Lo = Op.getOperand(0);
4211  SDValue Hi = Op.getOperand(1);
4212 
4213  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4214  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
4215 
4216  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
4217  Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
4218 
4219  SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
4220  DAG.getConstant(16, SL, MVT::i32));
4221 
4222  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
4223 
4224  return DAG.getNode(ISD::BITCAST, SL, VT, Or);
4225 }
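// [Editorial illustration — not in the original source] A two-element packed
// vector is assembled as or(zext(lo), shl(zext(hi), 16)); building the v2f16
// vector (1.0, 2.0) therefore produces the i32 0x40003c00 (lo = 0x3c00,
// hi = 0x4000), which is bitcast to the requested v2i16/v2f16 type. The
// v4i16/v4f16 case above simply does this twice and packs the two i32 halves
// into a v2i32.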
4226 
4227 bool
4228 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
4229  // We can fold offsets for anything that doesn't require a GOT relocation.
4230  return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
4231  GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
4232  GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
4233  !shouldEmitGOTReloc(GA->getGlobal());
4234 }
4235 
4236 static SDValue
4237 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
4238  const SDLoc &DL, unsigned Offset, EVT PtrVT,
4239  unsigned GAFlags = SIInstrInfo::MO_NONE) {
4240  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
4241  // lowered to the following code sequence:
4242  //
4243  // For constant address space:
4244  // s_getpc_b64 s[0:1]
4245  // s_add_u32 s0, s0, $symbol
4246  // s_addc_u32 s1, s1, 0
4247  //
4248  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4249  // a fixup or relocation is emitted to replace $symbol with a literal
4250  // constant, which is a pc-relative offset from the encoding of the $symbol
4251  // operand to the global variable.
4252  //
4253  // For global address space:
4254  // s_getpc_b64 s[0:1]
4255  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
4256  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
4257  //
4258  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4259  // fixups or relocations are emitted to replace $symbol@*@lo and
4260  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
4261  // which is a 64-bit pc-relative offset from the encoding of the $symbol
4262  // operand to the global variable.
4263  //
4264  // What we want here is an offset from the value returned by s_getpc
4265  // (which is the address of the s_add_u32 instruction) to the global
4266  // variable, but since the encoding of $symbol starts 4 bytes after the start
4267  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
4268  // small. This requires us to add 4 to the global variable offset in order to
4269  // compute the correct address.
4270  SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4271  GAFlags);
4272  SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4273  GAFlags == SIInstrInfo::MO_NONE ?
4274  GAFlags : GAFlags + 1);
4275  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
4276 }
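// [Editorial illustration — not in the original source] Numerically: if
// s_getpc_b64 returns P (the address of the following s_add_u32), the $symbol
// literal is encoded at P + 4. A pc-relative fixup there resolves to
// (GV + addend) - (P + 4), so with the extra +4 added above the literal
// becomes GV - P (ignoring any additional global offset), and the
// s_add_u32/s_addc_u32 pair computes P + (GV - P) = &GV.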
4277 
4278 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
4279  SDValue Op,
4280  SelectionDAG &DAG) const {
4281  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
4282  const GlobalValue *GV = GSD->getGlobal();
4283 
4284  if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
4285  GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
4286  GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
4287  // FIXME: It isn't correct to rely on the type of the pointer. This should
4288  // be removed when address space 0 is 64-bit.
4289  !GV->getType()->getElementType()->isFunctionTy())
4290  return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
4291 
4292  SDLoc DL(GSD);
4293  EVT PtrVT = Op.getValueType();
4294 
4295  if (shouldEmitFixup(GV))
4296  return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
4297  else if (shouldEmitPCReloc(GV))
4298  return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
4299  SIInstrInfo::MO_REL32);
4300 
4301  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
4302  SIInstrInfo::MO_GOTPCREL32);
4303 
4304  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
4305  PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
4306  const DataLayout &DataLayout = DAG.getDataLayout();
4307  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
4308  // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
4309  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
4310 
4311  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
4312  MachineMemOperand::MODereferenceable |
4313  MachineMemOperand::MOInvariant);
4314 }
4315 
4316 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
4317  const SDLoc &DL, SDValue V) const {
4318  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
4319  // the destination register.
4320  //
4321  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
4322  // so we will end up with redundant moves to m0.
4323  //
4324  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
4325 
4326  // A Null SDValue creates a glue result.
4327  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
4328  V, Chain);
4329  return SDValue(M0, 0);
4330 }
4331 
4332 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
4333  SDValue Op,
4334  MVT VT,
4335  unsigned Offset) const {
4336  SDLoc SL(Op);
4337  SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
4338  DAG.getEntryNode(), Offset, 4, false);
4339  // The local size values will have the hi 16-bits as zero.
4340  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
4341  DAG.getValueType(VT));
4342 }
4343 
4344 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4345  EVT VT) {
4346  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4347  "non-hsa intrinsic with hsa target",
4348  DL.getDebugLoc());
4349  DAG.getContext()->diagnose(BadIntrin);
4350  return DAG.getUNDEF(VT);
4351 }
4352 
4353 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4354  EVT VT) {
4355  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4356  "intrinsic not supported on subtarget",
4357  DL.getDebugLoc());
4358  DAG.getContext()->diagnose(BadIntrin);
4359  return DAG.getUNDEF(VT);
4360 }
4361 
4362 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
4363  ArrayRef<SDValue> Elts) {
4364  assert(!Elts.empty());
4365  MVT Type;
4366  unsigned NumElts;
4367 
4368  if (Elts.size() == 1) {
4369  Type = MVT::f32;
4370  NumElts = 1;
4371  } else if (Elts.size() == 2) {
4372  Type = MVT::v2f32;
4373  NumElts = 2;
4374  } else if (Elts.size() <= 4) {
4375  Type = MVT::v4f32;
4376  NumElts = 4;
4377  } else if (Elts.size() <= 8) {
4378  Type = MVT::v8f32;
4379  NumElts = 8;
4380  } else {
4381  assert(Elts.size() <= 16);
4382  Type = MVT::v16f32;
4383  NumElts = 16;
4384  }
4385 
4386  SmallVector<SDValue, 16> VecElts(NumElts);
4387  for (unsigned i = 0; i < Elts.size(); ++i) {
4388  SDValue Elt = Elts[i];
4389  if (Elt.getValueType() != MVT::f32)
4390  Elt = DAG.getBitcast(MVT::f32, Elt);
4391  VecElts[i] = Elt;
4392  }
4393  for (unsigned i = Elts.size(); i < NumElts; ++i)
4394  VecElts[i] = DAG.getUNDEF(MVT::f32);
4395 
4396  if (NumElts == 1)
4397  return VecElts[0];
4398  return DAG.getBuildVector(Type, DL, VecElts);
4399 }
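// [Editorial illustration — not in the original source] This helper widens an
// arbitrary list of dword-sized operands to the next supported f32 vector
// width (1, 2, 4, 8 or 16 lanes), bitcasting non-f32 elements to f32 and
// padding the tail with undef; e.g. three address components become a v4f32
// of {x, y, z, undef}.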
4400 
4401 static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
4402  SDValue *GLC, SDValue *SLC) {
4403  auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode());
4404  if (!CachePolicyConst)
4405  return false;
4406 
4407  uint64_t Value = CachePolicyConst->getZExtValue();
4408  SDLoc DL(CachePolicy);
4409  if (GLC) {
4410  *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4411  Value &= ~(uint64_t)0x1;
4412  }
4413  if (SLC) {
4414  *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4415  Value &= ~(uint64_t)0x2;
4416  }
4417 
4418  return Value == 0;
4419 }
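// [Editorial illustration — not in the original source] The cachepolicy
// operand is decoded as a bit field: bit 0 becomes the GLC operand and bit 1
// the SLC operand, so a constant cachepolicy of 3 yields GLC = 1 and SLC = 1.
// The function returns false for a non-constant operand or when any bits other
// than the requested ones remain set, signalling an unsupported policy to the
// caller.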
4420 
4421 SDValue SITargetLowering::lowerImage(SDValue Op,
4422  const AMDGPU::ImageDimIntrinsicInfo *Intr,
4423  SelectionDAG &DAG) const {
4424  SDLoc DL(Op);
4425  MachineFunction &MF = DAG.getMachineFunction();
4426  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4427  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4428  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
4429 
4430  SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
4431  bool IsD16 = false;
4432  SDValue VData;
4433  int NumVDataDwords;
4434  unsigned AddrIdx; // Index of first address argument
4435  unsigned DMask;
4436 
4437  if (BaseOpcode->Atomic) {
4438  VData = Op.getOperand(2);
4439 
4440  bool Is64Bit = VData.getValueType() == MVT::i64;
4441  if (BaseOpcode->AtomicX2) {
4442  SDValue VData2 = Op.getOperand(3);
4443  VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
4444  {VData, VData2});
4445  if (Is64Bit)
4446  VData = DAG.getBitcast(MVT::v4i32, VData);
4447 
4448  ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
4449  DMask = Is64Bit ? 0xf : 0x3;
4450  NumVDataDwords = Is64Bit ? 4 : 2;
4451  AddrIdx = 4;
4452  } else {
4453  DMask = Is64Bit ? 0x3 : 0x1;
4454  NumVDataDwords = Is64Bit ? 2 : 1;
4455  AddrIdx = 3;
4456  }
4457  } else {
4458  unsigned DMaskIdx;
4459 
4460  if (BaseOpcode->Store) {
4461  VData = Op.getOperand(2);
4462 
4463  MVT StoreVT = VData.getSimpleValueType();
4464  if (StoreVT.getScalarType() == MVT::f16) {
4465  if (Subtarget->getGeneration() <