SIISelLowering.cpp (LLVM 8.0.0svn)
1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// Custom DAG lowering for SI
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #ifdef _MSC_VER
16 // Provide M_PI.
17 #define _USE_MATH_DEFINES
18 #endif
19 
20 #include "SIISelLowering.h"
21 #include "AMDGPU.h"
22 #include "AMDGPUIntrinsicInfo.h"
23 #include "AMDGPUSubtarget.h"
24 #include "AMDGPUTargetMachine.h"
25 #include "SIDefines.h"
26 #include "SIInstrInfo.h"
27 #include "SIMachineFunctionInfo.h"
28 #include "SIRegisterInfo.h"
30 #include "Utils/AMDGPUBaseInfo.h"
31 #include "llvm/ADT/APFloat.h"
32 #include "llvm/ADT/APInt.h"
33 #include "llvm/ADT/ArrayRef.h"
34 #include "llvm/ADT/BitVector.h"
35 #include "llvm/ADT/SmallVector.h"
36 #include "llvm/ADT/Statistic.h"
37 #include "llvm/ADT/StringRef.h"
38 #include "llvm/ADT/StringSwitch.h"
39 #include "llvm/ADT/Twine.h"
40 #include "llvm/CodeGen/Analysis.h"
58 #include "llvm/IR/Constants.h"
59 #include "llvm/IR/DataLayout.h"
60 #include "llvm/IR/DebugLoc.h"
61 #include "llvm/IR/DerivedTypes.h"
62 #include "llvm/IR/DiagnosticInfo.h"
63 #include "llvm/IR/Function.h"
64 #include "llvm/IR/GlobalValue.h"
65 #include "llvm/IR/InstrTypes.h"
66 #include "llvm/IR/Instruction.h"
67 #include "llvm/IR/Instructions.h"
68 #include "llvm/IR/IntrinsicInst.h"
69 #include "llvm/IR/Type.h"
70 #include "llvm/Support/Casting.h"
71 #include "llvm/Support/CodeGen.h"
73 #include "llvm/Support/Compiler.h"
75 #include "llvm/Support/KnownBits.h"
79 #include <cassert>
80 #include <cmath>
81 #include <cstdint>
82 #include <iterator>
83 #include <tuple>
84 #include <utility>
85 #include <vector>
86 
87 using namespace llvm;
88 
89 #define DEBUG_TYPE "si-lower"
90 
91 STATISTIC(NumTailCalls, "Number of tail calls");
92 
94  "amdgpu-vgpr-index-mode",
95  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
96  cl::init(false));
97 
99  "amdgpu-frame-index-zero-bits",
100  cl::desc("High bits of frame index assumed to be zero"),
101  cl::init(5),
103 
104 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
105  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
106  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
107  if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
108  return AMDGPU::SGPR0 + Reg;
109  }
110  }
111  llvm_unreachable("Cannot allocate sgpr");
112 }
113 
114 SITargetLowering::SITargetLowering(const TargetMachine &TM,
115  const GCNSubtarget &STI)
116  : AMDGPUTargetLowering(TM, STI),
117  Subtarget(&STI) {
118  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
119  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
120 
121  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
122  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
123 
124  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
125  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
126  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
127 
128  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
129  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
130 
131  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
132  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
133 
134  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
135  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
136 
137  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
138  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
139 
140  if (Subtarget->has16BitInsts()) {
141  addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
142  addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
143 
144  // Unless there are also VOP3P operations, no operations on these types are really legal.
145  addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
146  addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
147  addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
148  addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
149  }
150 
152 
153  // We need to custom lower vector stores from local memory
160 
167 
178 
181 
186 
192 
197 
200 
208 
216 
220 
225 
232 
235 
238 
242 
243 #if 0
246 #endif
247 
248  // We only support LOAD/STORE and vector manipulation ops for vectors
249  // with > 4 elements.
252  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
253  switch (Op) {
254  case ISD::LOAD:
255  case ISD::STORE:
256  case ISD::BUILD_VECTOR:
257  case ISD::BITCAST:
263  break;
264  case ISD::CONCAT_VECTORS:
266  break;
267  default:
269  break;
270  }
271  }
272  }
273 
275 
276  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
277  // is expanded to avoid having two separate loops in case the index is a VGPR.
278 
279  // Most operations are naturally 32-bit vector operations. We only support
280  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
281  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
284 
287 
290 
293  }
294 
299 
302 
303  // Avoid stack access for these.
304  // TODO: Generalize to more vector types.
309 
315 
319 
324 
325  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
326  // and output demarshalling
329 
330  // We can't return success/failure, only the old value,
331  // let LLVM add the comparison
334 
335  if (Subtarget->hasFlatAddressSpace()) {
338  }
339 
342 
343  // On SI this is s_memtime; on VI it is s_memrealtime.
347 
348  if (Subtarget->has16BitInsts()) {
352  }
353 
354  // v_mad_f32 does not support denormals according to some sources.
355  if (!Subtarget->hasFP32Denormals())
357 
358  if (!Subtarget->hasBFI()) {
359  // fcopysign can be done in a single instruction with BFI.
362  }
363 
364  if (!Subtarget->hasBCNT(32))
366 
367  if (!Subtarget->hasBCNT(64))
369 
370  if (Subtarget->hasFFBH())
372 
373  if (Subtarget->hasFFBL())
375 
376  // We only really have 32-bit BFE instructions (and 16-bit on VI).
377  //
378  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
379  // effort to match them now. We want this to be false for i64 cases when the
380  // extraction isn't restricted to the upper or lower half. Ideally we would
381  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
382  // span the midpoint are probably relatively rare, so don't worry about them
383  // for now.
384  if (Subtarget->hasBFE())
385  setHasExtractBitsInsn(true);
386 
389 
390  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
394  } else {
399  }
400 
402 
407 
408  if (Subtarget->has16BitInsts()) {
410 
413 
416 
419 
422 
427 
430 
436 
438 
440 
442 
444 
449 
454 
455  // F16 - Constant Actions.
457 
458  // F16 - Load/Store Actions.
463 
464  // F16 - VOP1 Actions.
473 
474  // F16 - VOP2 Actions.
480 
481  // F16 - VOP3 Actions.
483  if (!Subtarget->hasFP16Denormals())
485 
486  for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
487  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
488  switch (Op) {
489  case ISD::LOAD:
490  case ISD::STORE:
491  case ISD::BUILD_VECTOR:
492  case ISD::BITCAST:
498  break;
499  case ISD::CONCAT_VECTORS:
501  break;
502  default:
504  break;
505  }
506  }
507  }
508 
509  // XXX - Do these do anything? Vector constants turn into build_vector.
512 
515 
520 
525 
532 
537 
542 
547 
551 
552  if (!Subtarget->hasVOP3PInsts()) {
555  }
556 
558  // This isn't really legal, but this avoids the legalizer unrolling it (and
559  // allows matching fneg (fabs x) patterns)
561  }
562 
563  if (Subtarget->hasVOP3PInsts()) {
574 
581 
584 
591 
596 
602 
606  }
607 
610 
611  if (Subtarget->has16BitInsts()) {
616  } else {
617  // Legalization hack.
620 
623  }
624 
627  }
628 
653 
654  // All memory operations. Some folding on the pointer operand is done to help
655  // matching the constant offsets in the addressing modes.
673 
675 
676  // SI at least has hardware support for floating point exceptions, but no way
677  // of using or handling them is implemented. They are also optional in OpenCL
678  // (Section 7.3)
680 }
681 
682 const GCNSubtarget *SITargetLowering::getSubtarget() const {
683  return Subtarget;
684 }
685 
686 //===----------------------------------------------------------------------===//
687 // TargetLowering queries
688 //===----------------------------------------------------------------------===//
689 
690 // v_mad_mix* support a conversion from f16 to f32.
691 //
692 // There is only one special case, when denormals are enabled, that we don't
693 // currently handle where this would still be OK to use.
694 bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
695  EVT DestVT, EVT SrcVT) const {
696  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
697  (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
698  DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
699  SrcVT.getScalarType() == MVT::f16;
700 }
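// Illustrative example: with fp32 denormals disabled, a node such as
//   (fma (fpext f16 %a), f32 %b, f32 %c)
// reports its fpext operand as foldable here, allowing selection of the
// mixed-precision v_fma_mix_f32 / v_mad_mix_f32 forms instead of a separate
// f16-to-f32 conversion.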
701 
703  // SI has some legal vector types, but no legal vector operations. Say no
704  // shuffles are legal in order to prefer scalarizing some vector operations.
705  return false;
706 }
707 
708 MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
709  CallingConv::ID CC,
710  EVT VT) const {
711  // TODO: Consider splitting all arguments into 32-bit pieces.
712  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
713  EVT ScalarVT = VT.getScalarType();
714  unsigned Size = ScalarVT.getSizeInBits();
715  if (Size == 32)
716  return ScalarVT.getSimpleVT();
717 
718  if (Size == 64)
719  return MVT::i32;
720 
721  if (Size == 16 && Subtarget->has16BitInsts())
722  return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
723  }
724 
725  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
726 }
727 
728 unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
729  CallingConv::ID CC,
730  EVT VT) const {
731  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
732  unsigned NumElts = VT.getVectorNumElements();
733  EVT ScalarVT = VT.getScalarType();
734  unsigned Size = ScalarVT.getSizeInBits();
735 
736  if (Size == 32)
737  return NumElts;
738 
739  if (Size == 64)
740  return 2 * NumElts;
741 
742  if (Size == 16 && Subtarget->has16BitInsts())
743  return (VT.getVectorNumElements() + 1) / 2;
744  }
745 
746  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
747 }
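// Worked examples for the two hooks above (illustrative): for a non-kernel
// calling convention, v3f32 is passed as three f32 registers, v2i64 as four
// i32 registers, and, when 16-bit instructions are available, v3f16 as two
// v2f16 registers. AMDGPU_KERNEL arguments fall through to the generic
// TargetLowering handling.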
748 
749 unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
750  LLVMContext &Context, CallingConv::ID CC,
751  EVT VT, EVT &IntermediateVT,
752  unsigned &NumIntermediates, MVT &RegisterVT) const {
753  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
754  unsigned NumElts = VT.getVectorNumElements();
755  EVT ScalarVT = VT.getScalarType();
756  unsigned Size = ScalarVT.getSizeInBits();
757  if (Size == 32) {
758  RegisterVT = ScalarVT.getSimpleVT();
759  IntermediateVT = RegisterVT;
760  NumIntermediates = NumElts;
761  return NumIntermediates;
762  }
763 
764  if (Size == 64) {
765  RegisterVT = MVT::i32;
766  IntermediateVT = RegisterVT;
767  NumIntermediates = 2 * NumElts;
768  return NumIntermediates;
769  }
770 
771  // FIXME: We should fix the ABI to be the same on targets without 16-bit
772  // support, but unless we can properly handle 3-vectors, it will still be
773  // inconsistent.
774  if (Size == 16 && Subtarget->has16BitInsts()) {
775  RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
776  IntermediateVT = RegisterVT;
777  NumIntermediates = (NumElts + 1) / 2;
778  return NumIntermediates;
779  }
780  }
781 
782  return TargetLowering::getVectorTypeBreakdownForCallingConv(
783  Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
784 }
785 
786 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
787  const CallInst &CI,
788  MachineFunction &MF,
789  unsigned IntrID) const {
790  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
791  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
793  (Intrinsic::ID)IntrID);
794  if (Attr.hasFnAttribute(Attribute::ReadNone))
795  return false;
796 
798 
799  if (RsrcIntr->IsImage) {
800  Info.ptrVal = MFI->getImagePSV(
802  CI.getArgOperand(RsrcIntr->RsrcArg));
803  Info.align = 0;
804  } else {
805  Info.ptrVal = MFI->getBufferPSV(
807  CI.getArgOperand(RsrcIntr->RsrcArg));
808  }
809 
811  if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
813  Info.memVT = MVT::getVT(CI.getType());
815  } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
816  Info.opc = ISD::INTRINSIC_VOID;
817  Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
819  } else {
820  // Atomic
822  Info.memVT = MVT::getVT(CI.getType());
826 
827  // XXX - Should this be volatile without known ordering?
829  }
830  return true;
831  }
832 
833  switch (IntrID) {
834  case Intrinsic::amdgcn_atomic_inc:
835  case Intrinsic::amdgcn_atomic_dec:
836  case Intrinsic::amdgcn_ds_fadd:
837  case Intrinsic::amdgcn_ds_fmin:
838  case Intrinsic::amdgcn_ds_fmax: {
840  Info.memVT = MVT::getVT(CI.getType());
841  Info.ptrVal = CI.getOperand(0);
842  Info.align = 0;
844 
845  const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
846  if (!Vol || !Vol->isZero())
848 
849  return true;
850  }
851 
852  default:
853  return false;
854  }
855 }
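// For example (illustrative): amdgcn.atomic.inc is reported as a memory
// intrinsic whose memVT matches the intrinsic's result type, whose pointer is
// operand 0, and which is treated as volatile unless its volatility operand
// is a constant zero.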
856 
859  Type *&AccessTy) const {
860  switch (II->getIntrinsicID()) {
861  case Intrinsic::amdgcn_atomic_inc:
862  case Intrinsic::amdgcn_atomic_dec:
863  case Intrinsic::amdgcn_ds_fadd:
864  case Intrinsic::amdgcn_ds_fmin:
865  case Intrinsic::amdgcn_ds_fmax: {
866  Value *Ptr = II->getArgOperand(0);
867  AccessTy = II->getType();
868  Ops.push_back(Ptr);
869  return true;
870  }
871  default:
872  return false;
873  }
874 }
875 
876 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
877  if (!Subtarget->hasFlatInstOffsets()) {
878  // Flat instructions do not have offsets, and only have the register
879  // address.
880  return AM.BaseOffs == 0 && AM.Scale == 0;
881  }
882 
883  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
884  // the sign bit is ignored and is treated as a 12-bit unsigned offset.
885 
886  // Just r + i
887  return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
888 }
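// For example (illustrative): without flat instruction offsets, only a plain
// register address (BaseOffs == 0, Scale == 0) is accepted; with them (GFX9),
// BaseOffs = 4092 is accepted while BaseOffs = 4096 is not, since only a
// 12-bit unsigned immediate survives the sign-bit caveat described above.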
889 
890 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
891  if (Subtarget->hasFlatGlobalInsts())
892  return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
893 
894  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
895  // Assume that we will use FLAT for all global memory accesses
896  // on VI.
897  // FIXME: This assumption is currently wrong. On VI we still use
898  // MUBUF instructions for the r + i addressing mode. As currently
899  // implemented, the MUBUF instructions only work on buffer < 4GB.
900  // It may be possible to support > 4GB buffers with MUBUF instructions,
901  // by setting the stride value in the resource descriptor which would
902  // increase the size limit to (stride * 4GB). However, this is risky,
903  // because it has never been validated.
904  return isLegalFlatAddressingMode(AM);
905  }
906 
907  return isLegalMUBUFAddressingMode(AM);
908 }
909 
910 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
911  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
912  // additionally can do r + r + i with addr64. 32-bit has more addressing
913  // mode options. Depending on the resource constant, it can also do
914  // (i64 r0) + (i32 r1) * (i14 i).
915  //
916  // Private arrays end up using a scratch buffer most of the time, so also
917  // assume those use MUBUF instructions. Scratch loads / stores are currently
918  // implemented as mubuf instructions with offen bit set, so slightly
919  // different than the normal addr64.
920  if (!isUInt<12>(AM.BaseOffs))
921  return false;
922 
923  // FIXME: Since we can split immediate into soffset and immediate offset,
924  // would it make sense to allow any immediate?
925 
926  switch (AM.Scale) {
927  case 0: // r + i or just i, depending on HasBaseReg.
928  return true;
929  case 1:
930  return true; // We have r + r or r + i.
931  case 2:
932  if (AM.HasBaseReg) {
933  // Reject 2 * r + r.
934  return false;
935  }
936 
937  // Allow 2 * r as r + r
938  // Or 2 * r + i is allowed as r + r + i.
939  return true;
940  default: // Don't allow n * r
941  return false;
942  }
943 }
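// For example (illustrative): {BaseReg, BaseOffs = 16, Scale = 1} is accepted
// as the addr64 form r + r + 16, and {Scale = 2, no base reg} is accepted by
// rewriting 2*r as r + r, but {BaseReg, Scale = 2} is rejected because
// 2*r + r has no MUBUF encoding.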
944 
945 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
946  const AddrMode &AM, Type *Ty,
947  unsigned AS, Instruction *I) const {
948  // No global is ever allowed as a base.
949  if (AM.BaseGV)
950  return false;
951 
952  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
953  return isLegalGlobalAddressingMode(AM);
954 
955  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
957  // If the offset isn't a multiple of 4, it probably isn't going to be
958  // correctly aligned.
959  // FIXME: Can we get the real alignment here?
960  if (AM.BaseOffs % 4 != 0)
961  return isLegalMUBUFAddressingMode(AM);
962 
963  // There are no SMRD extloads, so if we have to do a small type access we
964  // will use a MUBUF load.
965  // FIXME?: We also need to do this if unaligned, but we don't know the
966  // alignment here.
967  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
968  return isLegalGlobalAddressingMode(AM);
969 
970  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
971  // SMRD instructions have an 8-bit, dword offset on SI.
972  if (!isUInt<8>(AM.BaseOffs / 4))
973  return false;
974  } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
975  // On CI+, this can also be a 32-bit literal constant offset. If it fits
976  // in 8-bits, it can use a smaller encoding.
977  if (!isUInt<32>(AM.BaseOffs / 4))
978  return false;
979  } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
980  // On VI, these use the SMEM format and the offset is 20-bit in bytes.
981  if (!isUInt<20>(AM.BaseOffs))
982  return false;
983  } else
984  llvm_unreachable("unhandled generation");
985 
986  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
987  return true;
988 
989  if (AM.Scale == 1 && AM.HasBaseReg)
990  return true;
991 
992  return false;
993 
994  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
995  return isLegalMUBUFAddressingMode(AM);
996  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
997  AS == AMDGPUAS::REGION_ADDRESS) {
998  // Basic, single offset DS instructions allow a 16-bit unsigned immediate
999  // field.
1000  // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1001  // an 8-bit dword offset but we don't know the alignment here.
1002  if (!isUInt<16>(AM.BaseOffs))
1003  return false;
1004 
1005  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1006  return true;
1007 
1008  if (AM.Scale == 1 && AM.HasBaseReg)
1009  return true;
1010 
1011  return false;
1012  } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1014  // For an unknown address space, this usually means that this is for some
1015  // reason being used for pure arithmetic, and not based on some addressing
1016  // computation. We don't have instructions that compute pointers with any
1017  // addressing modes, so treat them as having no offset like flat
1018  // instructions.
1019  return isLegalFlatAddressingMode(AM);
1020  } else {
1021  llvm_unreachable("unhandled address space");
1022  }
1023 }
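// Summary of the immediate-offset limits applied above (illustrative):
//   MUBUF (global pre-GFX9, private): 12-bit unsigned byte offset.
//   Global via flat (GFX9+):          13-bit signed byte offset.
//   SMRD constant:                    8-bit dword offset on SI, 32-bit literal
//                                     on CI, 20-bit byte offset on VI+.
//   DS (local/region):                16-bit unsigned byte offset.
//   Flat / unknown address space:     no offset unless the target has flat
//                                     instruction offsets.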
1024 
1025 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1026  const SelectionDAG &DAG) const {
1027  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1028  return (MemVT.getSizeInBits() <= 4 * 32);
1029  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1030  unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1031  return (MemVT.getSizeInBits() <= MaxPrivateBits);
1032  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
1033  return (MemVT.getSizeInBits() <= 2 * 32);
1034  }
1035  return true;
1036 }
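// For example (illustrative): store merging may form up to a 128-bit
// (dwordx4) access in global/flat address spaces, up to 64 bits in LDS, and
// in private memory is limited by the subtarget's maximum private element
// size.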
1037 
1038 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1039  unsigned AddrSpace,
1040  unsigned Align,
1041  bool *IsFast) const {
1042  if (IsFast)
1043  *IsFast = false;
1044 
1045  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1046  // which isn't a simple VT.
1047  // Until MVT is extended to handle this, simply check for the size and
1048  // rely on the condition below: allow accesses if the size is a multiple of 4.
1049  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1050  VT.getStoreSize() > 16)) {
1051  return false;
1052  }
1053 
1054  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1055  AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1056  // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1057  // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1058  // with adjacent offsets.
1059  bool AlignedBy4 = (Align % 4 == 0);
1060  if (IsFast)
1061  *IsFast = AlignedBy4;
1062 
1063  return AlignedBy4;
1064  }
1065 
1066  // FIXME: We have to be conservative here and assume that flat operations
1067  // will access scratch. If we had access to the IR function, then we
1068  // could determine if any private memory was used in the function.
1069  if (!Subtarget->hasUnalignedScratchAccess() &&
1070  (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1071  AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
1072  return false;
1073  }
1074 
1075  if (Subtarget->hasUnalignedBufferAccess()) {
1076  // If we have a uniform constant load, it still requires using a slow
1077  // buffer instruction if unaligned.
1078  if (IsFast) {
1079  *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1080  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1081  (Align % 4 == 0) : true;
1082  }
1083 
1084  return true;
1085  }
1086 
1087  // Accesses smaller than a dword must be aligned.
1088  if (VT.bitsLT(MVT::i32))
1089  return false;
1090 
1091  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1092  // byte-address are ignored, thus forcing Dword alignment.
1093  // This applies to private, global, and constant memory.
1094  if (IsFast)
1095  *IsFast = true;
1096 
1097  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1098 }
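// For example (illustrative): an 8-byte LDS access with only 4-byte alignment
// is still reported as fast because it can be emitted as ds_read2_b32 /
// ds_write2_b32 with adjacent offsets, while an unaligned private or flat
// access is rejected unless the subtarget supports unaligned scratch access.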
1099 
1100 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
1101  unsigned SrcAlign, bool IsMemset,
1102  bool ZeroMemset,
1103  bool MemcpyStrSrc,
1104  MachineFunction &MF) const {
1105  // FIXME: Should account for address space here.
1106 
1107  // The default fallback uses the private pointer size as a guess for a type to
1108  // use. Make sure we switch these to 64-bit accesses.
1109 
1110  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
1111  return MVT::v4i32;
1112 
1113  if (Size >= 8 && DstAlign >= 4)
1114  return MVT::v2i32;
1115 
1116  // Use the default.
1117  return MVT::Other;
1118 }
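// For example (illustrative): a 64-byte memcpy with a 4-byte aligned
// destination is lowered with v4i32 (dwordx4) chunks rather than the default
// type derived from the private pointer size.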
1119 
1120 static bool isFlatGlobalAddrSpace(unsigned AS) {
1121  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
1122  AS == AMDGPUAS::FLAT_ADDRESS ||
1124 }
1125 
1127  unsigned DestAS) const {
1128  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
1129 }
1130 
1132  const MemSDNode *MemNode = cast<MemSDNode>(N);
1133  const Value *Ptr = MemNode->getMemOperand()->getValue();
1134  const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
1135  return I && I->getMetadata("amdgpu.noclobber");
1136 }
1137 
1139  unsigned DestAS) const {
1140  // Flat -> private/local is a simple truncate.
1141  // Flat -> global is no-op
1142  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1143  return true;
1144 
1145  return isNoopAddrSpaceCast(SrcAS, DestAS);
1146 }
1147 
1149  const MemSDNode *MemNode = cast<MemSDNode>(N);
1150 
1151  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1152 }
1153 
1156  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
1157  return TypeSplitVector;
1158 
1160 }
1161 
1163  Type *Ty) const {
1164  // FIXME: Could be smarter if called for vector constants.
1165  return true;
1166 }
1167 
1168 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1169  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1170  switch (Op) {
1171  case ISD::LOAD:
1172  case ISD::STORE:
1173 
1174  // These operations are done with 32-bit instructions anyway.
1175  case ISD::AND:
1176  case ISD::OR:
1177  case ISD::XOR:
1178  case ISD::SELECT:
1179  // TODO: Extensions?
1180  return true;
1181  default:
1182  return false;
1183  }
1184  }
1185 
1186  // SimplifySetCC uses this function to determine whether or not it should
1187  // create setcc with i1 operands. We don't have instructions for i1 setcc.
1188  if (VT == MVT::i1 && Op == ISD::SETCC)
1189  return false;
1190 
1191  return TargetLowering::isTypeDesirableForOp(Op, VT);
1192 }
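// For example (illustrative): i16 AND/OR/XOR and selects are reported as
// desirable at i16, since they are executed as 32-bit instructions anyway, so
// the combiner does not bother promoting them; and because there is no i1
// setcc instruction, SimplifySetCC is told not to create one.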
1193 
1194 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1195  const SDLoc &SL,
1196  SDValue Chain,
1197  uint64_t Offset) const {
1198  const DataLayout &DL = DAG.getDataLayout();
1199  MachineFunction &MF = DAG.getMachineFunction();
1201 
1202  const ArgDescriptor *InputPtrReg;
1203  const TargetRegisterClass *RC;
1204 
1205  std::tie(InputPtrReg, RC)
1207 
1210  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1211  MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1212 
1213  return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
1214 }
1215 
1216 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1217  const SDLoc &SL) const {
1218  uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1219  FIRST_IMPLICIT);
1220  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1221 }
1222 
1223 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1224  const SDLoc &SL, SDValue Val,
1225  bool Signed,
1226  const ISD::InputArg *Arg) const {
1227  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1228  VT.bitsLT(MemVT)) {
1229  unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1230  Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1231  }
1232 
1233  if (MemVT.isFloatingPoint())
1234  Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
1235  else if (Signed)
1236  Val = DAG.getSExtOrTrunc(Val, SL, VT);
1237  else
1238  Val = DAG.getZExtOrTrunc(Val, SL, VT);
1239 
1240  return Val;
1241 }
1242 
1243 SDValue SITargetLowering::lowerKernargMemParameter(
1244  SelectionDAG &DAG, EVT VT, EVT MemVT,
1245  const SDLoc &SL, SDValue Chain,
1246  uint64_t Offset, unsigned Align, bool Signed,
1247  const ISD::InputArg *Arg) const {
1248  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
1250  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1251 
1252  // Try to avoid using an extload by loading earlier than the argument address,
1253  // and extracting the relevant bits. The load should hopefully be merged with
1254  // the previous argument.
1255  if (MemVT.getStoreSize() < 4 && Align < 4) {
1256  // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1257  int64_t AlignDownOffset = alignDown(Offset, 4);
1258  int64_t OffsetDiff = Offset - AlignDownOffset;
1259 
1260  EVT IntVT = MemVT.changeTypeToInteger();
1261 
1262  // TODO: If we passed in the base kernel offset we could have a better
1263  // alignment than 4, but we don't really need it.
1264  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1265  SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1268 
1269  SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1270  SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1271 
1272  SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1273  ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1274  ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1275 
1276 
1277  return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1278  }
1279 
1280  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1281  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
1284 
1285  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1286  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1287 }
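// Worked example for the aligned-down path above (illustrative): an i16
// kernel argument at byte offset 6 gives AlignDownOffset = 4 and
// OffsetDiff = 2, so a dword is loaded at offset 4, shifted right by 16 bits,
// and truncated to i16, avoiding an extending sub-dword load.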
1288 
1289 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1290  const SDLoc &SL, SDValue Chain,
1291  const ISD::InputArg &Arg) const {
1292  MachineFunction &MF = DAG.getMachineFunction();
1293  MachineFrameInfo &MFI = MF.getFrameInfo();
1294 
1295  if (Arg.Flags.isByVal()) {
1296  unsigned Size = Arg.Flags.getByValSize();
1297  int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1298  return DAG.getFrameIndex(FrameIdx, MVT::i32);
1299  }
1300 
1301  unsigned ArgOffset = VA.getLocMemOffset();
1302  unsigned ArgSize = VA.getValVT().getStoreSize();
1303 
1304  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1305 
1306  // Create load nodes to retrieve arguments from the stack.
1307  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1308  SDValue ArgValue;
1309 
1310  // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
1311  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1312  MVT MemVT = VA.getValVT();
1313 
1314  switch (VA.getLocInfo()) {
1315  default:
1316  break;
1317  case CCValAssign::BCvt:
1318  MemVT = VA.getLocVT();
1319  break;
1320  case CCValAssign::SExt:
1321  ExtType = ISD::SEXTLOAD;
1322  break;
1323  case CCValAssign::ZExt:
1324  ExtType = ISD::ZEXTLOAD;
1325  break;
1326  case CCValAssign::AExt:
1327  ExtType = ISD::EXTLOAD;
1328  break;
1329  }
1330 
1331  ArgValue = DAG.getExtLoad(
1332  ExtType, SL, VA.getLocVT(), Chain, FIN,
1334  MemVT);
1335  return ArgValue;
1336 }
1337 
1338 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1339  const SIMachineFunctionInfo &MFI,
1340  EVT VT,
1341  AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1342  const ArgDescriptor *Reg;
1343  const TargetRegisterClass *RC;
1344 
1345  std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1346  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1347 }
1348 
1350  CallingConv::ID CallConv,
1352  BitVector &Skipped,
1353  FunctionType *FType,
1354  SIMachineFunctionInfo *Info) {
1355  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1356  const ISD::InputArg *Arg = &Ins[I];
1357 
1358  assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1359  "vector type argument should have been split");
1360 
1361  // First check if it's a PS input addr.
1362  if (CallConv == CallingConv::AMDGPU_PS &&
1363  !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
1364 
1365  bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1366 
1367  // Inconveniently only the first part of the split is marked as isSplit,
1368  // so skip to the end. We only want to increment PSInputNum once for the
1369  // entire split argument.
1370  if (Arg->Flags.isSplit()) {
1371  while (!Arg->Flags.isSplitEnd()) {
1372  assert(!Arg->VT.isVector() &&
1373  "unexpected vector split in ps argument type");
1374  if (!SkipArg)
1375  Splits.push_back(*Arg);
1376  Arg = &Ins[++I];
1377  }
1378  }
1379 
1380  if (SkipArg) {
1381  // We can safely skip PS inputs.
1382  Skipped.set(Arg->getOrigArgIndex());
1383  ++PSInputNum;
1384  continue;
1385  }
1386 
1387  Info->markPSInputAllocated(PSInputNum);
1388  if (Arg->Used)
1389  Info->markPSInputEnabled(PSInputNum);
1390 
1391  ++PSInputNum;
1392  }
1393 
1394  Splits.push_back(*Arg);
1395  }
1396 }
1397 
1398 // Allocate special inputs passed in VGPRs.
1400  MachineFunction &MF,
1401  const SIRegisterInfo &TRI,
1402  SIMachineFunctionInfo &Info) {
1403  if (Info.hasWorkItemIDX()) {
1404  unsigned Reg = AMDGPU::VGPR0;
1405  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1406 
1407  CCInfo.AllocateReg(Reg);
1409  }
1410 
1411  if (Info.hasWorkItemIDY()) {
1412  unsigned Reg = AMDGPU::VGPR1;
1413  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1414 
1415  CCInfo.AllocateReg(Reg);
1417  }
1418 
1419  if (Info.hasWorkItemIDZ()) {
1420  unsigned Reg = AMDGPU::VGPR2;
1421  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1422 
1423  CCInfo.AllocateReg(Reg);
1425  }
1426 }
1427 
1428 // Try to allocate a VGPR at the end of the argument list, or if no argument
1429 // VGPRs are left, fall back to allocating a stack slot.
1430 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
1431  ArrayRef<MCPhysReg> ArgVGPRs
1432  = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1433  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1434  if (RegIdx == ArgVGPRs.size()) {
1435  // Spill to stack required.
1436  int64_t Offset = CCInfo.AllocateStack(4, 4);
1437 
1438  return ArgDescriptor::createStack(Offset);
1439  }
1440 
1441  unsigned Reg = ArgVGPRs[RegIdx];
1442  Reg = CCInfo.AllocateReg(Reg);
1443  assert(Reg != AMDGPU::NoRegister);
1444 
1445  MachineFunction &MF = CCInfo.getMachineFunction();
1446  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1447  return ArgDescriptor::createRegister(Reg);
1448 }
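// For example (illustrative): if a callee's explicit arguments already occupy
// v0..v31, a requested workitem ID input no longer fits in the first 32 VGPRs
// and is instead assigned a 4-byte stack slot by the code above.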
1449 
1451  const TargetRegisterClass *RC,
1452  unsigned NumArgRegs) {
1453  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1454  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1455  if (RegIdx == ArgSGPRs.size())
1456  report_fatal_error("ran out of SGPRs for arguments");
1457 
1458  unsigned Reg = ArgSGPRs[RegIdx];
1459  Reg = CCInfo.AllocateReg(Reg);
1460  assert(Reg != AMDGPU::NoRegister);
1461 
1462  MachineFunction &MF = CCInfo.getMachineFunction();
1463  MF.addLiveIn(Reg, RC);
1464  return ArgDescriptor::createRegister(Reg);
1465 }
1466 
1468  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1469 }
1470 
1472  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1473 }
1474 
1476  MachineFunction &MF,
1477  const SIRegisterInfo &TRI,
1478  SIMachineFunctionInfo &Info) {
1479  if (Info.hasWorkItemIDX())
1480  Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
1481 
1482  if (Info.hasWorkItemIDY())
1483  Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
1484 
1485  if (Info.hasWorkItemIDZ())
1486  Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1487 }
1488 
1490  MachineFunction &MF,
1491  const SIRegisterInfo &TRI,
1492  SIMachineFunctionInfo &Info) {
1493  auto &ArgInfo = Info.getArgInfo();
1494 
1495  // TODO: Unify handling with private memory pointers.
1496 
1497  if (Info.hasDispatchPtr())
1498  ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1499 
1500  if (Info.hasQueuePtr())
1501  ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1502 
1503  if (Info.hasKernargSegmentPtr())
1504  ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1505 
1506  if (Info.hasDispatchID())
1507  ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1508 
1509  // flat_scratch_init is not applicable for non-kernel functions.
1510 
1511  if (Info.hasWorkGroupIDX())
1512  ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1513 
1514  if (Info.hasWorkGroupIDY())
1515  ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1516 
1517  if (Info.hasWorkGroupIDZ())
1518  ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
1519 
1520  if (Info.hasImplicitArgPtr())
1521  ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
1522 }
1523 
1524 // Allocate special inputs passed in user SGPRs.
1525 static void allocateHSAUserSGPRs(CCState &CCInfo,
1526  MachineFunction &MF,
1527  const SIRegisterInfo &TRI,
1528  SIMachineFunctionInfo &Info) {
1529  if (Info.hasImplicitBufferPtr()) {
1530  unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1531  MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1532  CCInfo.AllocateReg(ImplicitBufferPtrReg);
1533  }
1534 
1535  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1536  if (Info.hasPrivateSegmentBuffer()) {
1537  unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1538  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1539  CCInfo.AllocateReg(PrivateSegmentBufferReg);
1540  }
1541 
1542  if (Info.hasDispatchPtr()) {
1543  unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1544  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1545  CCInfo.AllocateReg(DispatchPtrReg);
1546  }
1547 
1548  if (Info.hasQueuePtr()) {
1549  unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1550  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1551  CCInfo.AllocateReg(QueuePtrReg);
1552  }
1553 
1554  if (Info.hasKernargSegmentPtr()) {
1555  unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1556  MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1557  CCInfo.AllocateReg(InputPtrReg);
1558  }
1559 
1560  if (Info.hasDispatchID()) {
1561  unsigned DispatchIDReg = Info.addDispatchID(TRI);
1562  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1563  CCInfo.AllocateReg(DispatchIDReg);
1564  }
1565 
1566  if (Info.hasFlatScratchInit()) {
1567  unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1568  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1569  CCInfo.AllocateReg(FlatScratchInitReg);
1570  }
1571 
1572  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1573  // these from the dispatch pointer.
1574 }
1575 
1576 // Allocate special input registers that are initialized per-wave.
1577 static void allocateSystemSGPRs(CCState &CCInfo,
1578  MachineFunction &MF,
1579  SIMachineFunctionInfo &Info,
1580  CallingConv::ID CallConv,
1581  bool IsShader) {
1582  if (Info.hasWorkGroupIDX()) {
1583  unsigned Reg = Info.addWorkGroupIDX();
1584  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1585  CCInfo.AllocateReg(Reg);
1586  }
1587 
1588  if (Info.hasWorkGroupIDY()) {
1589  unsigned Reg = Info.addWorkGroupIDY();
1590  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1591  CCInfo.AllocateReg(Reg);
1592  }
1593 
1594  if (Info.hasWorkGroupIDZ()) {
1595  unsigned Reg = Info.addWorkGroupIDZ();
1596  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1597  CCInfo.AllocateReg(Reg);
1598  }
1599 
1600  if (Info.hasWorkGroupInfo()) {
1601  unsigned Reg = Info.addWorkGroupInfo();
1602  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1603  CCInfo.AllocateReg(Reg);
1604  }
1605 
1606  if (Info.hasPrivateSegmentWaveByteOffset()) {
1607  // Scratch wave offset passed in system SGPR.
1608  unsigned PrivateSegmentWaveByteOffsetReg;
1609 
1610  if (IsShader) {
1611  PrivateSegmentWaveByteOffsetReg =
1613 
1614  // This is true if the scratch wave byte offset doesn't have a fixed
1615  // location.
1616  if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1617  PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1618  Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1619  }
1620  } else
1621  PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1622 
1623  MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1624  CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1625  }
1626 }
1627 
1629  MachineFunction &MF,
1630  const SIRegisterInfo &TRI,
1631  SIMachineFunctionInfo &Info) {
1632  // Now that we've figured out where the scratch register inputs are, see if
1633  // we should reserve the arguments and use them directly.
1634  MachineFrameInfo &MFI = MF.getFrameInfo();
1635  bool HasStackObjects = MFI.hasStackObjects();
1636 
1637  // Record that we know we have non-spill stack objects so we don't need to
1638  // check all stack objects later.
1639  if (HasStackObjects)
1640  Info.setHasNonSpillStackObjects(true);
1641 
1642  // Everything live out of a block is spilled with fast regalloc, so it's
1643  // almost certain that spilling will be required.
1644  if (TM.getOptLevel() == CodeGenOpt::None)
1645  HasStackObjects = true;
1646 
1647  // For now assume stack access is needed in any callee functions, so we need
1648  // the scratch registers to pass in.
1649  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1650 
1651  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1652  if (ST.isAmdCodeObjectV2(MF.getFunction())) {
1653  if (RequiresStackAccess) {
1654  // If we have stack objects, we unquestionably need the private buffer
1655  // resource. For the Code Object V2 ABI, this will be the first 4 user
1656  // SGPR inputs. We can reserve those and use them directly.
1657 
1658  unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1660  Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1661 
1662  if (MFI.hasCalls()) {
1663  // If we have calls, we need to keep the frame register in a register
1664  // that won't be clobbered by a call, so ensure it is copied somewhere.
1665 
1666  // This is not a problem for the scratch wave offset, because the same
1667  // registers are reserved in all functions.
1668 
1669  // FIXME: Nothing is really ensuring this is a call preserved register,
1670  // it's just selected from the end so it happens to be.
1671  unsigned ReservedOffsetReg
1673  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1674  } else {
1675  unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1677  Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1678  }
1679  } else {
1680  unsigned ReservedBufferReg
1682  unsigned ReservedOffsetReg
1684 
1685  // We tentatively reserve the last registers (skipping the last two
1686  // which may contain VCC). After register allocation, we'll replace
1687  // these with the ones immediately after those which were really
1688  // allocated. In the prologue copies will be inserted from the argument
1689  // to these reserved registers.
1690  Info.setScratchRSrcReg(ReservedBufferReg);
1691  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1692  }
1693  } else {
1694  unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1695 
1696  // Without HSA, relocations are used for the scratch pointer and the
1697  // buffer resource setup is always inserted in the prologue. Scratch wave
1698  // offset is still in an input SGPR.
1699  Info.setScratchRSrcReg(ReservedBufferReg);
1700 
1701  if (HasStackObjects && !MFI.hasCalls()) {
1702  unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1704  Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1705  } else {
1706  unsigned ReservedOffsetReg
1708  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1709  }
1710  }
1711 }
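// Roughly (illustrative): a Code Object V2 kernel that needs scratch keeps the
// first four user SGPRs as the private segment buffer resource, while other
// configurations fall back to registers reserved near the end of the SGPR
// file and patch them after register allocation, as described above.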
1712 
1715  return !Info->isEntryFunction();
1716 }
1717 
1719 
1720 }
1721 
1723  MachineBasicBlock *Entry,
1724  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1726 
1727  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1728  if (!IStart)
1729  return;
1730 
1731  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1732  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1733  MachineBasicBlock::iterator MBBI = Entry->begin();
1734  for (const MCPhysReg *I = IStart; *I; ++I) {
1735  const TargetRegisterClass *RC = nullptr;
1736  if (AMDGPU::SReg_64RegClass.contains(*I))
1737  RC = &AMDGPU::SGPR_64RegClass;
1738  else if (AMDGPU::SReg_32RegClass.contains(*I))
1739  RC = &AMDGPU::SGPR_32RegClass;
1740  else
1741  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1742 
1743  unsigned NewVR = MRI->createVirtualRegister(RC);
1744  // Create copy from CSR to a virtual register.
1745  Entry->addLiveIn(*I);
1746  BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1747  .addReg(*I);
1748 
1749  // Insert the copy-back instructions right before the terminator.
1750  for (auto *Exit : Exits)
1751  BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1752  TII->get(TargetOpcode::COPY), *I)
1753  .addReg(NewVR);
1754  }
1755 }
1756 
1757 SDValue SITargetLowering::LowerFormalArguments(
1758  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1759  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1760  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1762 
1763  MachineFunction &MF = DAG.getMachineFunction();
1764  const Function &Fn = MF.getFunction();
1765  FunctionType *FType = MF.getFunction().getFunctionType();
1767  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1768 
1769  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
1770  DiagnosticInfoUnsupported NoGraphicsHSA(
1771  Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
1772  DAG.getContext()->diagnose(NoGraphicsHSA);
1773  return DAG.getEntryNode();
1774  }
1775 
1776  // Create stack objects that are used for emitting debugger prologue if
1777  // "amdgpu-debugger-emit-prologue" attribute was specified.
1778  if (ST.debuggerEmitPrologue())
1779  createDebuggerPrologueStackObjects(MF);
1780 
1783  BitVector Skipped(Ins.size());
1784  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1785  *DAG.getContext());
1786 
1787  bool IsShader = AMDGPU::isShader(CallConv);
1788  bool IsKernel = AMDGPU::isKernel(CallConv);
1789  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
1790 
1791  if (!IsEntryFunc) {
1792  // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1793  // this when allocating argument fixed offsets.
1794  CCInfo.AllocateStack(4, 4);
1795  }
1796 
1797  if (IsShader) {
1798  processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1799 
1800  // At least one interpolation mode must be enabled or else the GPU will
1801  // hang.
1802  //
1803  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1804  // set PSInputAddr, the user wants to enable some bits after the compilation
1805  // based on run-time states. Since we can't know what the final PSInputEna
1806  // will look like, so we shouldn't do anything here and the user should take
1807  // responsibility for the correct programming.
1808  //
1809  // Otherwise, the following restrictions apply:
1810  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1811  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1812  // enabled too.
1813  if (CallConv == CallingConv::AMDGPU_PS) {
1814  if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1815  ((Info->getPSInputAddr() & 0xF) == 0 &&
1816  Info->isPSInputAllocated(11))) {
1817  CCInfo.AllocateReg(AMDGPU::VGPR0);
1818  CCInfo.AllocateReg(AMDGPU::VGPR1);
1819  Info->markPSInputAllocated(0);
1820  Info->markPSInputEnabled(0);
1821  }
1822  if (Subtarget->isAmdPalOS()) {
1823  // For isAmdPalOS, the user does not enable some bits after compilation
1824  // based on run-time states; the register values being generated here are
1825  // the final ones set in hardware. Therefore we need to apply the
1826  // workaround to PSInputAddr and PSInputEnable together. (The case where
1827  // a bit is set in PSInputAddr but not PSInputEnable is where the
1828  // frontend set up an input arg for a particular interpolation mode, but
1829  // nothing uses that input arg. Really we should have an earlier pass
1830  // that removes such an arg.)
1831  unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1832  if ((PsInputBits & 0x7F) == 0 ||
1833  ((PsInputBits & 0xF) == 0 &&
1834  (PsInputBits >> 11 & 1)))
1835  Info->markPSInputEnabled(
1837  }
1838  }
1839 
1840  assert(!Info->hasDispatchPtr() &&
1841  !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1842  !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1843  !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1844  !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1845  !Info->hasWorkItemIDZ());
1846  } else if (IsKernel) {
1847  assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
1848  } else {
1849  Splits.append(Ins.begin(), Ins.end());
1850  }
1851 
1852  if (IsEntryFunc) {
1853  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
1854  allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
1855  }
1856 
1857  if (IsKernel) {
1858  analyzeFormalArgumentsCompute(CCInfo, Ins);
1859  } else {
1860  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1861  CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1862  }
1863 
1864  SmallVector<SDValue, 16> Chains;
1865 
1866  // FIXME: This is the minimum kernel argument alignment. We should improve
1867  // this to the maximum alignment of the arguments.
1868  //
1869  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
1870  // kern arg offset.
1871  const unsigned KernelArgBaseAlign = 16;
1872 
1873  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
1874  const ISD::InputArg &Arg = Ins[i];
1875  if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
1876  InVals.push_back(DAG.getUNDEF(Arg.VT));
1877  continue;
1878  }
1879 
1880  CCValAssign &VA = ArgLocs[ArgIdx++];
1881  MVT VT = VA.getLocVT();
1882 
1883  if (IsEntryFunc && VA.isMemLoc()) {
1884  VT = Ins[i].VT;
1885  EVT MemVT = VA.getLocVT();
1886 
1887  const uint64_t Offset = VA.getLocMemOffset();
1888  unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
1889 
1890  SDValue Arg = lowerKernargMemParameter(
1891  DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
1892  Chains.push_back(Arg.getValue(1));
1893 
1894  auto *ParamTy =
1895  dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
1896  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
1897  ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1898  // On SI local pointers are just offsets into LDS, so they are always
1899  // less than 16-bits. On CI and newer they could potentially be
1900  // real pointers, so we can't guarantee their size.
1901  Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
1902  DAG.getValueType(MVT::i16));
1903  }
1904 
1905  InVals.push_back(Arg);
1906  continue;
1907  } else if (!IsEntryFunc && VA.isMemLoc()) {
1908  SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
1909  InVals.push_back(Val);
1910  if (!Arg.Flags.isByVal())
1911  Chains.push_back(Val.getValue(1));
1912  continue;
1913  }
1914 
1915  assert(VA.isRegLoc() && "Parameter must be in a register!");
1916 
1917  unsigned Reg = VA.getLocReg();
1918  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
1919  EVT ValVT = VA.getValVT();
1920 
1921  Reg = MF.addLiveIn(Reg, RC);
1922  SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1923 
1924  if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
1925  // The return object should be reasonably addressable.
1926 
1927  // FIXME: This helps when the return is a real sret. If it is an
1928  // automatically inserted sret (i.e. CanLowerReturn returns false), an
1929  // extra copy is inserted in SelectionDAGBuilder which obscures this.
1930  unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
1931  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1932  DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
1933  }
1934 
1935  // If this is an 8 or 16-bit value, it is really passed promoted
1936  // to 32 bits. Insert an assert[sz]ext to capture this, then
1937  // truncate to the right size.
1938  switch (VA.getLocInfo()) {
1939  case CCValAssign::Full:
1940  break;
1941  case CCValAssign::BCvt:
1942  Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
1943  break;
1944  case CCValAssign::SExt:
1945  Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
1946  DAG.getValueType(ValVT));
1947  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1948  break;
1949  case CCValAssign::ZExt:
1950  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1951  DAG.getValueType(ValVT));
1952  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1953  break;
1954  case CCValAssign::AExt:
1955  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1956  break;
1957  default:
1958  llvm_unreachable("Unknown loc info!");
1959  }
1960 
1961  InVals.push_back(Val);
1962  }
1963 
1964  if (!IsEntryFunc) {
1965  // Special inputs come after user arguments.
1966  allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
1967  }
1968 
1969  // Start adding system SGPRs.
1970  if (IsEntryFunc) {
1971  allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
1972  } else {
1973  CCInfo.AllocateReg(Info->getScratchRSrcReg());
1974  CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
1975  CCInfo.AllocateReg(Info->getFrameOffsetReg());
1976  allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
1977  }
1978 
1979  auto &ArgUsageInfo =
1981  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
1982 
1983  unsigned StackArgSize = CCInfo.getNextStackOffset();
1984  Info->setBytesInStackArgArea(StackArgSize);
1985 
1986  return Chains.empty() ? Chain :
1987  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
1988 }
1989 
1990 // TODO: If return values can't fit in registers, we should return as many as
1991 // possible in registers before passing on stack.
1993  CallingConv::ID CallConv,
1994  MachineFunction &MF, bool IsVarArg,
1995  const SmallVectorImpl<ISD::OutputArg> &Outs,
1996  LLVMContext &Context) const {
1997  // Replacing returns with sret/stack usage doesn't make sense for shaders.
1998  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
1999  // for shaders. Vector types should be explicitly handled by CC.
2000  if (AMDGPU::isEntryFunctionCC(CallConv))
2001  return true;
2002 
2004  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2005  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2006 }
2007 
2008 SDValue
2009 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2010  bool isVarArg,
2011  const SmallVectorImpl<ISD::OutputArg> &Outs,
2012  const SmallVectorImpl<SDValue> &OutVals,
2013  const SDLoc &DL, SelectionDAG &DAG) const {
2014  MachineFunction &MF = DAG.getMachineFunction();
2016 
2017  if (AMDGPU::isKernel(CallConv)) {
2018  return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2019  OutVals, DL, DAG);
2020  }
2021 
2022  bool IsShader = AMDGPU::isShader(CallConv);
2023 
2024  Info->setIfReturnsVoid(Outs.empty());
2025  bool IsWaveEnd = Info->returnsVoid() && IsShader;
2026 
2027  // CCValAssign - represent the assignment of the return value to a location.
2030 
2031  // CCState - Info about the registers and stack slots.
2032  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2033  *DAG.getContext());
2034 
2035  // Analyze outgoing return values.
2036  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2037 
2038  SDValue Flag;
2039  SmallVector<SDValue, 48> RetOps;
2040  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2041 
2042  // Add return address for callable functions.
2043  if (!Info->isEntryFunction()) {
2045  SDValue ReturnAddrReg = CreateLiveInRegister(
2046  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2047 
2048  // FIXME: Should be able to use a vreg here, but need a way to prevent it
2049  // from being allocated to a CSR.
2050 
2051  SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2052  MVT::i64);
2053 
2054  Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
2055  Flag = Chain.getValue(1);
2056 
2057  RetOps.push_back(PhysReturnAddrReg);
2058  }
2059 
2060  // Copy the result values into the output registers.
2061  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2062  ++I, ++RealRVLocIdx) {
2063  CCValAssign &VA = RVLocs[I];
2064  assert(VA.isRegLoc() && "Can only return in registers!");
2065  // TODO: Partially return in registers if return values don't fit.
2066  SDValue Arg = OutVals[RealRVLocIdx];
2067 
2068  // Copied from other backends.
2069  switch (VA.getLocInfo()) {
2070  case CCValAssign::Full:
2071  break;
2072  case CCValAssign::BCvt:
2073  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2074  break;
2075  case CCValAssign::SExt:
2076  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2077  break;
2078  case CCValAssign::ZExt:
2079  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2080  break;
2081  case CCValAssign::AExt:
2082  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2083  break;
2084  default:
2085  llvm_unreachable("Unknown loc info!");
2086  }
2087 
2088  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2089  Flag = Chain.getValue(1);
2090  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2091  }
2092 
2093  // FIXME: Does sret work properly?
2094  if (!Info->isEntryFunction()) {
2095  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2096  const MCPhysReg *I =
2098  if (I) {
2099  for (; *I; ++I) {
2100  if (AMDGPU::SReg_64RegClass.contains(*I))
2101  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2102  else if (AMDGPU::SReg_32RegClass.contains(*I))
2103  RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2104  else
2105  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2106  }
2107  }
2108  }
2109 
2110  // Update chain and glue.
2111  RetOps[0] = Chain;
2112  if (Flag.getNode())
2113  RetOps.push_back(Flag);
2114 
2115  unsigned Opc = AMDGPUISD::ENDPGM;
2116  if (!IsWaveEnd)
2118  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2119 }
2120 
2121 SDValue SITargetLowering::LowerCallResult(
2122  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2123  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2124  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2125  SDValue ThisVal) const {
2126  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2127 
2128  // Assign locations to each value returned by this call.
2129  SmallVector<CCValAssign, 16> RVLocs;
2130  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2131  *DAG.getContext());
2132  CCInfo.AnalyzeCallResult(Ins, RetCC);
2133 
2134  // Copy all of the result registers out of their specified physreg.
2135  for (unsigned i = 0; i != RVLocs.size(); ++i) {
2136  CCValAssign VA = RVLocs[i];
2137  SDValue Val;
2138 
2139  if (VA.isRegLoc()) {
2140  Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2141  Chain = Val.getValue(1);
2142  InFlag = Val.getValue(2);
2143  } else if (VA.isMemLoc()) {
2144  report_fatal_error("TODO: return values in memory");
2145  } else
2146  llvm_unreachable("unknown argument location type");
2147 
2148  switch (VA.getLocInfo()) {
2149  case CCValAssign::Full:
2150  break;
2151  case CCValAssign::BCvt:
2152  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2153  break;
2154  case CCValAssign::ZExt:
2155  Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2156  DAG.getValueType(VA.getValVT()));
2157  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2158  break;
2159  case CCValAssign::SExt:
2160  Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2161  DAG.getValueType(VA.getValVT()));
2162  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2163  break;
2164  case CCValAssign::AExt:
2165  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2166  break;
2167  default:
2168  llvm_unreachable("Unknown loc info!");
2169  }
2170 
2171  InVals.push_back(Val);
2172  }
2173 
2174  return Chain;
2175 }
2176 
2177 // Add code to pass special inputs required depending on used features separate
2178 // from the explicit user arguments present in the IR.
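// For example, a callee that reads the workgroup ID or the implicit argument
// pointer receives it here: if the callee was assigned an input register for
// that value, the caller copies its own incoming copy into that register;
// otherwise the value is stored to the reserved stack slot allocated below.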
2179 void SITargetLowering::passSpecialInputs(
2180  CallLoweringInfo &CLI,
2181  CCState &CCInfo,
2182  const SIMachineFunctionInfo &Info,
2183  SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2184  SmallVectorImpl<SDValue> &MemOpChains,
2185  SDValue Chain) const {
2186  // If we don't have a call site, this was a call inserted by
2187  // legalization. These can never use special inputs.
2188  if (!CLI.CS)
2189  return;
2190 
2191  const Function *CalleeFunc = CLI.CS.getCalledFunction();
2192  assert(CalleeFunc);
2193 
2194  SelectionDAG &DAG = CLI.DAG;
2195  const SDLoc &DL = CLI.DL;
2196 
2197  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2198 
2199  auto &ArgUsageInfo =
2200  DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2201  const AMDGPUFunctionArgInfo &CalleeArgInfo
2202  = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2203 
2204  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2205 
2206  // TODO: Unify with private memory register handling. This is complicated by
2207  // the fact that at least in kernels, the input argument is not necessarily
2208  // in the same location as the input.
2221  };
2222 
2223  for (auto InputID : InputRegs) {
2224  const ArgDescriptor *OutgoingArg;
2225  const TargetRegisterClass *ArgRC;
2226 
2227  std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2228  if (!OutgoingArg)
2229  continue;
2230 
2231  const ArgDescriptor *IncomingArg;
2232  const TargetRegisterClass *IncomingArgRC;
2233  std::tie(IncomingArg, IncomingArgRC)
2234  = CallerArgInfo.getPreloadedValue(InputID);
2235  assert(IncomingArgRC == ArgRC);
2236 
2237  // All special arguments are ints for now.
2238  EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2239  SDValue InputReg;
2240 
2241  if (IncomingArg) {
2242  InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2243  } else {
2244  // The implicit arg ptr is special because it doesn't have a corresponding
2245  // input for kernels, and is computed from the kernarg segment pointer.
2246  assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2247  InputReg = getImplicitArgPtr(DAG, DL);
2248  }
2249 
2250  if (OutgoingArg->isRegister()) {
2251  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2252  } else {
2253  unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
2254  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2255  SpecialArgOffset);
2256  MemOpChains.push_back(ArgStore);
2257  }
2258  }
2259 }
2260 
2261 static bool canGuaranteeTCO(CallingConv::ID CC) {
2262  return CC == CallingConv::Fast;
2263 }
2264 
2265 /// Return true if we might ever do TCO for calls with this calling convention.
2266 static bool mayTailCallThisCC(CallingConv::ID CC) {
2267  switch (CC) {
2268  case CallingConv::C:
2269  return true;
2270  default:
2271  return canGuaranteeTCO(CC);
2272  }
2273 }
2274 
2275 bool SITargetLowering::isEligibleForTailCallOptimization(
2276  SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2277  const SmallVectorImpl<ISD::OutputArg> &Outs,
2278  const SmallVectorImpl<SDValue> &OutVals,
2279  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2280  if (!mayTailCallThisCC(CalleeCC))
2281  return false;
2282 
2283  MachineFunction &MF = DAG.getMachineFunction();
2284  const Function &CallerF = MF.getFunction();
2285  CallingConv::ID CallerCC = CallerF.getCallingConv();
2286  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2287  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2288 
2289  // Kernels aren't callable, and don't have a live in return address so it
2290  // doesn't make sense to do a tail call with entry functions.
2291  if (!CallerPreserved)
2292  return false;
2293 
2294  bool CCMatch = CallerCC == CalleeCC;
2295 
2296  if (MF.getTarget().Options.GuaranteedTailCallOpt) {
2297  if (canGuaranteeTCO(CalleeCC) && CCMatch)
2298  return true;
2299  return false;
2300  }
2301 
2302  // TODO: Can we handle var args?
2303  if (IsVarArg)
2304  return false;
2305 
2306  for (const Argument &Arg : CallerF.args()) {
2307  if (Arg.hasByValAttr())
2308  return false;
2309  }
2310 
2311  LLVMContext &Ctx = *DAG.getContext();
2312 
2313  // Check that the call results are passed in the same way.
2314  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2315  CCAssignFnForCall(CalleeCC, IsVarArg),
2316  CCAssignFnForCall(CallerCC, IsVarArg)))
2317  return false;
2318 
2319  // The callee has to preserve all registers the caller needs to preserve.
2320  if (!CCMatch) {
2321  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2322  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2323  return false;
2324  }
2325 
2326  // Nothing more to check if the callee is taking no arguments.
2327  if (Outs.empty())
2328  return true;
2329 
2330  SmallVector<CCValAssign, 16> ArgLocs;
2331  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2332 
2333  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2334 
2335  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2336  // If the stack arguments for this call do not fit into our own save area then
2337  // the call cannot be made tail.
2338  // TODO: Is this really necessary?
2339  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2340  return false;
2341 
2342  const MachineRegisterInfo &MRI = MF.getRegInfo();
2343  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2344 }
2345 
2346 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2347  if (!CI->isTailCall())
2348  return false;
2349 
2350  const Function *ParentFn = CI->getParent()->getParent();
2351  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2352  return false;
2353 
2354  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2355  return (Attr.getValueAsString() != "true");
2356 }
2357 
2358 // The wave scratch offset register is used as the global base pointer.
2359 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2360  SmallVectorImpl<SDValue> &InVals) const {
2361  SelectionDAG &DAG = CLI.DAG;
2362  const SDLoc &DL = CLI.DL;
2363  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2364  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2365  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2366  SDValue Chain = CLI.Chain;
2367  SDValue Callee = CLI.Callee;
2368  bool &IsTailCall = CLI.IsTailCall;
2369  CallingConv::ID CallConv = CLI.CallConv;
2370  bool IsVarArg = CLI.IsVarArg;
2371  bool IsSibCall = false;
2372  bool IsThisReturn = false;
2373  MachineFunction &MF = DAG.getMachineFunction();
2374 
2375  if (IsVarArg) {
2376  return lowerUnhandledCall(CLI, InVals,
2377  "unsupported call to variadic function ");
2378  }
2379 
2380  if (!CLI.CS.getInstruction())
2381  report_fatal_error("unsupported libcall legalization");
2382 
2383  if (!CLI.CS.getCalledFunction()) {
2384  return lowerUnhandledCall(CLI, InVals,
2385  "unsupported indirect call to function ");
2386  }
2387 
2388  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2389  return lowerUnhandledCall(CLI, InVals,
2390  "unsupported required tail call to function ");
2391  }
2392 
2393  if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
2394  // Note the issue is with the CC of the calling function, not of the call
2395  // itself.
2396  return lowerUnhandledCall(CLI, InVals,
2397  "unsupported call from graphics shader of function ");
2398  }
2399 
2400  // The first 4 bytes are reserved for the callee's emergency stack slot.
2401  if (IsTailCall) {
2402  IsTailCall = isEligibleForTailCallOptimization(
2403  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2404  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2405  report_fatal_error("failed to perform tail call elimination on a call "
2406  "site marked musttail");
2407  }
2408 
2409  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2410 
2411  // A sibling call is one where we're under the usual C ABI and not planning
2412  // to change that but can still do a tail call:
2413  if (!TailCallOpt && IsTailCall)
2414  IsSibCall = true;
2415 
2416  if (IsTailCall)
2417  ++NumTailCalls;
2418  }
2419 
2420  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2421 
2422  // Analyze operands of the call, assigning locations to each operand.
2423  SmallVector<CCValAssign, 16> ArgLocs;
2424  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2425  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
2426 
2427  // The first 4 bytes are reserved for the callee's emergency stack slot.
2428  CCInfo.AllocateStack(4, 4);
2429 
2430  CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2431 
2432  // Get a count of how many bytes are to be pushed on the stack.
2433  unsigned NumBytes = CCInfo.getNextStackOffset();
2434 
2435  if (IsSibCall) {
2436  // Since we're not changing the ABI to make this a tail call, the memory
2437  // operands are already available in the caller's incoming argument space.
2438  NumBytes = 0;
2439  }
2440 
2441  // FPDiff is the byte offset of the call's argument area from the callee's.
2442  // Stores to callee stack arguments will be placed in FixedStackSlots offset
2443  // by this amount for a tail call. In a sibling call it must be 0 because the
2444  // caller will deallocate the entire stack and the callee still expects its
2445  // arguments to begin at SP+0. Completely unused for non-tail calls.
2446  int32_t FPDiff = 0;
2447  MachineFrameInfo &MFI = MF.getFrameInfo();
2448  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2449 
2450  SDValue CallerSavedFP;
2451 
2452  // Adjust the stack pointer for the new arguments...
2453  // These operations are automatically eliminated by the prolog/epilog pass
2454  if (!IsSibCall) {
2455  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
2456 
2457  unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2458 
2459  // In the HSA case, this should be an identity copy.
2460  SDValue ScratchRSrcReg
2461  = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2462  RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2463 
2464  // TODO: Don't hardcode these registers and get from the callee function.
2465  SDValue ScratchWaveOffsetReg
2466  = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2467  RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
2468 
2469  if (!Info->isEntryFunction()) {
2470  // Avoid clobbering this function's FP value. In the current convention
2471  // callee will overwrite this, so do save/restore around the call site.
2472  CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2473  Info->getFrameOffsetReg(), MVT::i32);
2474  }
2475  }
2476 
2477  SmallVector<SDValue, 8> MemOpChains;
2478  MVT PtrVT = MVT::i32;
2479 
2480  // Walk the register/memloc assignments, inserting copies/loads.
2481  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2482  ++i, ++realArgIdx) {
2483  CCValAssign &VA = ArgLocs[i];
2484  SDValue Arg = OutVals[realArgIdx];
2485 
2486  // Promote the value if needed.
2487  switch (VA.getLocInfo()) {
2488  case CCValAssign::Full:
2489  break;
2490  case CCValAssign::BCvt:
2491  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2492  break;
2493  case CCValAssign::ZExt:
2494  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2495  break;
2496  case CCValAssign::SExt:
2497  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2498  break;
2499  case CCValAssign::AExt:
2500  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2501  break;
2502  case CCValAssign::FPExt:
2503  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2504  break;
2505  default:
2506  llvm_unreachable("Unknown loc info!");
2507  }
2508 
2509  if (VA.isRegLoc()) {
2510  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2511  } else {
2512  assert(VA.isMemLoc());
2513 
2514  SDValue DstAddr;
2515  MachinePointerInfo DstInfo;
2516 
2517  unsigned LocMemOffset = VA.getLocMemOffset();
2518  int32_t Offset = LocMemOffset;
2519 
2520  SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
2521  unsigned Align = 0;
2522 
2523  if (IsTailCall) {
2524  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2525  unsigned OpSize = Flags.isByVal() ?
2526  Flags.getByValSize() : VA.getValVT().getStoreSize();
2527 
2528  // FIXME: We can have better than the minimum byval required alignment.
2529  Align = Flags.isByVal() ? Flags.getByValAlign() :
2530  MinAlign(Subtarget->getStackAlignment(), Offset);
2531 
2532  Offset = Offset + FPDiff;
2533  int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2534 
2535  DstAddr = DAG.getFrameIndex(FI, PtrVT);
2536  DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2537 
2538  // Make sure any stack arguments overlapping with where we're storing
2539  // are loaded before this eventual operation. Otherwise they'll be
2540  // clobbered.
2541 
2542  // FIXME: Why is this really necessary? This seems to just result in a
2543  // lot of code to copy the stack and write them back to the same
2544  // locations, which are supposed to be immutable?
2545  Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2546  } else {
2547  DstAddr = PtrOff;
2548  DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
2549  Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
2550  }
2551 
2552  if (Outs[i].Flags.isByVal()) {
2553  SDValue SizeNode =
2554  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2555  SDValue Cpy = DAG.getMemcpy(
2556  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2557  /*isVol = */ false, /*AlwaysInline = */ true,
2558  /*isTailCall = */ false, DstInfo,
2559  MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
2560  *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));
2561 
2562  MemOpChains.push_back(Cpy);
2563  } else {
2564  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
2565  MemOpChains.push_back(Store);
2566  }
2567  }
2568  }
2569 
2570  // Copy special input registers after user input arguments.
2571  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
2572 
2573  if (!MemOpChains.empty())
2574  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2575 
2576  // Build a sequence of copy-to-reg nodes chained together with token chain
2577  // and flag operands which copy the outgoing args into the appropriate regs.
2578  SDValue InFlag;
2579  for (auto &RegToPass : RegsToPass) {
2580  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2581  RegToPass.second, InFlag);
2582  InFlag = Chain.getValue(1);
2583  }
2584 
2585 
2586  SDValue PhysReturnAddrReg;
2587  if (IsTailCall) {
2588  // Since the return is being combined with the call, we need to pass on the
2589  // return address.
2590 
2591  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2592  SDValue ReturnAddrReg = CreateLiveInRegister(
2593  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2594 
2595  PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2596  MVT::i64);
2597  Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2598  InFlag = Chain.getValue(1);
2599  }
2600 
2601  // We don't usually want to end the call-sequence here because we would tidy
2602  // the frame up *after* the call, however in the ABI-changing tail-call case
2603  // we've carefully laid out the parameters so that when sp is reset they'll be
2604  // in the correct location.
2605  if (IsTailCall && !IsSibCall) {
2606  Chain = DAG.getCALLSEQ_END(Chain,
2607  DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2608  DAG.getTargetConstant(0, DL, MVT::i32),
2609  InFlag, DL);
2610  InFlag = Chain.getValue(1);
2611  }
2612 
2613  std::vector<SDValue> Ops;
2614  Ops.push_back(Chain);
2615  Ops.push_back(Callee);
2616 
2617  if (IsTailCall) {
2618  // Each tail call may have to adjust the stack by a different amount, so
2619  // this information must travel along with the operation for eventual
2620  // consumption by emitEpilogue.
2621  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
2622 
2623  Ops.push_back(PhysReturnAddrReg);
2624  }
2625 
2626  // Add argument registers to the end of the list so that they are known live
2627  // into the call.
2628  for (auto &RegToPass : RegsToPass) {
2629  Ops.push_back(DAG.getRegister(RegToPass.first,
2630  RegToPass.second.getValueType()));
2631  }
2632 
2633  // Add a register mask operand representing the call-preserved registers.
2634 
2635  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
2636  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2637  assert(Mask && "Missing call preserved mask for calling convention");
2638  Ops.push_back(DAG.getRegisterMask(Mask));
2639 
2640  if (InFlag.getNode())
2641  Ops.push_back(InFlag);
2642 
2643  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2644 
2645  // If we're doing a tail call, use a TC_RETURN here rather than an
2646  // actual call instruction.
2647  if (IsTailCall) {
2648  MFI.setHasTailCall();
2649  return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
2650  }
2651 
2652  // Returns a chain and a flag for retval copy to use.
2653  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2654  Chain = Call.getValue(0);
2655  InFlag = Call.getValue(1);
2656 
2657  if (CallerSavedFP) {
2658  SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2659  Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2660  InFlag = Chain.getValue(1);
2661  }
2662 
2663  uint64_t CalleePopBytes = NumBytes;
2664  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
2665  DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2666  InFlag, DL);
2667  if (!Ins.empty())
2668  InFlag = Chain.getValue(1);
2669 
2670  // Handle result values, copying them out of physregs into vregs that we
2671  // return.
2672  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2673  InVals, IsThisReturn,
2674  IsThisReturn ? OutVals[0] : SDValue());
2675 }
2676 
2677 unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2678  SelectionDAG &DAG) const {
2679  unsigned Reg = StringSwitch<unsigned>(RegName)
2680  .Case("m0", AMDGPU::M0)
2681  .Case("exec", AMDGPU::EXEC)
2682  .Case("exec_lo", AMDGPU::EXEC_LO)
2683  .Case("exec_hi", AMDGPU::EXEC_HI)
2684  .Case("flat_scratch", AMDGPU::FLAT_SCR)
2685  .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2686  .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2687  .Default(AMDGPU::NoRegister);
2688 
2689  if (Reg == AMDGPU::NoRegister) {
2690  report_fatal_error(Twine("invalid register name \""
2691  + StringRef(RegName) + "\"."));
2692 
2693  }
2694 
2695  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2696  Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2697  report_fatal_error(Twine("invalid register \""
2698  + StringRef(RegName) + "\" for subtarget."));
2699  }
2700 
2701  switch (Reg) {
2702  case AMDGPU::M0:
2703  case AMDGPU::EXEC_LO:
2704  case AMDGPU::EXEC_HI:
2705  case AMDGPU::FLAT_SCR_LO:
2706  case AMDGPU::FLAT_SCR_HI:
2707  if (VT.getSizeInBits() == 32)
2708  return Reg;
2709  break;
2710  case AMDGPU::EXEC:
2711  case AMDGPU::FLAT_SCR:
2712  if (VT.getSizeInBits() == 64)
2713  return Reg;
2714  break;
2715  default:
2716  llvm_unreachable("missing register type checking");
2717  }
2718 
2719  report_fatal_error(Twine("invalid type for register \""
2720  + StringRef(RegName) + "\"."));
2721 }
2722 
2723 // If kill is not the last instruction, split the block so kill is always a
2724 // proper terminator.
2725 MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
2726  MachineBasicBlock *BB) const {
2727  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2728 
2729  MachineBasicBlock::iterator SplitPoint(&MI);
2730  ++SplitPoint;
2731 
2732  if (SplitPoint == BB->end()) {
2733  // Don't bother with a new block.
2734  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
2735  return BB;
2736  }
2737 
2738  MachineFunction *MF = BB->getParent();
2739  MachineBasicBlock *SplitBB
2740  = MF->CreateMachineBasicBlock(BB->getBasicBlock());
2741 
2742  MF->insert(++MachineFunction::iterator(BB), SplitBB);
2743  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2744 
2745  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
2746  BB->addSuccessor(SplitBB);
2747 
2748  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
2749  return SplitBB;
2750 }
2751 
2752 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2753 // wavefront. If the value is uniform and just happens to be in a VGPR, this
2754 // will only do one iteration. In the worst case, this will loop 64 times.
2755 //
2756 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
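// The loop emitted below has roughly this shape:
//
//   loop:
//     CurrentIdx = V_READFIRSTLANE_B32 IdxReg
//     Cond       = V_CMP_EQ_U32 CurrentIdx, IdxReg
//     NewExec    = S_AND_SAVEEXEC_B64 Cond
//     M0         = CurrentIdx + Offset   (or S_SET_GPR_IDX_ON in GPR-index mode)
//     <indirect move inserted by the caller at the returned point>
//     EXEC       = S_XOR_B64 EXEC, NewExec
//     S_CBRANCH_EXECNZ loop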
2757 static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
2758  const SIInstrInfo *TII,
2759  MachineRegisterInfo &MRI,
2760  MachineBasicBlock &OrigBB,
2761  MachineBasicBlock &LoopBB,
2762  const DebugLoc &DL,
2763  const MachineOperand &IdxReg,
2764  unsigned InitReg,
2765  unsigned ResultReg,
2766  unsigned PhiReg,
2767  unsigned InitSaveExecReg,
2768  int Offset,
2769  bool UseGPRIdxMode,
2770  bool IsIndirectSrc) {
2771  MachineBasicBlock::iterator I = LoopBB.begin();
2772 
2773  unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2774  unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2775  unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2776  unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2777 
2778  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2779  .addReg(InitReg)
2780  .addMBB(&OrigBB)
2781  .addReg(ResultReg)
2782  .addMBB(&LoopBB);
2783 
2784  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2785  .addReg(InitSaveExecReg)
2786  .addMBB(&OrigBB)
2787  .addReg(NewExec)
2788  .addMBB(&LoopBB);
2789 
2790  // Read the next variant <- also loop target.
2791  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2792  .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2793 
2794  // Compare the just read M0 value to all possible Idx values.
2795  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2796  .addReg(CurrentIdxReg)
2797  .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
2798 
2799  // Update EXEC, save the original EXEC value to VCC.
2800  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2801  .addReg(CondReg, RegState::Kill);
2802 
2803  MRI.setSimpleHint(NewExec, CondReg);
2804 
2805  if (UseGPRIdxMode) {
2806  unsigned IdxReg;
2807  if (Offset == 0) {
2808  IdxReg = CurrentIdxReg;
2809  } else {
2810  IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2811  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2812  .addReg(CurrentIdxReg, RegState::Kill)
2813  .addImm(Offset);
2814  }
2815  unsigned IdxMode = IsIndirectSrc ?
2816  AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
2817  MachineInstr *SetOn =
2818  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2819  .addReg(IdxReg, RegState::Kill)
2820  .addImm(IdxMode);
2821  SetOn->getOperand(3).setIsUndef();
2822  } else {
2823  // Move index from VCC into M0
2824  if (Offset == 0) {
2825  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2826  .addReg(CurrentIdxReg, RegState::Kill);
2827  } else {
2828  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2829  .addReg(CurrentIdxReg, RegState::Kill)
2830  .addImm(Offset);
2831  }
2832  }
2833 
2834  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
2835  MachineInstr *InsertPt =
2836  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
2837  .addReg(AMDGPU::EXEC)
2838  .addReg(NewExec);
2839 
2840  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2841  // s_cbranch_scc0?
2842 
2843  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2844  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2845  .addMBB(&LoopBB);
2846 
2847  return InsertPt->getIterator();
2848 }
2849 
2850 // This has slightly sub-optimal regalloc when the source vector is killed by
2851 // the read. The register allocator does not understand that the kill is
2852 // per-workitem, so the source is kept live for the whole loop and we end up
2853 // not re-using a subregister from it, using 1 more VGPR than necessary. This
2854 // extra VGPR was not needed back when this was expanded after register allocation.
2855 static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
2856  MachineBasicBlock &MBB,
2857  MachineInstr &MI,
2858  unsigned InitResultReg,
2859  unsigned PhiReg,
2860  int Offset,
2861  bool UseGPRIdxMode,
2862  bool IsIndirectSrc) {
2863  MachineFunction *MF = MBB.getParent();
2864  MachineRegisterInfo &MRI = MF->getRegInfo();
2865  const DebugLoc &DL = MI.getDebugLoc();
2866  MachineBasicBlock::iterator I(&MI);
2867 
2868  unsigned DstReg = MI.getOperand(0).getReg();
2869  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2870  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2871 
2872  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
2873 
2874  // Save the EXEC mask
2875  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
2876  .addReg(AMDGPU::EXEC);
2877 
2878  // To insert the loop we need to split the block. Move everything after this
2879  // point to a new block, and insert a new empty block between the two.
2880  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
2881  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
2882  MachineFunction::iterator MBBI(MBB);
2883  ++MBBI;
2884 
2885  MF->insert(MBBI, LoopBB);
2886  MF->insert(MBBI, RemainderBB);
2887 
2888  LoopBB->addSuccessor(LoopBB);
2889  LoopBB->addSuccessor(RemainderBB);
2890 
2891  // Move the rest of the block into a new block.
2892  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
2893  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
2894 
2895  MBB.addSuccessor(LoopBB);
2896 
2897  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2898 
2899  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
2900  InitResultReg, DstReg, PhiReg, TmpExec,
2901  Offset, UseGPRIdxMode, IsIndirectSrc);
2902 
2903  MachineBasicBlock::iterator First = RemainderBB->begin();
2904  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
2905  .addReg(SaveExec);
2906 
2907  return InsPt;
2908 }
2909 
2910 // Returns subreg index, offset
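// For example, indexing a 128-bit register (4 x 32-bit elements) with a
// constant offset of 2 yields {sub2, 0}, so no dynamic index is needed; an
// out-of-bounds offset is returned unchanged on sub0 and handled dynamically.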
2911 static std::pair<unsigned, int>
2912 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
2913  const TargetRegisterClass *SuperRC,
2914  unsigned VecReg,
2915  int Offset) {
2916  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
2917 
2918  // Skip out of bounds offsets, or else we would end up using an undefined
2919  // register.
2920  if (Offset >= NumElts || Offset < 0)
2921  return std::make_pair(AMDGPU::sub0, Offset);
2922 
2923  return std::make_pair(AMDGPU::sub0 + Offset, 0);
2924 }
2925 
2926 // Return true if the index is an SGPR and was set.
2927 static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
2928  MachineRegisterInfo &MRI,
2929  MachineInstr &MI,
2930  int Offset,
2931  bool UseGPRIdxMode,
2932  bool IsIndirectSrc) {
2933  MachineBasicBlock *MBB = MI.getParent();
2934  const DebugLoc &DL = MI.getDebugLoc();
2935  MachineBasicBlock::iterator I(&MI);
2936 
2937  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2938  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
2939 
2940  assert(Idx->getReg() != AMDGPU::NoRegister);
2941 
2942  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
2943  return false;
2944 
2945  if (UseGPRIdxMode) {
2946  unsigned IdxMode = IsIndirectSrc ?
2947  AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
2948  if (Offset == 0) {
2949  MachineInstr *SetOn =
2950  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2951  .add(*Idx)
2952  .addImm(IdxMode);
2953 
2954  SetOn->getOperand(3).setIsUndef();
2955  } else {
2956  unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2957  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
2958  .add(*Idx)
2959  .addImm(Offset);
2960  MachineInstr *SetOn =
2961  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2962  .addReg(Tmp, RegState::Kill)
2963  .addImm(IdxMode);
2964 
2965  SetOn->getOperand(3).setIsUndef();
2966  }
2967 
2968  return true;
2969  }
2970 
2971  if (Offset == 0) {
2972  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2973  .add(*Idx);
2974  } else {
2975  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2976  .add(*Idx)
2977  .addImm(Offset);
2978  }
2979 
2980  return true;
2981 }
2982 
2983 // Control flow needs to be inserted if indexing with a VGPR.
2984 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
2985  MachineBasicBlock &MBB,
2986  const GCNSubtarget &ST) {
2987  const SIInstrInfo *TII = ST.getInstrInfo();
2988  const SIRegisterInfo &TRI = TII->getRegisterInfo();
2989  MachineFunction *MF = MBB.getParent();
2990  MachineRegisterInfo &MRI = MF->getRegInfo();
2991 
2992  unsigned Dst = MI.getOperand(0).getReg();
2993  unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
2994  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
2995 
2996  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
2997 
2998  unsigned SubReg;
2999  std::tie(SubReg, Offset)
3000  = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3001 
3002  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3003 
3004  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
3005  MachineBasicBlock::iterator I(&MI);
3006  const DebugLoc &DL = MI.getDebugLoc();
3007 
3008  if (UseGPRIdxMode) {
3009  // TODO: Look at the uses to avoid the copy. This may require rescheduling
3010  // to avoid interfering with other uses, so probably requires a new
3011  // optimization pass.
3012  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3013  .addReg(SrcReg, RegState::Undef, SubReg)
3014  .addReg(SrcReg, RegState::Implicit)
3015  .addReg(AMDGPU::M0, RegState::Implicit);
3016  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3017  } else {
3018  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3019  .addReg(SrcReg, RegState::Undef, SubReg)
3020  .addReg(SrcReg, RegState::Implicit);
3021  }
3022 
3023  MI.eraseFromParent();
3024 
3025  return &MBB;
3026  }
3027 
3028  const DebugLoc &DL = MI.getDebugLoc();
3029  MachineBasicBlock::iterator I(&MI);
3030 
3031  unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3032  unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3033 
3034  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3035 
3036  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
3037  Offset, UseGPRIdxMode, true);
3038  MachineBasicBlock *LoopBB = InsPt->getParent();
3039 
3040  if (UseGPRIdxMode) {
3041  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3042  .addReg(SrcReg, RegState::Undef, SubReg)
3043  .addReg(SrcReg, RegState::Implicit)
3044  .addReg(AMDGPU::M0, RegState::Implicit);
3045  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3046  } else {
3047  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3048  .addReg(SrcReg, RegState::Undef, SubReg)
3049  .addReg(SrcReg, RegState::Implicit);
3050  }
3051 
3052  MI.eraseFromParent();
3053 
3054  return LoopBB;
3055 }
3056 
3057 static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
3058  const TargetRegisterClass *VecRC) {
3059  switch (TRI.getRegSizeInBits(*VecRC)) {
3060  case 32: // 4 bytes
3061  return AMDGPU::V_MOVRELD_B32_V1;
3062  case 64: // 8 bytes
3063  return AMDGPU::V_MOVRELD_B32_V2;
3064  case 128: // 16 bytes
3065  return AMDGPU::V_MOVRELD_B32_V4;
3066  case 256: // 32 bytes
3067  return AMDGPU::V_MOVRELD_B32_V8;
3068  case 512: // 64 bytes
3069  return AMDGPU::V_MOVRELD_B32_V16;
3070  default:
3071  llvm_unreachable("unsupported size for MOVRELD pseudos");
3072  }
3073 }
3074 
3075 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3076  MachineBasicBlock &MBB,
3077  const GCNSubtarget &ST) {
3078  const SIInstrInfo *TII = ST.getInstrInfo();
3079  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3080  MachineFunction *MF = MBB.getParent();
3081  MachineRegisterInfo &MRI = MF->getRegInfo();
3082 
3083  unsigned Dst = MI.getOperand(0).getReg();
3084  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3085  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3086  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3087  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3088  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3089 
3090  // This can be an immediate, but will be folded later.
3091  assert(Val->getReg());
3092 
3093  unsigned SubReg;
3094  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3095  SrcVec->getReg(),
3096  Offset);
3097  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3098 
3099  if (Idx->getReg() == AMDGPU::NoRegister) {
3100  MachineBasicBlock::iterator I(&MI);
3101  const DebugLoc &DL = MI.getDebugLoc();
3102 
3103  assert(Offset == 0);
3104 
3105  BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3106  .add(*SrcVec)
3107  .add(*Val)
3108  .addImm(SubReg);
3109 
3110  MI.eraseFromParent();
3111  return &MBB;
3112  }
3113 
3114  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
3115  MachineBasicBlock::iterator I(&MI);
3116  const DebugLoc &DL = MI.getDebugLoc();
3117 
3118  if (UseGPRIdxMode) {
3119  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3120  .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
3121  .add(*Val)
3122  .addReg(Dst, RegState::ImplicitDefine)
3123  .addReg(SrcVec->getReg(), RegState::Implicit)
3124  .addReg(AMDGPU::M0, RegState::Implicit);
3125 
3126  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3127  } else {
3128  const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3129 
3130  BuildMI(MBB, I, DL, MovRelDesc)
3131  .addReg(Dst, RegState::Define)
3132  .addReg(SrcVec->getReg())
3133  .add(*Val)
3134  .addImm(SubReg - AMDGPU::sub0);
3135  }
3136 
3137  MI.eraseFromParent();
3138  return &MBB;
3139  }
3140 
3141  if (Val->isReg())
3142  MRI.clearKillFlags(Val->getReg());
3143 
3144  const DebugLoc &DL = MI.getDebugLoc();
3145 
3146  unsigned PhiReg = MRI.createVirtualRegister(VecRC);
3147 
3148  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
3149  Offset, UseGPRIdxMode, false);
3150  MachineBasicBlock *LoopBB = InsPt->getParent();
3151 
3152  if (UseGPRIdxMode) {
3153  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3154  .addReg(PhiReg, RegState::Undef, SubReg) // vdst
3155  .add(*Val) // src0
3156  .addReg(Dst, RegState::ImplicitDefine)
3157  .addReg(PhiReg, RegState::Implicit)
3158  .addReg(AMDGPU::M0, RegState::Implicit);
3159  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3160  } else {
3161  const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3162 
3163  BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
3164  .addReg(Dst, RegState::Define)
3165  .addReg(PhiReg)
3166  .add(*Val)
3167  .addImm(SubReg - AMDGPU::sub0);
3168  }
3169 
3170  MI.eraseFromParent();
3171 
3172  return LoopBB;
3173 }
3174 
3175 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3176  MachineInstr &MI, MachineBasicBlock *BB) const {
3177 
3178  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3179  MachineFunction *MF = BB->getParent();
3180  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3181 
3182  if (TII->isMIMG(MI)) {
3183  if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
3184  report_fatal_error("missing mem operand from MIMG instruction");
3185  }
3186  // Add a memoperand for mimg instructions so that they aren't assumed to
3187  // be ordered memory instructions.
3188 
3189  return BB;
3190  }
3191 
3192  switch (MI.getOpcode()) {
3193  case AMDGPU::S_ADD_U64_PSEUDO:
3194  case AMDGPU::S_SUB_U64_PSEUDO: {
3195  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3196  const DebugLoc &DL = MI.getDebugLoc();
3197 
3198  MachineOperand &Dest = MI.getOperand(0);
3199  MachineOperand &Src0 = MI.getOperand(1);
3200  MachineOperand &Src1 = MI.getOperand(2);
3201 
3202  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3203  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3204 
3205  MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3206  Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3207  &AMDGPU::SReg_32_XM0RegClass);
3208  MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3209  Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3210  &AMDGPU::SReg_32_XM0RegClass);
3211 
3212  MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3213  Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3214  &AMDGPU::SReg_32_XM0RegClass);
3215  MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3216  Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3217  &AMDGPU::SReg_32_XM0RegClass);
3218 
3219  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3220 
3221  unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3222  unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3223  BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3224  .add(Src0Sub0)
3225  .add(Src1Sub0);
3226  BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3227  .add(Src0Sub1)
3228  .add(Src1Sub1);
3229  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3230  .addReg(DestSub0)
3231  .addImm(AMDGPU::sub0)
3232  .addReg(DestSub1)
3233  .addImm(AMDGPU::sub1);
3234  MI.eraseFromParent();
3235  return BB;
3236  }
3237  case AMDGPU::SI_INIT_M0: {
3238  BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
3239  TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3240  .add(MI.getOperand(0));
3241  MI.eraseFromParent();
3242  return BB;
3243  }
3244  case AMDGPU::SI_INIT_EXEC:
3245  // This should be before all vector instructions.
3246  BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3247  AMDGPU::EXEC)
3248  .addImm(MI.getOperand(0).getImm());
3249  MI.eraseFromParent();
3250  return BB;
3251 
3252  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3253  // Extract the thread count from an SGPR input and set EXEC accordingly.
3254  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3255  //
3256  // S_BFE_U32 count, input, {shift, 7}
3257  // S_BFM_B64 exec, count, 0
3258  // S_CMP_EQ_U32 count, 64
3259  // S_CMOV_B64 exec, -1
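  // For example, when the extracted count is 64, S_BFM_B64 would produce 0, so
  // the S_CMP/S_CMOV pair overrides EXEC with all ones instead.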
3260  MachineInstr *FirstMI = &*BB->begin();
3261  MachineRegisterInfo &MRI = MF->getRegInfo();
3262  unsigned InputReg = MI.getOperand(0).getReg();
3263  unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3264  bool Found = false;
3265 
3266  // Move the COPY of the input reg to the beginning, so that we can use it.
3267  for (auto I = BB->begin(); I != &MI; I++) {
3268  if (I->getOpcode() != TargetOpcode::COPY ||
3269  I->getOperand(0).getReg() != InputReg)
3270  continue;
3271 
3272  if (I == FirstMI) {
3273  FirstMI = &*++BB->begin();
3274  } else {
3275  I->removeFromParent();
3276  BB->insert(FirstMI, &*I);
3277  }
3278  Found = true;
3279  break;
3280  }
3281  assert(Found);
3282  (void)Found;
3283 
3284  // This should be before all vector instructions.
3285  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3286  .addReg(InputReg)
3287  .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3288  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3289  AMDGPU::EXEC)
3290  .addReg(CountReg)
3291  .addImm(0);
3292  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3293  .addReg(CountReg, RegState::Kill)
3294  .addImm(64);
3295  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3296  AMDGPU::EXEC)
3297  .addImm(-1);
3298  MI.eraseFromParent();
3299  return BB;
3300  }
3301 
3302  case AMDGPU::GET_GROUPSTATICSIZE: {
3303  DebugLoc DL = MI.getDebugLoc();
3304  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
3305  .add(MI.getOperand(0))
3306  .addImm(MFI->getLDSSize());
3307  MI.eraseFromParent();
3308  return BB;
3309  }
3310  case AMDGPU::SI_INDIRECT_SRC_V1:
3311  case AMDGPU::SI_INDIRECT_SRC_V2:
3312  case AMDGPU::SI_INDIRECT_SRC_V4:
3313  case AMDGPU::SI_INDIRECT_SRC_V8:
3314  case AMDGPU::SI_INDIRECT_SRC_V16:
3315  return emitIndirectSrc(MI, *BB, *getSubtarget());
3316  case AMDGPU::SI_INDIRECT_DST_V1:
3317  case AMDGPU::SI_INDIRECT_DST_V2:
3318  case AMDGPU::SI_INDIRECT_DST_V4:
3319  case AMDGPU::SI_INDIRECT_DST_V8:
3320  case AMDGPU::SI_INDIRECT_DST_V16:
3321  return emitIndirectDst(MI, *BB, *getSubtarget());
3322  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3323  case AMDGPU::SI_KILL_I1_PSEUDO:
3324  return splitKillBlock(MI, BB);
3325  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3326  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3327 
3328  unsigned Dst = MI.getOperand(0).getReg();
3329  unsigned Src0 = MI.getOperand(1).getReg();
3330  unsigned Src1 = MI.getOperand(2).getReg();
3331  const DebugLoc &DL = MI.getDebugLoc();
3332  unsigned SrcCond = MI.getOperand(3).getReg();
3333 
3334  unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3335  unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3336  unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3337 
3338  BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3339  .addReg(SrcCond);
3340  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3341  .addReg(Src0, 0, AMDGPU::sub0)
3342  .addReg(Src1, 0, AMDGPU::sub0)
3343  .addReg(SrcCondCopy);
3344  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3345  .addReg(Src0, 0, AMDGPU::sub1)
3346  .addReg(Src1, 0, AMDGPU::sub1)
3347  .addReg(SrcCondCopy);
3348 
3349  BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3350  .addReg(DstLo)
3351  .addImm(AMDGPU::sub0)
3352  .addReg(DstHi)
3353  .addImm(AMDGPU::sub1);
3354  MI.eraseFromParent();
3355  return BB;
3356  }
3357  case AMDGPU::SI_BR_UNDEF: {
3358  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3359  const DebugLoc &DL = MI.getDebugLoc();
3360  MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3361  .add(MI.getOperand(0));
3362  Br->getOperand(1).setIsUndef(true); // read undef SCC
3363  MI.eraseFromParent();
3364  return BB;
3365  }
3366  case AMDGPU::ADJCALLSTACKUP:
3367  case AMDGPU::ADJCALLSTACKDOWN: {
3368  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3369  MachineInstrBuilder MIB(*MF, &MI);
3370 
3371  // Add an implicit use of the frame offset reg to prevent the restore copy
3372  // inserted after the call from being reordered after stack operations in
3373  // the caller's frame.
3374  MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
3375  .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3376  .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
3377  return BB;
3378  }
3379  case AMDGPU::SI_CALL_ISEL:
3380  case AMDGPU::SI_TCRETURN_ISEL: {
3381  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3382  const DebugLoc &DL = MI.getDebugLoc();
3383  unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
3384 
3385  MachineRegisterInfo &MRI = MF->getRegInfo();
3386  unsigned GlobalAddrReg = MI.getOperand(0).getReg();
3387  MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
3388  assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
3389 
3390  const GlobalValue *G = PCRel->getOperand(1).getGlobal();
3391 
3392  MachineInstrBuilder MIB;
3393  if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
3394  MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
3395  .add(MI.getOperand(0))
3396  .addGlobalAddress(G);
3397  } else {
3398  MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
3399  .add(MI.getOperand(0))
3400  .addGlobalAddress(G);
3401 
3402  // There is an additional imm operand for tcreturn, but it should be in the
3403  // right place already.
3404  }
3405 
3406  for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3407  MIB.add(MI.getOperand(I));
3408 
3409  MIB.cloneMemRefs(MI);
3410  MI.eraseFromParent();
3411  return BB;
3412  }
3413  default:
3414  return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
3415  }
3416 }
3417 
3418 bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
3419  return isTypeLegal(VT.getScalarType());
3420 }
3421 
3422 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
3423  // This currently forces unfolding various combinations of fsub into fma with
3424  // free fneg'd operands. As long as we have fast FMA (controlled by
3425  // isFMAFasterThanFMulAndFAdd), we should perform these.
3426 
3427  // When fma is quarter rate, for f64 where add / sub are at best half rate,
3428  // most of these combines appear to be cycle neutral but save on instruction
3429  // count / code size.
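  // For example, (fsub (fmul a, b), c) can be unfolded into (fma a, b, (fneg c)),
  // where the fneg becomes free as a source modifier.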
3430  return true;
3431 }
3432 
3433 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
3434  EVT VT) const {
3435  if (!VT.isVector()) {
3436  return MVT::i1;
3437  }
3438  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
3439 }
3440 
3441 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
3442  // TODO: Should i16 be used always if legal? For now it would force VALU
3443  // shifts.
3444  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
3445 }
3446 
3447 // Answering this is somewhat tricky and depends on the specific device, since
3448 // different devices have different rates for fma or all f64 operations.
3449 //
3450 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3451 // regardless of which device (although the number of cycles differs between
3452 // devices), so it is always profitable for f64.
3453 //
3454 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3455 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
3456 // which we can always do even without fused FP ops since it returns the same
3457 // result as the separate operations and since it is always full
3458 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3459 // however does not support denormals, so we do report fma as faster if we have
3460 // a fast fma device and require denormals.
3461 //
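// For example, on a subtarget with full-rate FMA and FP32 denormals enabled,
// fusing (fmul a, b) + c into v_fma_f32 is reported as profitable; with
// denormals disabled we report false and prefer v_mad_f32 instead.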
3462 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3463  VT = VT.getScalarType();
3464 
3465  switch (VT.getSimpleVT().SimpleTy) {
3466  case MVT::f32: {
3467  // This is as fast on some subtargets. However, we always have full rate f32
3468  // mad available which returns the same result as the separate operations
3469  // which we should prefer over fma. We can't use this if we want to support
3470  // denormals, so only report this in these cases.
3471  if (Subtarget->hasFP32Denormals())
3472  return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
3473 
3474  // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3475  return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
3476  }
3477  case MVT::f64:
3478  return true;
3479  case MVT::f16:
3480  return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
3481  default:
3482  break;
3483  }
3484 
3485  return false;
3486 }
3487 
3488 //===----------------------------------------------------------------------===//
3489 // Custom DAG Lowering Operations
3490 //===----------------------------------------------------------------------===//
3491 
3492 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3493 // wider vector type is legal.
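// For example, an FNEG of v4f16 is split into two v2f16 FNEGs on the low and
// high halves whose results are concatenated, instead of being scalarized into
// four f16 operations.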
3494 SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
3495  SelectionDAG &DAG) const {
3496  unsigned Opc = Op.getOpcode();
3497  EVT VT = Op.getValueType();
3498  assert(VT == MVT::v4f16);
3499 
3500  SDValue Lo, Hi;
3501  std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
3502 
3503  SDLoc SL(Op);
3504  SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
3505  Op->getFlags());
3506  SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
3507  Op->getFlags());
3508 
3509  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3510 }
3511 
3512 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3513 // wider vector type is legal.
3514 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
3515  SelectionDAG &DAG) const {
3516  unsigned Opc = Op.getOpcode();
3517  EVT VT = Op.getValueType();
3518  assert(VT == MVT::v4i16 || VT == MVT::v4f16);
3519 
3520  SDValue Lo0, Hi0;
3521  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
3522  SDValue Lo1, Hi1;
3523  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
3524 
3525  SDLoc SL(Op);
3526 
3527  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
3528  Op->getFlags());
3529  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
3530  Op->getFlags());
3531 
3532  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3533 }
3534 
3535 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3536  switch (Op.getOpcode()) {
3537  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
3538  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
3539  case ISD::LOAD: {
3540  SDValue Result = LowerLOAD(Op, DAG);
3541  assert((!Result.getNode() ||
3542  Result.getNode()->getNumValues() == 2) &&
3543  "Load should return a value and a chain");
3544  return Result;
3545  }
3546 
3547  case ISD::FSIN:
3548  case ISD::FCOS:
3549  return LowerTrig(Op, DAG);
3550  case ISD::SELECT: return LowerSELECT(Op, DAG);
3551  case ISD::FDIV: return LowerFDIV(Op, DAG);
3552  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
3553  case ISD::STORE: return LowerSTORE(Op, DAG);
3554  case ISD::GlobalAddress: {
3555  MachineFunction &MF = DAG.getMachineFunction();
3556  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3557  return LowerGlobalAddress(MFI, Op, DAG);
3558  }
3559  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3560  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
3561  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
3562  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
3563  case ISD::INSERT_VECTOR_ELT:
3564  return lowerINSERT_VECTOR_ELT(Op, DAG);
3565  case ISD::EXTRACT_VECTOR_ELT:
3566  return lowerEXTRACT_VECTOR_ELT(Op, DAG);
3567  case ISD::BUILD_VECTOR:
3568  return lowerBUILD_VECTOR(Op, DAG);
3569  case ISD::FP_ROUND:
3570  return lowerFP_ROUND(Op, DAG);
3571  case ISD::TRAP:
3572  return lowerTRAP(Op, DAG);
3573  case ISD::DEBUGTRAP:
3574  return lowerDEBUGTRAP(Op, DAG);
3575  case ISD::FABS:
3576  case ISD::FNEG:
3577  case ISD::FCANONICALIZE:
3578  return splitUnaryVectorOp(Op, DAG);
3579  case ISD::SHL:
3580  case ISD::SRA:
3581  case ISD::SRL:
3582  case ISD::ADD:
3583  case ISD::SUB:
3584  case ISD::MUL:
3585  case ISD::SMIN:
3586  case ISD::SMAX:
3587  case ISD::UMIN:
3588  case ISD::UMAX:
3589  case ISD::FMINNUM:
3590  case ISD::FMAXNUM:
3591  case ISD::FADD:
3592  case ISD::FMUL:
3593  return splitBinaryVectorOp(Op, DAG);
3594  }
3595  return SDValue();
3596 }
3597 
3598 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
3599  const SDLoc &DL,
3600  SelectionDAG &DAG, bool Unpacked) {
3601  if (!LoadVT.isVector())
3602  return Result;
3603 
3604  if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3605  // Truncate to v2i16/v4i16.
3606  EVT IntLoadVT = LoadVT.changeTypeToInteger();
3607 
3608  // Workaround legalizer not scalarizing truncate after vector op
3609  // legalization but not creating intermediate vector trunc.
3610  SmallVector<SDValue, 4> Elts;
3611  DAG.ExtractVectorElements(Result, Elts);
3612  for (SDValue &Elt : Elts)
3613  Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3614 
3615  Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3616 
3617  // Bitcast to original type (v2f16/v4f16).
3618  return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3619  }
3620 
3621  // Cast back to the original packed type.
3622  return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3623 }
3624 
3625 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3626  MemSDNode *M,
3627  SelectionDAG &DAG,
3628  ArrayRef<SDValue> Ops,
3629  bool IsIntrinsic) const {
3630  SDLoc DL(M);
3631 
3632  bool Unpacked = Subtarget->hasUnpackedD16VMem();
3633  EVT LoadVT = M->getValueType(0);
3634 
3635  EVT EquivLoadVT = LoadVT;
3636  if (Unpacked && LoadVT.isVector()) {
3637  EquivLoadVT = LoadVT.isVector() ?
3638  EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3639  LoadVT.getVectorNumElements()) : LoadVT;
3640  }
3641 
3642  // Change from v4f16/v2f16 to EquivLoadVT.
3643  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3644 
3645  SDValue Load
3646  = DAG.getMemIntrinsicNode(
3647  IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3648  VTList, Ops, M->getMemoryVT(),
3649  M->getMemOperand());
3650  if (!Unpacked) // Just adjusted the opcode.
3651  return Load;
3652 
3653  SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
3654 
3655  return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
3656 }
3657 
3658 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
3659  SDNode *N, SelectionDAG &DAG) {
3660  EVT VT = N->getValueType(0);
3661  const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
3662  if (!CD)
3663  return DAG.getUNDEF(VT);
3664 
3665  int CondCode = CD->getSExtValue();
3666  if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
3667  CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
3668  return DAG.getUNDEF(VT);
3669 
3670  ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
3671 
3672 
3673  SDValue LHS = N->getOperand(1);
3674  SDValue RHS = N->getOperand(2);
3675 
3676  SDLoc DL(N);
3677 
3678  EVT CmpVT = LHS.getValueType();
3679  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
3680  unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
3681  ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3682  LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
3683  RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
3684  }
3685 
3686  ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
3687 
3688  return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
3689  DAG.getCondCode(CCOpcode));
3690 }
3691 
3692 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
3693  SDNode *N, SelectionDAG &DAG) {
3694  EVT VT = N->getValueType(0);
3695  const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
3696  if (!CD)
3697  return DAG.getUNDEF(VT);
3698 
3699  int CondCode = CD->getSExtValue();
3700  if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
3701  CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
3702  return DAG.getUNDEF(VT);
3703  }
3704 
3705  SDValue Src0 = N->getOperand(1);
3706  SDValue Src1 = N->getOperand(2);
3707  EVT CmpVT = Src0.getValueType();
3708  SDLoc SL(N);
3709 
3710  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
3711  Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
3712  Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
3713  }
3714 
3715  FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
3716  ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
3717  return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
3718  Src1, DAG.getCondCode(CCOpcode));
3719 }
3720 
3721 void SITargetLowering::ReplaceNodeResults(SDNode *N,
3722  SmallVectorImpl<SDValue> &Results,
3723  SelectionDAG &DAG) const {
3724  switch (N->getOpcode()) {
3725  case ISD::INSERT_VECTOR_ELT: {
3726  if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3727  Results.push_back(Res);
3728  return;
3729  }
3730  case ISD::EXTRACT_VECTOR_ELT: {
3731  if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3732  Results.push_back(Res);
3733  return;
3734  }
3735  case ISD::INTRINSIC_WO_CHAIN: {
3736  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3737  switch (IID) {
3738  case Intrinsic::amdgcn_cvt_pkrtz: {
3739  SDValue Src0 = N->getOperand(1);
3740  SDValue Src1 = N->getOperand(2);
3741  SDLoc SL(N);
3742  SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3743  Src0, Src1);
3744  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3745  return;
3746  }
3747  case Intrinsic::amdgcn_cvt_pknorm_i16:
3748  case Intrinsic::amdgcn_cvt_pknorm_u16:
3749  case Intrinsic::amdgcn_cvt_pk_i16:
3750  case Intrinsic::amdgcn_cvt_pk_u16: {
3751  SDValue Src0 = N->getOperand(1);
3752  SDValue Src1 = N->getOperand(2);
3753  SDLoc SL(N);
3754  unsigned Opcode;
3755 
3756  if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3757  Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
3758  else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3759  Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
3760  else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3761  Opcode = AMDGPUISD::CVT_PK_I16_I32;
3762  else
3763  Opcode = AMDGPUISD::CVT_PK_U16_U32;
3764 
3765  EVT VT = N->getValueType(0);
3766  if (isTypeLegal(VT))
3767  Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
3768  else {
3769  SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3770  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3771  }
3772  return;
3773  }
3774  }
3775  break;
3776  }
3777  case ISD::INTRINSIC_W_CHAIN: {
3778  if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
3779  Results.push_back(Res);
3780  Results.push_back(Res.getValue(1));
3781  return;
3782  }
3783 
3784  break;
3785  }
3786  case ISD::SELECT: {
3787  SDLoc SL(N);
3788  EVT VT = N->getValueType(0);
3789  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3790  SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3791  SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3792 
3793  EVT SelectVT = NewVT;
3794  if (NewVT.bitsLT(MVT::i32)) {
3795  LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3796  RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3797  SelectVT = MVT::i32;
3798  }
3799 
3800  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3801  N->getOperand(0), LHS, RHS);
3802 
3803  if (NewVT != SelectVT)
3804  NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3805  Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3806  return;
3807  }
3808  case ISD::FNEG: {
3809  if (N->getValueType(0) != MVT::v2f16)
3810  break;
3811 
3812  SDLoc SL(N);
3813  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3814 
3815  SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
3816  BC,
3817  DAG.getConstant(0x80008000, SL, MVT::i32));
3818  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3819  return;
3820  }
3821  case ISD::FABS: {
3822  if (N->getValueType(0) != MVT::v2f16)
3823  break;
3824 
3825  SDLoc SL(N);
3826  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3827 
3828  SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
3829  BC,
3830  DAG.getConstant(0x7fff7fff, SL, MVT::i32));
3831  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3832  return;
3833  }
3834  default:
3835  break;
3836  }
3837 }
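// Illustrative sketch of the v2f16 FNEG/FABS cases above, reduced to plain
// 32-bit integer arithmetic (the constants are the ones used in the code):
//
//   fneg v2f16:  packed ^ 0x80008000   // flip the sign bit of each half
//   fabs v2f16:  packed & 0x7fff7fff   // clear the sign bit of each half
//
// e.g. the packed pair <1.0, -2.0> = 0xC0003C00 maps to 0x4000BC00 under
// FNEG; exponent and mantissa bits of both halves are untouched.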
3838 
3839 /// Helper function for LowerBRCOND
3840 static SDNode *findUser(SDValue Value, unsigned Opcode) {
3841 
3842  SDNode *Parent = Value.getNode();
3843  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3844  I != E; ++I) {
3845 
3846  if (I.getUse().get() != Value)
3847  continue;
3848 
3849  if (I->getOpcode() == Opcode)
3850  return *I;
3851  }
3852  return nullptr;
3853 }
3854 
3855 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
3856  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
3857  switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
3858  case Intrinsic::amdgcn_if:
3859  return AMDGPUISD::IF;
3860  case Intrinsic::amdgcn_else:
3861  return AMDGPUISD::ELSE;
3862  case Intrinsic::amdgcn_loop:
3863  return AMDGPUISD::LOOP;
3864  case Intrinsic::amdgcn_end_cf:
3865  llvm_unreachable("should not occur");
3866  default:
3867  return 0;
3868  }
3869  }
3870 
3871  // break, if_break, else_break are all only used as inputs to loop, not
3872  // directly as branch conditions.
3873  return 0;
3874 }
3875 
3876 void SITargetLowering::createDebuggerPrologueStackObjects(
3877  MachineFunction &MF) const {
3878  // Create stack objects that are used for emitting debugger prologue.
3879  //
3880  // Debugger prologue writes work group IDs and work item IDs to scratch memory
3881  // at a fixed location in the following format:
3882  // offset 0: work group ID x
3883  // offset 4: work group ID y
3884  // offset 8: work group ID z
3885  // offset 16: work item ID x
3886  // offset 20: work item ID y
3887  // offset 24: work item ID z
3888  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3889  int ObjectIdx = 0;
3890 
3891  // For each dimension:
3892  for (unsigned i = 0; i < 3; ++i) {
3893  // Create fixed stack object for work group ID.
3894  ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
3895  Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
3896  // Create fixed stack object for work item ID.
3897  ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
3898  Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
3899  }
3900 }
3901 
3902 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
3903  const Triple &TT = getTargetMachine().getTargetTriple();
3904  return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3905  GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
3906  AMDGPU::shouldEmitConstantsToTextSection(TT);
3907 }
3908 
3909 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
3910  return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
3911  GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3912  GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
3913  !shouldEmitFixup(GV) &&
3914  !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
3915 }
3916 
3917 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
3918  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
3919 }
3920 
3921 /// This transforms the control flow intrinsics to get the branch destination as
3922 /// last parameter; it also switches the branch target with BR if the need arises.
3923 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
3924  SelectionDAG &DAG) const {
3925  SDLoc DL(BRCOND);
3926 
3927  SDNode *Intr = BRCOND.getOperand(1).getNode();
3928  SDValue Target = BRCOND.getOperand(2);
3929  SDNode *BR = nullptr;
3930  SDNode *SetCC = nullptr;
3931 
3932  if (Intr->getOpcode() == ISD::SETCC) {
3933  // As long as we negate the condition everything is fine
3934  SetCC = Intr;
3935  Intr = SetCC->getOperand(0).getNode();
3936 
3937  } else {
3938  // Get the target from BR if we don't negate the condition
3939  BR = findUser(BRCOND, ISD::BR);
3940  Target = BR->getOperand(1);
3941  }
3942 
3943  // FIXME: This changes the types of the intrinsics instead of introducing new
3944  // nodes with the correct types.
3945  // e.g. llvm.amdgcn.loop
3946 
3947  // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
3948  // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
3949 
3950  unsigned CFNode = isCFIntrinsic(Intr);
3951  if (CFNode == 0) {
3952  // This is a uniform branch so we don't need to legalize.
3953  return BRCOND;
3954  }
3955 
3956  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
3957  Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
3958 
3959  assert(!SetCC ||
3960  (SetCC->getConstantOperandVal(1) == 1 &&
3961  cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
3962  ISD::SETNE));
3963 
3964  // operands of the new intrinsic call
3965  SmallVector<SDValue, 8> Ops;
3966  if (HaveChain)
3967  Ops.push_back(BRCOND.getOperand(0));
3968 
3969  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
3970  Ops.push_back(Target);
3971 
3972  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
3973 
3974  // build the new intrinsic call
3975  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
3976 
3977  if (!HaveChain) {
3978  SDValue Ops[] = {
3979  SDValue(Result, 0),
3980  BRCOND.getOperand(0)
3981  };
3982 
3983  Result = DAG.getMergeValues(Ops, DL).getNode();
3984  }
3985 
3986  if (BR) {
3987  // Give the branch instruction our target
3988  SDValue Ops[] = {
3989  BR->getOperand(0),
3990  BRCOND.getOperand(2)
3991  };
3992  SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
3993  DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
3994  BR = NewBR.getNode();
3995  }
3996 
3997  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
3998 
3999  // Copy the intrinsic results to registers
4000  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
4001  SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
4002  if (!CopyToReg)
4003  continue;
4004 
4005  Chain = DAG.getCopyToReg(
4006  Chain, DL,
4007  CopyToReg->getOperand(1),
4008  SDValue(Result, i - 1),
4009  SDValue());
4010 
4011  DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
4012  }
4013 
4014  // Remove the old intrinsic from the chain
4015  DAG.ReplaceAllUsesOfValueWith(
4016  SDValue(Intr, Intr->getNumValues() - 1),
4017  Intr->getOperand(0));
4018 
4019  return Chain;
4020 }
4021 
4022 SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
4023  SDValue Op,
4024  const SDLoc &DL,
4025  EVT VT) const {
4026  return Op.getValueType().bitsLE(VT) ?
4027  DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
4028  DAG.getNode(ISD::FTRUNC, DL, VT, Op);
4029 }
4030 
4031 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
4032  assert(Op.getValueType() == MVT::f16 &&
4033  "Do not know how to custom lower FP_ROUND for non-f16 type");
4034 
4035  SDValue Src = Op.getOperand(0);
4036  EVT SrcVT = Src.getValueType();
4037  if (SrcVT != MVT::f64)
4038  return Op;
4039 
4040  SDLoc DL(Op);
4041 
4042  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
4043  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
4044  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
4045 }
4046 
4047 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
4048  SDLoc SL(Op);
4049  SDValue Chain = Op.getOperand(0);
4050 
4051  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4052  !Subtarget->isTrapHandlerEnabled())
4053  return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
4054 
4055  MachineFunction &MF = DAG.getMachineFunction();
4056  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4057  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4058  assert(UserSGPR != AMDGPU::NoRegister);
4059  SDValue QueuePtr = CreateLiveInRegister(
4060  DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4061  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
4062  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
4063  QueuePtr, SDValue());
4064  SDValue Ops[] = {
4065  ToReg,
4066  DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
4067  SGPR01,
4068  ToReg.getValue(1)
4069  };
4070  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4071 }
4072 
4073 SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
4074  SDLoc SL(Op);
4075  SDValue Chain = Op.getOperand(0);
4076  MachineFunction &MF = DAG.getMachineFunction();
4077 
4078  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4079  !Subtarget->isTrapHandlerEnabled()) {
4080  DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
4081  "debugtrap handler not supported",
4082  Op.getDebugLoc(),
4083  DS_Warning);
4084  LLVMContext &Ctx = MF.getFunction().getContext();
4085  Ctx.diagnose(NoTrap);
4086  return Chain;
4087  }
4088 
4089  SDValue Ops[] = {
4090  Chain,
4091  DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
4092  };
4093  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4094 }
4095 
4096 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
4097  SelectionDAG &DAG) const {
4098  // FIXME: Use inline constants (src_{shared, private}_base) instead.
4099  if (Subtarget->hasApertureRegs()) {
4100  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
4101  AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
4102  AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
4103  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
4104  AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
4105  AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
4106  unsigned Encoding =
4107  AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
4108  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
4109  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
4110 
4111  SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
4112  SDValue ApertureReg = SDValue(
4113  DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
4114  SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
4115  return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
4116  }
4117 
4118  MachineFunction &MF = DAG.getMachineFunction();
4119  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4120  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4121  assert(UserSGPR != AMDGPU::NoRegister);
4122 
4123  SDValue QueuePtr = CreateLiveInRegister(
4124  DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4125 
4126  // Offset into amd_queue_t for group_segment_aperture_base_hi /
4127  // private_segment_aperture_base_hi.
4128  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
4129 
4130  SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
4131 
4132  // TODO: Use custom target PseudoSourceValue.
4133  // TODO: We should use the value from the IR intrinsic call, but it might not
4134  // be available and how do we get it?
4135  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
4136  AMDGPUAS::CONSTANT_ADDRESS));
4137 
4138  MachinePointerInfo PtrInfo(V, StructOffset);
4139  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
4140  MinAlign(64, StructOffset),
4141  MachineMemOperand::MODereferenceable |
4142  MachineMemOperand::MOInvariant);
4143 }
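// Illustrative summary of the two paths above (a sketch, not additional
// lowering): with aperture registers the 32-bit aperture base is recovered as
//
//   base = s_getreg_b32(<mem-bases hwreg field>) << (WidthM1 + 1)
//
// i.e. the hardware register holds the base right-shifted by its field width.
// Without aperture registers the same value is loaded from amd_queue_t through
// the queue pointer, at byte offset 0x40 (group segment) or 0x44 (private
// segment).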
4144 
4145 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4146  SelectionDAG &DAG) const {
4147  SDLoc SL(Op);
4148  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4149 
4150  SDValue Src = ASC->getOperand(0);
4151  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4152 
4153  const AMDGPUTargetMachine &TM =
4154  static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4155 
4156  // flat -> local/private
4157  if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4158  unsigned DestAS = ASC->getDestAddressSpace();
4159 
4160  if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
4161  DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
4162  unsigned NullVal = TM.getNullPointerValue(DestAS);
4163  SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4164  SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4165  SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4166 
4167  return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4168  NonNull, Ptr, SegmentNullPtr);
4169  }
4170  }
4171 
4172  // local/private -> flat
4173  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4174  unsigned SrcAS = ASC->getSrcAddressSpace();
4175 
4176  if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
4177  SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
4178  unsigned NullVal = TM.getNullPointerValue(SrcAS);
4179  SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4180 
4181  SDValue NonNull
4182  = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4183 
4184  SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
4185  SDValue CvtPtr
4186  = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4187 
4188  return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4189  DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4190  FlatNullPtr);
4191  }
4192  }
4193 
4194  // global <-> flat are no-ops and never emitted.
4195 
4196  const MachineFunction &MF = DAG.getMachineFunction();
4197  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
4198  MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
4199  DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4200 
4201  return DAG.getUNDEF(ASC->getValueType(0));
4202 }
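// Worked example (illustrative, for a 32-bit LOCAL pointer P and the aperture
// base A produced by getSegmentAperture):
//
//   local -> flat : result = (P == local_null) ? flat_null : pair(P, A)
//                   P lands in the low 32 bits and A in the high 32 bits once
//                   the v2i32 BUILD_VECTOR is bitcast to i64.
//   flat  -> local: result = (F == flat_null) ? local_null : trunc(F) to i32
//
// Casts between global and flat are address-preserving and never reach here.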
4203 
4204 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4205  SelectionDAG &DAG) const {
4206  SDValue Vec = Op.getOperand(0);
4207  SDValue InsVal = Op.getOperand(1);
4208  SDValue Idx = Op.getOperand(2);
4209  EVT VecVT = Vec.getValueType();
4210  EVT EltVT = VecVT.getVectorElementType();
4211  unsigned VecSize = VecVT.getSizeInBits();
4212  unsigned EltSize = EltVT.getSizeInBits();
4213 
4214 
4215  assert(VecSize <= 64);
4216 
4217  unsigned NumElts = VecVT.getVectorNumElements();
4218  SDLoc SL(Op);
4219  auto KIdx = dyn_cast<ConstantSDNode>(Idx);
4220 
4221  if (NumElts == 4 && EltSize == 16 && KIdx) {
4222  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
4223 
4224  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4225  DAG.getConstant(0, SL, MVT::i32));
4226  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4227  DAG.getConstant(1, SL, MVT::i32));
4228 
4229  SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
4230  SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
4231 
4232  unsigned Idx = KIdx->getZExtValue();
4233  bool InsertLo = Idx < 2;
4234  SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
4235  InsertLo ? LoVec : HiVec,
4236  DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
4237  DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
4238 
4239  InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
4240 
4241  SDValue Concat = InsertLo ?
4242  DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
4243  DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
4244 
4245  return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
4246  }
4247 
4248  if (isa<ConstantSDNode>(Idx))
4249  return SDValue();
4250 
4251  MVT IntVT = MVT::getIntegerVT(VecSize);
4252 
4253  // Avoid stack access for dynamic indexing.
4254  SDValue Val = InsVal;
4255  if (InsVal.getValueType() == MVT::f16)
4256  Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
4257 
4258  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4259  SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
4260 
4261  assert(isPowerOf2_32(EltSize));
4262  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4263 
4264  // Convert vector index to bit-index.
4265  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4266 
4267  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4268  SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
4269  DAG.getConstant(0xffff, SL, IntVT),
4270  ScaledIdx);
4271 
4272  SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
4273  SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
4274  DAG.getNOT(SL, BFM, IntVT), BCVec);
4275 
4276  SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
4277  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
4278 }
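// Worked mask arithmetic for the dynamic path above (illustrative): with
// 16-bit elements, ScaleFactor = Log2(16) = 4, so
//
//   ScaledIdx = Idx << 4             // element index -> bit index
//   BFM       = 0xffff << ScaledIdx  // selects the target element's bits
//
// and the node built is (BFM & ExtVal) | (~BFM & BCVec), mirroring the
// v_bfm_b32 / v_bfi_b32 pattern named in the comment. For Idx = 1 on a v2i16
// vector, BFM = 0xffff0000.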
4279 
4280 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4281  SelectionDAG &DAG) const {
4282  SDLoc SL(Op);
4283 
4284  EVT ResultVT = Op.getValueType();
4285  SDValue Vec = Op.getOperand(0);
4286  SDValue Idx = Op.getOperand(1);
4287  EVT VecVT = Vec.getValueType();
4288  unsigned VecSize = VecVT.getSizeInBits();
4289  EVT EltVT = VecVT.getVectorElementType();
4290  assert(VecSize <= 64);
4291 
4292  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4293 
4294  // Make sure we do any optimizations that will make it easier to fold
4295  // source modifiers before obscuring it with bit operations.
4296 
4297  // XXX - Why doesn't this get called when vector_shuffle is expanded?
4298  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4299  return Combined;
4300 
4301  unsigned EltSize = EltVT.getSizeInBits();
4302  assert(isPowerOf2_32(EltSize));
4303 
4304  MVT IntVT = MVT::getIntegerVT(VecSize);
4305  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4306 
4307  // Convert vector index to bit-index (* EltSize)
4308  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4309 
4310  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4311  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
4312 
4313  if (ResultVT == MVT::f16) {
4314  SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
4315  return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4316  }
4317 
4318  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
4319 }
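// Worked example (illustrative): a dynamic extract from a v4f16 vector becomes
// a shift on the 64-bit integer image of the vector,
//
//   Elt = bitcast<i64>(Vec) >> (Idx << 4)
//
// followed by TRUNCATE to i16 and a BITCAST back to f16; for Idx = 2 the shift
// amount is 32, so bits [47:32] of the vector provide the result.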
4320 
4321 SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
4322  SelectionDAG &DAG) const {
4323  SDLoc SL(Op);
4324  EVT VT = Op.getValueType();
4325 
4326  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4327  EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
4328 
4329  // Turn into pair of packed build_vectors.
4330  // TODO: Special case for constants that can be materialized with s_mov_b64.
4331  SDValue Lo = DAG.getBuildVector(HalfVT, SL,
4332  { Op.getOperand(0), Op.getOperand(1) });
4333  SDValue Hi = DAG.getBuildVector(HalfVT, SL,
4334  { Op.getOperand(2), Op.getOperand(3) });
4335 
4336  SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
4337  SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
4338 
4339  SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
4340  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
4341  }
4342 
4343  assert(VT == MVT::v2f16 || VT == MVT::v2i16);
4344  assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
4345 
4346  SDValue Lo = Op.getOperand(0);
4347  SDValue Hi = Op.getOperand(1);
4348 
4349  // Avoid adding defined bits with the zero_extend.
4350  if (Hi.isUndef()) {
4351  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4352  SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
4353  return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
4354  }
4355 
4356  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
4357  Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
4358 
4359  SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
4360  DAG.getConstant(16, SL, MVT::i32));
4361  if (Lo.isUndef())
4362  return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
4363 
4364  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4365  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
4366 
4367  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
4368  return DAG.getNode(ISD::BITCAST, SL, VT, Or);
4369 }
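// Worked example (illustrative): without VOP3P instructions a v2i16
// BUILD_VECTOR of fully defined scalars Lo and Hi is packed as
//
//   Packed = zext(Lo to i32) | (zext(Hi to i32) << 16)
//
// so <0x1234, 0xABCD> becomes the i32 0xABCD1234, which is then bitcast back
// to the vector type; the undef checks above skip the unneeded half.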
4370 
4371 bool
4372 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
4373  // We can fold offsets for anything that doesn't require a GOT relocation.
4374  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
4375  GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4376  GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
4377  !shouldEmitGOTReloc(GA->getGlobal());
4378 }
4379 
4380 static SDValue
4381 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
4382  const SDLoc &DL, unsigned Offset, EVT PtrVT,
4383  unsigned GAFlags = SIInstrInfo::MO_NONE) {
4384  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
4385  // lowered to the following code sequence:
4386  //
4387  // For constant address space:
4388  // s_getpc_b64 s[0:1]
4389  // s_add_u32 s0, s0, $symbol
4390  // s_addc_u32 s1, s1, 0
4391  //
4392  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4393  // a fixup or relocation is emitted to replace $symbol with a literal
4394  // constant, which is a pc-relative offset from the encoding of the $symbol
4395  // operand to the global variable.
4396  //
4397  // For global address space:
4398  // s_getpc_b64 s[0:1]
4399  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
4400  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
4401  //
4402  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4403  // fixups or relocations are emitted to replace $symbol@*@lo and
4404  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
4405  // which is a 64-bit pc-relative offset from the encoding of the $symbol
4406  // operand to the global variable.
4407  //
4408  // What we want here is an offset from the value returned by s_getpc
4409  // (which is the address of the s_add_u32 instruction) to the global
4410  // variable, but since the encoding of $symbol starts 4 bytes after the start
4411  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
4412  // small. This requires us to add 4 to the global variable offset in order to
4413  // compute the correct address.
4414  SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4415  GAFlags);
4416  SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4417  GAFlags == SIInstrInfo::MO_NONE ?
4418  GAFlags : GAFlags + 1);
4419  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
4420 }
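// Worked arithmetic for the "+ 4" above (illustrative): let P be the address
// of the s_add_u32 returned by s_getpc_b64. The $symbol literal starts 4 bytes
// into that instruction, so the relocated constant is GV - (P + 4), and
//
//   P + (GV - (P + 4)) = GV - 4.
//
// Emitting the target global address with Offset + 4 cancels that bias, so the
// add pair produces the true address of GV plus the requested offset.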
4421 
4422 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
4423  SDValue Op,
4424  SelectionDAG &DAG) const {
4425  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
4426  const GlobalValue *GV = GSD->getGlobal();
4427  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
4428  GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
4429  GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
4430  return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
4431 
4432  SDLoc DL(GSD);
4433  EVT PtrVT = Op.getValueType();
4434 
4435  // FIXME: Should not make address space based decisions here.
4436  if (shouldEmitFixup(GV))
4437  return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
4438  else if (shouldEmitPCReloc(GV))
4439  return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
4440  SIInstrInfo::MO_REL32);
4441 
4442  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
4443  SIInstrInfo::MO_GOTPCREL32);
4444 
4445  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
4446  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
4447  const DataLayout &DataLayout = DAG.getDataLayout();
4448  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
4449  MachinePointerInfo PtrInfo
4450  = MachinePointerInfo::getGOT(DAG.getMachineFunction());
4451 
4452  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
4453  MachineMemOperand::MODereferenceable |
4454  MachineMemOperand::MOInvariant);
4455 }
4456 