LLVM  9.0.0svn
SIISelLowering.cpp
1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Custom DAG lowering for SI
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // Provide M_PI.
16 #define _USE_MATH_DEFINES
17 #endif
18 
19 #include "SIISelLowering.h"
20 #include "AMDGPU.h"
21 #include "AMDGPUSubtarget.h"
22 #include "AMDGPUTargetMachine.h"
23 #include "SIDefines.h"
24 #include "SIInstrInfo.h"
25 #include "SIMachineFunctionInfo.h"
26 #include "SIRegisterInfo.h"
28 #include "Utils/AMDGPUBaseInfo.h"
29 #include "llvm/ADT/APFloat.h"
30 #include "llvm/ADT/APInt.h"
31 #include "llvm/ADT/ArrayRef.h"
32 #include "llvm/ADT/BitVector.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
35 #include "llvm/ADT/StringRef.h"
36 #include "llvm/ADT/StringSwitch.h"
37 #include "llvm/ADT/Twine.h"
38 #include "llvm/CodeGen/Analysis.h"
56 #include "llvm/IR/Constants.h"
57 #include "llvm/IR/DataLayout.h"
58 #include "llvm/IR/DebugLoc.h"
59 #include "llvm/IR/DerivedTypes.h"
60 #include "llvm/IR/DiagnosticInfo.h"
61 #include "llvm/IR/Function.h"
62 #include "llvm/IR/GlobalValue.h"
63 #include "llvm/IR/InstrTypes.h"
64 #include "llvm/IR/Instruction.h"
65 #include "llvm/IR/Instructions.h"
66 #include "llvm/IR/IntrinsicInst.h"
67 #include "llvm/IR/Type.h"
68 #include "llvm/Support/Casting.h"
69 #include "llvm/Support/CodeGen.h"
71 #include "llvm/Support/Compiler.h"
73 #include "llvm/Support/KnownBits.h"
77 #include <cassert>
78 #include <cmath>
79 #include <cstdint>
80 #include <iterator>
81 #include <tuple>
82 #include <utility>
83 #include <vector>
84 
85 using namespace llvm;
86 
87 #define DEBUG_TYPE "si-lower"
88 
89 STATISTIC(NumTailCalls, "Number of tail calls");
90 
91 static cl::opt<bool> EnableVGPRIndexMode(
92  "amdgpu-vgpr-index-mode",
93  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
94  cl::init(false));
95 
96 static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
97  "amdgpu-frame-index-zero-bits",
98  cl::desc("High bits of frame index assumed to be zero"),
99  cl::init(5),
100  cl::ReallyHidden);
101 
102 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
103  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
104  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
105  if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
106  return AMDGPU::SGPR0 + Reg;
107  }
108  }
109  llvm_unreachable("Cannot allocate sgpr");
110 }
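// For example, if CCInfo has already allocated SGPR0..SGPR3 to other inputs,
// the loop above returns AMDGPU::SGPR0 + 4 (i.e. SGPR4). This relies on the
// SGPR enum values being consecutive, which the loop already assumes.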
111 
112 SITargetLowering::SITargetLowering(const TargetMachine &TM,
113  const GCNSubtarget &STI)
114  : AMDGPUTargetLowering(TM, STI),
115  Subtarget(&STI) {
116  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
117  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
118 
119  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
120  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
121 
122  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
123  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
124  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
125 
126  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
127  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
128 
129  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
130  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
131 
132  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
133  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
134 
135  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
136  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
137 
138  if (Subtarget->has16BitInsts()) {
139  addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
140  addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
141 
142  // Unless there are also VOP3P operations, these operations are not really legal.
143  addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
144  addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
145  addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
146  addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
147  }
148 
150 
151  // We need to custom lower vector stores from local memory
158 
165 
176 
179 
184 
190 
195 
198 
206 
214 
219 
224 
231 
234 
237 
241 
242 #if 0
245 #endif
246 
247  // We only support LOAD/STORE and vector manipulation ops for vectors
248  // with > 4 elements.
251  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
252  switch (Op) {
253  case ISD::LOAD:
254  case ISD::STORE:
255  case ISD::BUILD_VECTOR:
256  case ISD::BITCAST:
262  break;
263  case ISD::CONCAT_VECTORS:
265  break;
266  default:
268  break;
269  }
270  }
271  }
272 
274 
275  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
276  // is expanded to avoid having two separate loops in case the index is a VGPR.
277 
278  // Most operations are naturally 32-bit vector operations. We only support
279  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
280  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
283 
286 
289 
292  }
293 
298 
301 
302  // Avoid stack access for these.
303  // TODO: Generalize to more vector types.
308 
314 
318 
323 
324  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
325  // and output demarshalling
328 
329  // We can't return success/failure, only the old value,
330  // let LLVM add the comparison
333 
334  if (Subtarget->hasFlatAddressSpace()) {
337  }
338 
341 
342  // On SI this is s_memtime; on VI it is s_memrealtime.
346 
347  if (Subtarget->has16BitInsts()) {
351  }
352 
353  // v_mad_f32 does not support denormals according to some sources.
354  if (!Subtarget->hasFP32Denormals())
356 
357  if (!Subtarget->hasBFI()) {
358  // fcopysign can be done in a single instruction with BFI.
361  }
362 
363  if (!Subtarget->hasBCNT(32))
365 
366  if (!Subtarget->hasBCNT(64))
368 
369  if (Subtarget->hasFFBH())
371 
372  if (Subtarget->hasFFBL())
374 
375  // We only really have 32-bit BFE instructions (and 16-bit on VI).
376  //
377  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
378  // effort to match them now. We want this to be false for i64 cases when the
379  // extraction isn't restricted to the upper or lower half. Ideally we would
380  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
381  // span the midpoint are probably relatively rare, so don't worry about them
382  // for now.
383  if (Subtarget->hasBFE())
384  setHasExtractBitsInsn(true);
385 
390 
391 
392  // These are really only legal for ieee_mode functions. We should be avoiding
393  // them for functions that don't have ieee_mode enabled, so just say they are
394  // legal.
399 
400 
401  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
405  } else {
410  }
411 
413 
418 
419  if (Subtarget->has16BitInsts()) {
421 
424 
427 
430 
433 
438 
441 
447 
449 
451 
453 
455 
460 
465 
466  // F16 - Constant Actions.
468 
469  // F16 - Load/Store Actions.
474 
475  // F16 - VOP1 Actions.
484 
485  // F16 - VOP2 Actions.
488 
490 
491  // F16 - VOP3 Actions.
493  if (!Subtarget->hasFP16Denormals())
495 
496  for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
497  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
498  switch (Op) {
499  case ISD::LOAD:
500  case ISD::STORE:
501  case ISD::BUILD_VECTOR:
502  case ISD::BITCAST:
508  break;
509  case ISD::CONCAT_VECTORS:
511  break;
512  default:
514  break;
515  }
516  }
517  }
518 
519  // XXX - Do these do anything? Vector constants turn into build_vector.
522 
525 
530 
535 
542 
547 
552 
557 
561 
562  if (!Subtarget->hasVOP3PInsts()) {
565  }
566 
568  // This isn't really legal, but this avoids the legalizer unrolling it (and
569  // allows matching fneg (fabs x) patterns)
571 
576 
579 
582  }
583 
584  if (Subtarget->hasVOP3PInsts()) {
595 
599 
602 
604 
607 
614 
619 
622 
625 
629 
633  }
634 
637 
638  if (Subtarget->has16BitInsts()) {
643  } else {
644  // Legalization hack.
647 
650  }
651 
654  }
655 
682 
683  // All memory operations. Some folding on the pointer operand is done to help
684  // matching the constant offsets in the addressing modes.
702 
704 
705  // SI at least has hardware support for floating point exceptions, but no way
706  // of using or handling them is implemented. They are also optional in OpenCL
707  // (Section 7.3)
709 }
710 
712  return Subtarget;
713 }
714 
715 //===----------------------------------------------------------------------===//
716 // TargetLowering queries
717 //===----------------------------------------------------------------------===//
718 
719 // v_mad_mix* support a conversion from f16 to f32.
720 //
721 // There is only one special case where this is OK to use when denormals are
722 // enabled, which we do not currently handle.
723 bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
724  EVT DestVT, EVT SrcVT) const {
725  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
726  (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
727  DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
728  SrcVT.getScalarType() == MVT::f16;
729 }
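// Illustrative example of what this hook enables: with f32 denormals disabled,
// a pattern like (fma (fpext f16:$x), (fpext f16:$y), f32:$z) can keep the
// extensions folded and be selected to a mixed-precision mad/fma
// (v_mad_mix / v_fma_mix style) instruction. This is a sketch of the intent,
// not a complete description of the selection rules.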
730 
732  // SI has some legal vector types, but no legal vector operations. Say no
733  // shuffles are legal in order to prefer scalarizing some vector operations.
734  return false;
735 }
736 
738  CallingConv::ID CC,
739  EVT VT) const {
740  // TODO: Consider splitting all arguments into 32-bit pieces.
741  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
742  EVT ScalarVT = VT.getScalarType();
743  unsigned Size = ScalarVT.getSizeInBits();
744  if (Size == 32)
745  return ScalarVT.getSimpleVT();
746 
747  if (Size == 64)
748  return MVT::i32;
749 
750  if (Size == 16 && Subtarget->has16BitInsts())
751  return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
752  }
753 
754  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
755 }
756 
758  CallingConv::ID CC,
759  EVT VT) const {
760  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
761  unsigned NumElts = VT.getVectorNumElements();
762  EVT ScalarVT = VT.getScalarType();
763  unsigned Size = ScalarVT.getSizeInBits();
764 
765  if (Size == 32)
766  return NumElts;
767 
768  if (Size == 64)
769  return 2 * NumElts;
770 
771  if (Size == 16 && Subtarget->has16BitInsts())
772  return (VT.getVectorNumElements() + 1) / 2;
773  }
774 
775  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
776 }
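// Worked examples for the two calling-convention hooks above (non-kernel CC):
//  - v2i64: 64-bit scalars are split to i32 registers, so 2 * 2 = 4 registers.
//  - v4f16 with 16-bit insts: register type v2f16, (4 + 1) / 2 = 2 registers.
// These values are derived directly from the code above and are illustrative.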
777 
780  EVT VT, EVT &IntermediateVT,
781  unsigned &NumIntermediates, MVT &RegisterVT) const {
782  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
783  unsigned NumElts = VT.getVectorNumElements();
784  EVT ScalarVT = VT.getScalarType();
785  unsigned Size = ScalarVT.getSizeInBits();
786  if (Size == 32) {
787  RegisterVT = ScalarVT.getSimpleVT();
788  IntermediateVT = RegisterVT;
789  NumIntermediates = NumElts;
790  return NumIntermediates;
791  }
792 
793  if (Size == 64) {
794  RegisterVT = MVT::i32;
795  IntermediateVT = RegisterVT;
796  NumIntermediates = 2 * NumElts;
797  return NumIntermediates;
798  }
799 
800  // FIXME: We should fix the ABI to be the same on targets without 16-bit
801  // support, but unless we can properly handle 3-vectors, it will still be
802  // inconsistent.
803  if (Size == 16 && Subtarget->has16BitInsts()) {
804  RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
805  IntermediateVT = RegisterVT;
806  NumIntermediates = (NumElts + 1) / 2;
807  return NumIntermediates;
808  }
809  }
810 
812  Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
813 }
814 
816  // Only limited forms of aggregate type currently expected.
817  assert(Ty->isStructTy() && "Expected struct type");
818 
819 
820  Type *ElementType = nullptr;
821  unsigned NumElts;
822  if (Ty->getContainedType(0)->isVectorTy()) {
823  VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
824  ElementType = VecComponent->getElementType();
825  NumElts = VecComponent->getNumElements();
826  } else {
827  ElementType = Ty->getContainedType(0);
828  NumElts = 1;
829  }
830 
831  assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");
832 
833  // Calculate the size of the memVT type from the aggregate
834  unsigned Pow2Elts = 0;
835  unsigned ElementSize;
836  switch (ElementType->getTypeID()) {
837  default:
838  llvm_unreachable("Unknown type!");
839  case Type::IntegerTyID:
840  ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
841  break;
842  case Type::HalfTyID:
843  ElementSize = 16;
844  break;
845  case Type::FloatTyID:
846  ElementSize = 32;
847  break;
848  }
849  unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
850  Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
851 
852  return MVT::getVectorVT(MVT::getVT(ElementType, false),
853  Pow2Elts);
854 }
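// For example, an intrinsic returning { <4 x float>, i32 } (data plus a status
// dword, e.g. TFE) gives 4 + 1 = 5 elements, rounded up to the next power of
// two, so the resulting memVT is v8f32. Illustrative only; the exact aggregate
// shapes depend on the intrinsic definitions.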
855 
856 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
857  const CallInst &CI,
858  MachineFunction &MF,
859  unsigned IntrID) const {
860  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
861  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
863  (Intrinsic::ID)IntrID);
864  if (Attr.hasFnAttribute(Attribute::ReadNone))
865  return false;
866 
868 
869  if (RsrcIntr->IsImage) {
870  Info.ptrVal = MFI->getImagePSV(
872  CI.getArgOperand(RsrcIntr->RsrcArg));
873  Info.align = 0;
874  } else {
875  Info.ptrVal = MFI->getBufferPSV(
877  CI.getArgOperand(RsrcIntr->RsrcArg));
878  }
879 
881  if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
883  Info.memVT = MVT::getVT(CI.getType(), true);
884  if (Info.memVT == MVT::Other) {
885  // Some intrinsics return an aggregate type - special case to work out
886  // the correct memVT
887  Info.memVT = memVTFromAggregate(CI.getType());
888  }
890  } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
891  Info.opc = ISD::INTRINSIC_VOID;
892  Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
894  } else {
895  // Atomic
897  Info.memVT = MVT::getVT(CI.getType());
901 
902  // XXX - Should this be volatile without known ordering?
904  }
905  return true;
906  }
907 
908  switch (IntrID) {
909  case Intrinsic::amdgcn_atomic_inc:
910  case Intrinsic::amdgcn_atomic_dec:
911  case Intrinsic::amdgcn_ds_ordered_add:
912  case Intrinsic::amdgcn_ds_ordered_swap:
913  case Intrinsic::amdgcn_ds_fadd:
914  case Intrinsic::amdgcn_ds_fmin:
915  case Intrinsic::amdgcn_ds_fmax: {
917  Info.memVT = MVT::getVT(CI.getType());
918  Info.ptrVal = CI.getOperand(0);
919  Info.align = 0;
921 
922  const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
923  if (!Vol || !Vol->isZero())
925 
926  return true;
927  }
928 
929  default:
930  return false;
931  }
932 }
933 
936  Type *&AccessTy) const {
937  switch (II->getIntrinsicID()) {
938  case Intrinsic::amdgcn_atomic_inc:
939  case Intrinsic::amdgcn_atomic_dec:
940  case Intrinsic::amdgcn_ds_ordered_add:
941  case Intrinsic::amdgcn_ds_ordered_swap:
942  case Intrinsic::amdgcn_ds_fadd:
943  case Intrinsic::amdgcn_ds_fmin:
944  case Intrinsic::amdgcn_ds_fmax: {
945  Value *Ptr = II->getArgOperand(0);
946  AccessTy = II->getType();
947  Ops.push_back(Ptr);
948  return true;
949  }
950  default:
951  return false;
952  }
953 }
954 
955 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
956  if (!Subtarget->hasFlatInstOffsets()) {
957  // Flat instructions do not have offsets, and only have the register
958  // address.
959  return AM.BaseOffs == 0 && AM.Scale == 0;
960  }
961 
962  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
963  // the sign bit is ignored and is treated as a 12-bit unsigned offset.
964 
965  // Just r + i
966  return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
967 }
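// Example: with flat instruction offsets, an access at (base + 100) is legal
// since 100 fits the 12-bit unsigned immediate, while (base + 5000) is not and
// the offset must be materialized separately. Without flat offsets, only a
// plain register address (BaseOffs == 0, Scale == 0) is accepted.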
968 
969 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
970  if (Subtarget->hasFlatGlobalInsts())
971  return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
972 
973  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
974  // Assume that we will use FLAT for all global memory accesses
975  // on VI.
976  // FIXME: This assumption is currently wrong. On VI we still use
977  // MUBUF instructions for the r + i addressing mode. As currently
978  // implemented, the MUBUF instructions only work on buffer < 4GB.
979  // It may be possible to support > 4GB buffers with MUBUF instructions,
980  // by setting the stride value in the resource descriptor which would
981  // increase the size limit to (stride * 4GB). However, this is risky,
982  // because it has never been validated.
983  return isLegalFlatAddressingMode(AM);
984  }
985 
986  return isLegalMUBUFAddressingMode(AM);
987 }
988 
989 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
990  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
991  // additionally can do r + r + i with addr64. 32-bit has more addressing
992  // mode options. Depending on the resource constant, it can also do
993  // (i64 r0) + (i32 r1) * (i14 i).
994  //
995  // Private arrays end up using a scratch buffer most of the time, so also
996  // assume those use MUBUF instructions. Scratch loads / stores are currently
997  // implemented as mubuf instructions with offen bit set, so slightly
998  // different than the normal addr64.
999  if (!isUInt<12>(AM.BaseOffs))
1000  return false;
1001 
1002  // FIXME: Since we can split immediate into soffset and immediate offset,
1003  // would it make sense to allow any immediate?
1004 
1005  switch (AM.Scale) {
1006  case 0: // r + i or just i, depending on HasBaseReg.
1007  return true;
1008  case 1:
1009  return true; // We have r + r or r + i.
1010  case 2:
1011  if (AM.HasBaseReg) {
1012  // Reject 2 * r + r.
1013  return false;
1014  }
1015 
1016  // Allow 2 * r as r + r
1017  // Or 2 * r + i is allowed as r + r + i.
1018  return true;
1019  default: // Don't allow n * r
1020  return false;
1021  }
1022 }
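// Example: {BaseReg, BaseOffs = 16, Scale = 1} (r + r + 16) is accepted, and
// {BaseOffs = 16, Scale = 2, no base reg} is accepted by treating 2 * r as
// r + r, but {Scale = 2, HasBaseReg} (2 * r + r) is rejected above.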
1023 
1024 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1025  const AddrMode &AM, Type *Ty,
1026  unsigned AS, Instruction *I) const {
1027  // No global is ever allowed as a base.
1028  if (AM.BaseGV)
1029  return false;
1030 
1031  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1032  return isLegalGlobalAddressingMode(AM);
1033 
1034  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1036  // If the offset isn't a multiple of 4, it probably isn't going to be
1037  // correctly aligned.
1038  // FIXME: Can we get the real alignment here?
1039  if (AM.BaseOffs % 4 != 0)
1040  return isLegalMUBUFAddressingMode(AM);
1041 
1042  // There are no SMRD extloads, so if we have to do a small type access we
1043  // will use a MUBUF load.
1044  // FIXME?: We also need to do this if unaligned, but we don't know the
1045  // alignment here.
1046  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1047  return isLegalGlobalAddressingMode(AM);
1048 
1049  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1050  // SMRD instructions have an 8-bit, dword offset on SI.
1051  if (!isUInt<8>(AM.BaseOffs / 4))
1052  return false;
1053  } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1054  // On CI+, this can also be a 32-bit literal constant offset. If it fits
1055  // in 8-bits, it can use a smaller encoding.
1056  if (!isUInt<32>(AM.BaseOffs / 4))
1057  return false;
1058  } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
1059  // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1060  if (!isUInt<20>(AM.BaseOffs))
1061  return false;
1062  } else
1063  llvm_unreachable("unhandled generation");
1064 
1065  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1066  return true;
1067 
1068  if (AM.Scale == 1 && AM.HasBaseReg)
1069  return true;
1070 
1071  return false;
1072 
1073  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1074  return isLegalMUBUFAddressingMode(AM);
1075  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1076  AS == AMDGPUAS::REGION_ADDRESS) {
1077  // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1078  // field.
1079  // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1080  // an 8-bit dword offset but we don't know the alignment here.
1081  if (!isUInt<16>(AM.BaseOffs))
1082  return false;
1083 
1084  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1085  return true;
1086 
1087  if (AM.Scale == 1 && AM.HasBaseReg)
1088  return true;
1089 
1090  return false;
1091  } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1093  // For an unknown address space, this usually means that this is for some
1094  // reason being used for pure arithmetic, and not based on some addressing
1095  // computation. We don't have instructions that compute pointers with any
1096  // addressing modes, so treat them as having no offset like flat
1097  // instructions.
1098  return isLegalFlatAddressingMode(AM);
1099  } else {
1100  llvm_unreachable("unhandled address space");
1101  }
1102 }
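// Example for the constant-address (SMRD) path above: base + 1020 is legal on
// SI (1020 / 4 = 255 fits the 8-bit dword offset) but base + 1024 is not
// (256 dwords); on VI the same access is legal because the 20-bit byte offset
// covers it. Values follow directly from the generation checks above.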
1103 
1104 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1105  const SelectionDAG &DAG) const {
1106  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1107  return (MemVT.getSizeInBits() <= 4 * 32);
1108  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1109  unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1110  return (MemVT.getSizeInBits() <= MaxPrivateBits);
1111  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
1112  return (MemVT.getSizeInBits() <= 2 * 32);
1113  }
1114  return true;
1115 }
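// In other words, adjacent stores may be merged up to 128 bits (a dwordx4) in
// global/flat address spaces, up to 64 bits in LDS, and up to the subtarget's
// maximum private element size for scratch.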
1116 
1117 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1118  unsigned AddrSpace,
1119  unsigned Align,
1120  bool *IsFast) const {
1121  if (IsFast)
1122  *IsFast = false;
1123 
1124  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1125  // which isn't a simple VT.
1126  // Until MVT is extended to handle this, simply check for the size and
1127  // rely on the condition below: allow accesses if the size is a multiple of 4.
1128  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1129  VT.getStoreSize() > 16)) {
1130  return false;
1131  }
1132 
1133  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1134  AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1135  // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1136  // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1137  // with adjacent offsets.
1138  bool AlignedBy4 = (Align % 4 == 0);
1139  if (IsFast)
1140  *IsFast = AlignedBy4;
1141 
1142  return AlignedBy4;
1143  }
1144 
1145  // FIXME: We have to be conservative here and assume that flat operations
1146  // will access scratch. If we had access to the IR function, then we
1147  // could determine if any private memory was used in the function.
1148  if (!Subtarget->hasUnalignedScratchAccess() &&
1149  (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1150  AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
1151  bool AlignedBy4 = Align >= 4;
1152  if (IsFast)
1153  *IsFast = AlignedBy4;
1154 
1155  return AlignedBy4;
1156  }
1157 
1158  if (Subtarget->hasUnalignedBufferAccess()) {
1160  // If we have a uniform constant load, it still requires using a slow
1160  // buffer instruction if unaligned.
1161  if (IsFast) {
1162  *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1163  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1164  (Align % 4 == 0) : true;
1165  }
1166 
1167  return true;
1168  }
1169 
1170  // Values smaller than a dword must be aligned.
1171  if (VT.bitsLT(MVT::i32))
1172  return false;
1173 
1174  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1175  // byte-address are ignored, thus forcing Dword alignment.
1176  // This applies to private, global, and constant memory.
1177  if (IsFast)
1178  *IsFast = true;
1179 
1180  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1181 }
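// Example: a 4-byte aligned i64 access to LDS is reported as allowed (and
// fast), since it can be emitted as ds_read2/write2_b32, while a 2-byte
// aligned i32 access to scratch is rejected on subtargets without unaligned
// scratch access.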
1182 
1183 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
1184  unsigned SrcAlign, bool IsMemset,
1185  bool ZeroMemset,
1186  bool MemcpyStrSrc,
1187  MachineFunction &MF) const {
1188  // FIXME: Should account for address space here.
1189 
1190  // The default fallback uses the private pointer size as a guess for a type to
1191  // use. Make sure we switch these to 64-bit accesses.
1192 
1193  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
1194  return MVT::v4i32;
1195 
1196  if (Size >= 8 && DstAlign >= 4)
1197  return MVT::v2i32;
1198 
1199  // Use the default.
1200  return MVT::Other;
1201 }
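// Example: a 32-byte memcpy whose destination is at least dword-aligned is
// lowered with v4i32 (dwordx4) accesses rather than the default guess based on
// the private pointer size; an 8-byte copy with the same alignment uses v2i32.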
1202 
1203 static bool isFlatGlobalAddrSpace(unsigned AS) {
1204  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
1205  AS == AMDGPUAS::FLAT_ADDRESS ||
1207 }
1208 
1209 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1210  unsigned DestAS) const {
1211  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
1212 }
1213 
1215  const MemSDNode *MemNode = cast<MemSDNode>(N);
1216  const Value *Ptr = MemNode->getMemOperand()->getValue();
1217  const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
1218  return I && I->getMetadata("amdgpu.noclobber");
1219 }
1220 
1222  unsigned DestAS) const {
1223  // Flat -> private/local is a simple truncate.
1224  // Flat -> global is no-op
1225  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1226  return true;
1227 
1228  return isNoopAddrSpaceCast(SrcAS, DestAS);
1229 }
1230 
1232  const MemSDNode *MemNode = cast<MemSDNode>(N);
1233 
1234  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1235 }
1236 
1239  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
1240  return TypeSplitVector;
1241 
1243 }
1244 
1246  Type *Ty) const {
1247  // FIXME: Could be smarter if called for vector constants.
1248  return true;
1249 }
1250 
1251 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1252  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1253  switch (Op) {
1254  case ISD::LOAD:
1255  case ISD::STORE:
1256 
1257  // These operations are done with 32-bit instructions anyway.
1258  case ISD::AND:
1259  case ISD::OR:
1260  case ISD::XOR:
1261  case ISD::SELECT:
1262  // TODO: Extensions?
1263  return true;
1264  default:
1265  return false;
1266  }
1267  }
1268 
1269  // SimplifySetCC uses this function to determine whether or not it should
1270  // create setcc with i1 operands. We don't have instructions for i1 setcc.
1271  if (VT == MVT::i1 && Op == ISD::SETCC)
1272  return false;
1273 
1274  return TargetLowering::isTypeDesirableForOp(Op, VT);
1275 }
1276 
1277 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1278  const SDLoc &SL,
1279  SDValue Chain,
1280  uint64_t Offset) const {
1281  const DataLayout &DL = DAG.getDataLayout();
1282  MachineFunction &MF = DAG.getMachineFunction();
1284 
1285  const ArgDescriptor *InputPtrReg;
1286  const TargetRegisterClass *RC;
1287 
1288  std::tie(InputPtrReg, RC)
1290 
1293  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1294  MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1295 
1296  return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
1297 }
1298 
1299 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1300  const SDLoc &SL) const {
1301  uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1302  FIRST_IMPLICIT);
1303  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1304 }
1305 
1306 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1307  const SDLoc &SL, SDValue Val,
1308  bool Signed,
1309  const ISD::InputArg *Arg) const {
1310  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1311  VT.bitsLT(MemVT)) {
1312  unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1313  Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1314  }
1315 
1316  if (MemVT.isFloatingPoint())
1317  Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
1318  else if (Signed)
1319  Val = DAG.getSExtOrTrunc(Val, SL, VT);
1320  else
1321  Val = DAG.getZExtOrTrunc(Val, SL, VT);
1322 
1323  return Val;
1324 }
1325 
1326 SDValue SITargetLowering::lowerKernargMemParameter(
1327  SelectionDAG &DAG, EVT VT, EVT MemVT,
1328  const SDLoc &SL, SDValue Chain,
1329  uint64_t Offset, unsigned Align, bool Signed,
1330  const ISD::InputArg *Arg) const {
1331  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
1333  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1334 
1335  // Try to avoid using an extload by loading earlier than the argument address,
1336  // and extracting the relevant bits. The load should hopefully be merged with
1337  // the previous argument.
1338  if (MemVT.getStoreSize() < 4 && Align < 4) {
1339  // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1340  int64_t AlignDownOffset = alignDown(Offset, 4);
1341  int64_t OffsetDiff = Offset - AlignDownOffset;
1342 
1343  EVT IntVT = MemVT.changeTypeToInteger();
1344 
1345  // TODO: If we passed in the base kernel offset we could have a better
1346  // alignment than 4, but we don't really need it.
1347  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1348  SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1351 
1352  SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1353  SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1354 
1355  SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1356  ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1357  ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1358 
1359 
1360  return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1361  }
1362 
1363  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1364  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
1367 
1368  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1369  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1370 }
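// Example of the sub-dword path above: an i16 kernel argument at byte offset 6
// is loaded as the aligned i32 at offset 4, shifted right by (6 - 4) * 8 = 16
// bits, truncated, and converted to the expected type, so it shares a single
// dword load with the argument at offset 4.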
1371 
1372 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1373  const SDLoc &SL, SDValue Chain,
1374  const ISD::InputArg &Arg) const {
1375  MachineFunction &MF = DAG.getMachineFunction();
1376  MachineFrameInfo &MFI = MF.getFrameInfo();
1377 
1378  if (Arg.Flags.isByVal()) {
1379  unsigned Size = Arg.Flags.getByValSize();
1380  int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1381  return DAG.getFrameIndex(FrameIdx, MVT::i32);
1382  }
1383 
1384  unsigned ArgOffset = VA.getLocMemOffset();
1385  unsigned ArgSize = VA.getValVT().getStoreSize();
1386 
1387  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1388 
1389  // Create load nodes to retrieve arguments from the stack.
1390  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1391  SDValue ArgValue;
1392 
1393  // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
1395  MVT MemVT = VA.getValVT();
1396 
1397  switch (VA.getLocInfo()) {
1398  default:
1399  break;
1400  case CCValAssign::BCvt:
1401  MemVT = VA.getLocVT();
1402  break;
1403  case CCValAssign::SExt:
1404  ExtType = ISD::SEXTLOAD;
1405  break;
1406  case CCValAssign::ZExt:
1407  ExtType = ISD::ZEXTLOAD;
1408  break;
1409  case CCValAssign::AExt:
1410  ExtType = ISD::EXTLOAD;
1411  break;
1412  }
1413 
1414  ArgValue = DAG.getExtLoad(
1415  ExtType, SL, VA.getLocVT(), Chain, FIN,
1417  MemVT);
1418  return ArgValue;
1419 }
1420 
1421 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1422  const SIMachineFunctionInfo &MFI,
1423  EVT VT,
1425  const ArgDescriptor *Reg;
1426  const TargetRegisterClass *RC;
1427 
1428  std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1429  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1430 }
1431 
1433  CallingConv::ID CallConv,
1435  BitVector &Skipped,
1436  FunctionType *FType,
1438  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1439  const ISD::InputArg *Arg = &Ins[I];
1440 
1441  assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1442  "vector type argument should have been split");
1443 
1444  // First check if it's a PS input addr.
1445  if (CallConv == CallingConv::AMDGPU_PS &&
1446  !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
1447 
1448  bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1449 
1450  // Inconveniently only the first part of the split is marked as isSplit,
1451  // so skip to the end. We only want to increment PSInputNum once for the
1452  // entire split argument.
1453  if (Arg->Flags.isSplit()) {
1454  while (!Arg->Flags.isSplitEnd()) {
1455  assert(!Arg->VT.isVector() &&
1456  "unexpected vector split in ps argument type");
1457  if (!SkipArg)
1458  Splits.push_back(*Arg);
1459  Arg = &Ins[++I];
1460  }
1461  }
1462 
1463  if (SkipArg) {
1464  // We can safely skip PS inputs.
1465  Skipped.set(Arg->getOrigArgIndex());
1466  ++PSInputNum;
1467  continue;
1468  }
1469 
1470  Info->markPSInputAllocated(PSInputNum);
1471  if (Arg->Used)
1472  Info->markPSInputEnabled(PSInputNum);
1473 
1474  ++PSInputNum;
1475  }
1476 
1477  Splits.push_back(*Arg);
1478  }
1479 }
1480 
1481 // Allocate special inputs passed in VGPRs.
1483  MachineFunction &MF,
1484  const SIRegisterInfo &TRI,
1486  if (Info.hasWorkItemIDX()) {
1487  unsigned Reg = AMDGPU::VGPR0;
1488  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1489 
1490  CCInfo.AllocateReg(Reg);
1492  }
1493 
1494  if (Info.hasWorkItemIDY()) {
1495  unsigned Reg = AMDGPU::VGPR1;
1496  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1497 
1498  CCInfo.AllocateReg(Reg);
1500  }
1501 
1502  if (Info.hasWorkItemIDZ()) {
1503  unsigned Reg = AMDGPU::VGPR2;
1504  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1505 
1506  CCInfo.AllocateReg(Reg);
1508  }
1509 }
1510 
1511 // Try to allocate a VGPR at the end of the argument list, or, if no argument
1512 // VGPRs are left, allocate a stack slot.
1514  ArrayRef<MCPhysReg> ArgVGPRs
1515  = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1516  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1517  if (RegIdx == ArgVGPRs.size()) {
1518  // Spill to stack required.
1519  int64_t Offset = CCInfo.AllocateStack(4, 4);
1520 
1521  return ArgDescriptor::createStack(Offset);
1522  }
1523 
1524  unsigned Reg = ArgVGPRs[RegIdx];
1525  Reg = CCInfo.AllocateReg(Reg);
1526  assert(Reg != AMDGPU::NoRegister);
1527 
1528  MachineFunction &MF = CCInfo.getMachineFunction();
1529  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1530  return ArgDescriptor::createRegister(Reg);
1531 }
1532 
1534  const TargetRegisterClass *RC,
1535  unsigned NumArgRegs) {
1536  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1537  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1538  if (RegIdx == ArgSGPRs.size())
1539  report_fatal_error("ran out of SGPRs for arguments");
1540 
1541  unsigned Reg = ArgSGPRs[RegIdx];
1542  Reg = CCInfo.AllocateReg(Reg);
1543  assert(Reg != AMDGPU::NoRegister);
1544 
1545  MachineFunction &MF = CCInfo.getMachineFunction();
1546  MF.addLiveIn(Reg, RC);
1547  return ArgDescriptor::createRegister(Reg);
1548 }
1549 
1551  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1552 }
1553 
1555  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1556 }
1557 
1559  MachineFunction &MF,
1560  const SIRegisterInfo &TRI,
1562  if (Info.hasWorkItemIDX())
1563  Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
1564 
1565  if (Info.hasWorkItemIDY())
1566  Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
1567 
1568  if (Info.hasWorkItemIDZ())
1569  Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1570 }
1571 
1573  MachineFunction &MF,
1574  const SIRegisterInfo &TRI,
1576  auto &ArgInfo = Info.getArgInfo();
1577 
1578  // TODO: Unify handling with private memory pointers.
1579 
1580  if (Info.hasDispatchPtr())
1581  ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1582 
1583  if (Info.hasQueuePtr())
1584  ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1585 
1586  if (Info.hasKernargSegmentPtr())
1587  ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1588 
1589  if (Info.hasDispatchID())
1590  ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1591 
1592  // flat_scratch_init is not applicable for non-kernel functions.
1593 
1594  if (Info.hasWorkGroupIDX())
1595  ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1596 
1597  if (Info.hasWorkGroupIDY())
1598  ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1599 
1600  if (Info.hasWorkGroupIDZ())
1601  ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
1602 
1603  if (Info.hasImplicitArgPtr())
1604  ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
1605 }
1606 
1607 // Allocate special inputs passed in user SGPRs.
1608 static void allocateHSAUserSGPRs(CCState &CCInfo,
1609  MachineFunction &MF,
1610  const SIRegisterInfo &TRI,
1612  if (Info.hasImplicitBufferPtr()) {
1613  unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1614  MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1615  CCInfo.AllocateReg(ImplicitBufferPtrReg);
1616  }
1617 
1618  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1619  if (Info.hasPrivateSegmentBuffer()) {
1620  unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1621  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1622  CCInfo.AllocateReg(PrivateSegmentBufferReg);
1623  }
1624 
1625  if (Info.hasDispatchPtr()) {
1626  unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1627  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1628  CCInfo.AllocateReg(DispatchPtrReg);
1629  }
1630 
1631  if (Info.hasQueuePtr()) {
1632  unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1633  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1634  CCInfo.AllocateReg(QueuePtrReg);
1635  }
1636 
1637  if (Info.hasKernargSegmentPtr()) {
1638  unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1639  MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1640  CCInfo.AllocateReg(InputPtrReg);
1641  }
1642 
1643  if (Info.hasDispatchID()) {
1644  unsigned DispatchIDReg = Info.addDispatchID(TRI);
1645  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1646  CCInfo.AllocateReg(DispatchIDReg);
1647  }
1648 
1649  if (Info.hasFlatScratchInit()) {
1650  unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1651  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1652  CCInfo.AllocateReg(FlatScratchInitReg);
1653  }
1654 
1655  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1656  // these from the dispatch pointer.
1657 }
1658 
1659 // Allocate special input registers that are initialized per-wave.
1660 static void allocateSystemSGPRs(CCState &CCInfo,
1661  MachineFunction &MF,
1663  CallingConv::ID CallConv,
1664  bool IsShader) {
1665  if (Info.hasWorkGroupIDX()) {
1666  unsigned Reg = Info.addWorkGroupIDX();
1667  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1668  CCInfo.AllocateReg(Reg);
1669  }
1670 
1671  if (Info.hasWorkGroupIDY()) {
1672  unsigned Reg = Info.addWorkGroupIDY();
1673  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1674  CCInfo.AllocateReg(Reg);
1675  }
1676 
1677  if (Info.hasWorkGroupIDZ()) {
1678  unsigned Reg = Info.addWorkGroupIDZ();
1679  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1680  CCInfo.AllocateReg(Reg);
1681  }
1682 
1683  if (Info.hasWorkGroupInfo()) {
1684  unsigned Reg = Info.addWorkGroupInfo();
1685  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1686  CCInfo.AllocateReg(Reg);
1687  }
1688 
1689  if (Info.hasPrivateSegmentWaveByteOffset()) {
1690  // Scratch wave offset passed in system SGPR.
1691  unsigned PrivateSegmentWaveByteOffsetReg;
1692 
1693  if (IsShader) {
1694  PrivateSegmentWaveByteOffsetReg =
1696 
1697  // This is true if the scratch wave byte offset doesn't have a fixed
1698  // location.
1699  if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1700  PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1701  Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1702  }
1703  } else
1704  PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1705 
1706  MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1707  CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1708  }
1709 }
1710 
1712  MachineFunction &MF,
1713  const SIRegisterInfo &TRI,
1715  // Now that we've figured out where the scratch register inputs are, see if we
1716  // should reserve the arguments and use them directly.
1717  MachineFrameInfo &MFI = MF.getFrameInfo();
1718  bool HasStackObjects = MFI.hasStackObjects();
1719 
1720  // Record that we know we have non-spill stack objects so we don't need to
1721  // check all stack objects later.
1722  if (HasStackObjects)
1723  Info.setHasNonSpillStackObjects(true);
1724 
1725  // Everything live out of a block is spilled with fast regalloc, so it's
1726  // almost certain that spilling will be required.
1727  if (TM.getOptLevel() == CodeGenOpt::None)
1728  HasStackObjects = true;
1729 
1730  // For now assume stack access is needed in any callee functions, so we need
1731  // the scratch registers to pass in.
1732  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1733 
1734  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1735  if (ST.isAmdHsaOrMesa(MF.getFunction())) {
1736  if (RequiresStackAccess) {
1737  // If we have stack objects, we unquestionably need the private buffer
1738  // resource. For the Code Object V2 ABI, this will be the first 4 user
1739  // SGPR inputs. We can reserve those and use them directly.
1740 
1741  unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1743  Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1744 
1745  if (MFI.hasCalls()) {
1746  // If we have calls, we need to keep the frame register in a register
1747  // that won't be clobbered by a call, so ensure it is copied somewhere.
1748 
1749  // This is not a problem for the scratch wave offset, because the same
1750  // registers are reserved in all functions.
1751 
1752  // FIXME: Nothing is really ensuring this is a call preserved register,
1753  // it's just selected from the end so it happens to be.
1754  unsigned ReservedOffsetReg
1756  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1757  } else {
1758  unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1760  Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1761  }
1762  } else {
1763  unsigned ReservedBufferReg
1765  unsigned ReservedOffsetReg
1767 
1768  // We tentatively reserve the last registers (skipping the last two
1769  // which may contain VCC). After register allocation, we'll replace
1770  // these with the ones immediately after those which were really
1771  // allocated. In the prologue copies will be inserted from the argument
1772  // to these reserved registers.
1773  Info.setScratchRSrcReg(ReservedBufferReg);
1774  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1775  }
1776  } else {
1777  unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1778 
1779  // Without HSA, relocations are used for the scratch pointer and the
1780  // buffer resource setup is always inserted in the prologue. Scratch wave
1781  // offset is still in an input SGPR.
1782  Info.setScratchRSrcReg(ReservedBufferReg);
1783 
1784  if (HasStackObjects && !MFI.hasCalls()) {
1785  unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1787  Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1788  } else {
1789  unsigned ReservedOffsetReg
1791  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1792  }
1793  }
1794 }
1795 
1798  return !Info->isEntryFunction();
1799 }
1800 
1802 
1803 }
1804 
1806  MachineBasicBlock *Entry,
1807  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1809 
1810  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1811  if (!IStart)
1812  return;
1813 
1814  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1815  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1816  MachineBasicBlock::iterator MBBI = Entry->begin();
1817  for (const MCPhysReg *I = IStart; *I; ++I) {
1818  const TargetRegisterClass *RC = nullptr;
1819  if (AMDGPU::SReg_64RegClass.contains(*I))
1820  RC = &AMDGPU::SGPR_64RegClass;
1821  else if (AMDGPU::SReg_32RegClass.contains(*I))
1822  RC = &AMDGPU::SGPR_32RegClass;
1823  else
1824  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1825 
1826  unsigned NewVR = MRI->createVirtualRegister(RC);
1827  // Create copy from CSR to a virtual register.
1828  Entry->addLiveIn(*I);
1829  BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1830  .addReg(*I);
1831 
1832  // Insert the copy-back instructions right before the terminator.
1833  for (auto *Exit : Exits)
1834  BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1835  TII->get(TargetOpcode::COPY), *I)
1836  .addReg(NewVR);
1837  }
1838 }
1839 
1841  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1842  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1843  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1845 
1846  MachineFunction &MF = DAG.getMachineFunction();
1847  const Function &Fn = MF.getFunction();
1848  FunctionType *FType = MF.getFunction().getFunctionType();
1850  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1851 
1852  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
1853  DiagnosticInfoUnsupported NoGraphicsHSA(
1854  Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
1855  DAG.getContext()->diagnose(NoGraphicsHSA);
1856  return DAG.getEntryNode();
1857  }
1858 
1859  // Create stack objects that are used for emitting debugger prologue if
1860  // "amdgpu-debugger-emit-prologue" attribute was specified.
1861  if (ST.debuggerEmitPrologue())
1862  createDebuggerPrologueStackObjects(MF);
1863 
1866  BitVector Skipped(Ins.size());
1867  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1868  *DAG.getContext());
1869 
1870  bool IsShader = AMDGPU::isShader(CallConv);
1871  bool IsKernel = AMDGPU::isKernel(CallConv);
1872  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
1873 
1874  if (!IsEntryFunc) {
1875  // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1876  // this when allocating argument fixed offsets.
1877  CCInfo.AllocateStack(4, 4);
1878  }
1879 
1880  if (IsShader) {
1881  processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1882 
1883  // At least one interpolation mode must be enabled or else the GPU will
1884  // hang.
1885  //
1886  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1887  // set PSInputAddr, the user wants to enable some bits after the compilation
1888  // based on run-time states. Since we can't know what the final PSInputEna
1889  // will look like, we shouldn't do anything here, and the user should take
1890  // responsibility for the correct programming.
1891  //
1892  // Otherwise, the following restrictions apply:
1893  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1894  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1895  // enabled too.
1896  if (CallConv == CallingConv::AMDGPU_PS) {
1897  if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1898  ((Info->getPSInputAddr() & 0xF) == 0 &&
1899  Info->isPSInputAllocated(11))) {
1900  CCInfo.AllocateReg(AMDGPU::VGPR0);
1901  CCInfo.AllocateReg(AMDGPU::VGPR1);
1902  Info->markPSInputAllocated(0);
1903  Info->markPSInputEnabled(0);
1904  }
1905  if (Subtarget->isAmdPalOS()) {
1906  // For isAmdPalOS, the user does not enable some bits after compilation
1907  // based on run-time states; the register values being generated here are
1908  // the final ones set in hardware. Therefore we need to apply the
1909  // workaround to PSInputAddr and PSInputEnable together. (The case where
1910  // a bit is set in PSInputAddr but not PSInputEnable is where the
1911  // frontend set up an input arg for a particular interpolation mode, but
1912  // nothing uses that input arg. Really we should have an earlier pass
1913  // that removes such an arg.)
1914  unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1915  if ((PsInputBits & 0x7F) == 0 ||
1916  ((PsInputBits & 0xF) == 0 &&
1917  (PsInputBits >> 11 & 1)))
1918  Info->markPSInputEnabled(
1920  }
1921  }
1922 
1923  assert(!Info->hasDispatchPtr() &&
1924  !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1925  !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1926  !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1927  !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1928  !Info->hasWorkItemIDZ());
1929  } else if (IsKernel) {
1930  assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
1931  } else {
1932  Splits.append(Ins.begin(), Ins.end());
1933  }
1934 
1935  if (IsEntryFunc) {
1936  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
1937  allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
1938  }
1939 
1940  if (IsKernel) {
1941  analyzeFormalArgumentsCompute(CCInfo, Ins);
1942  } else {
1943  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1944  CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1945  }
1946 
1947  SmallVector<SDValue, 16> Chains;
1948 
1949  // FIXME: This is the minimum kernel argument alignment. We should improve
1950  // this to the maximum alignment of the arguments.
1951  //
1952  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
1953  // kern arg offset.
1954  const unsigned KernelArgBaseAlign = 16;
1955 
1956  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
1957  const ISD::InputArg &Arg = Ins[i];
1958  if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
1959  InVals.push_back(DAG.getUNDEF(Arg.VT));
1960  continue;
1961  }
1962 
1963  CCValAssign &VA = ArgLocs[ArgIdx++];
1964  MVT VT = VA.getLocVT();
1965 
1966  if (IsEntryFunc && VA.isMemLoc()) {
1967  VT = Ins[i].VT;
1968  EVT MemVT = VA.getLocVT();
1969 
1970  const uint64_t Offset = VA.getLocMemOffset();
1971  unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
1972 
1973  SDValue Arg = lowerKernargMemParameter(
1974  DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
1975  Chains.push_back(Arg.getValue(1));
1976 
1977  auto *ParamTy =
1978  dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
1979  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
1980  ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1981  // On SI local pointers are just offsets into LDS, so they are always
1982  // less than 16-bits. On CI and newer they could potentially be
1983  // real pointers, so we can't guarantee their size.
1984  Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
1985  DAG.getValueType(MVT::i16));
1986  }
1987 
1988  InVals.push_back(Arg);
1989  continue;
1990  } else if (!IsEntryFunc && VA.isMemLoc()) {
1991  SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
1992  InVals.push_back(Val);
1993  if (!Arg.Flags.isByVal())
1994  Chains.push_back(Val.getValue(1));
1995  continue;
1996  }
1997 
1998  assert(VA.isRegLoc() && "Parameter must be in a register!");
1999 
2000  unsigned Reg = VA.getLocReg();
2001  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
2002  EVT ValVT = VA.getValVT();
2003 
2004  Reg = MF.addLiveIn(Reg, RC);
2005  SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2006 
2007  if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
2008  // The return object should be reasonably addressable.
2009 
2010  // FIXME: This helps when the return is a real sret. If it is a
2011  // automatically inserted sret (i.e. CanLowerReturn returns false), an
2012  // extra copy is inserted in SelectionDAGBuilder which obscures this.
2013  unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
2014  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2015  DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2016  }
2017 
2018  // If this is an 8 or 16-bit value, it is really passed promoted
2019  // to 32 bits. Insert an assert[sz]ext to capture this, then
2020  // truncate to the right size.
2021  switch (VA.getLocInfo()) {
2022  case CCValAssign::Full:
2023  break;
2024  case CCValAssign::BCvt:
2025  Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2026  break;
2027  case CCValAssign::SExt:
2028  Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2029  DAG.getValueType(ValVT));
2030  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2031  break;
2032  case CCValAssign::ZExt:
2033  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2034  DAG.getValueType(ValVT));
2035  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2036  break;
2037  case CCValAssign::AExt:
2038  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2039  break;
2040  default:
2041  llvm_unreachable("Unknown loc info!");
2042  }
2043 
2044  InVals.push_back(Val);
2045  }
2046 
2047  if (!IsEntryFunc) {
2048  // Special inputs come after user arguments.
2049  allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
2050  }
2051 
2052  // Start adding system SGPRs.
2053  if (IsEntryFunc) {
2054  allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
2055  } else {
2056  CCInfo.AllocateReg(Info->getScratchRSrcReg());
2057  CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
2058  CCInfo.AllocateReg(Info->getFrameOffsetReg());
2059  allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2060  }
2061 
2062  auto &ArgUsageInfo =
2064  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2065 
2066  unsigned StackArgSize = CCInfo.getNextStackOffset();
2067  Info->setBytesInStackArgArea(StackArgSize);
2068 
2069  return Chains.empty() ? Chain :
2070  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2071 }
2072 
2073 // TODO: If return values can't fit in registers, we should return as many as
2074 // possible in registers before passing on stack.
2076  CallingConv::ID CallConv,
2077  MachineFunction &MF, bool IsVarArg,
2078  const SmallVectorImpl<ISD::OutputArg> &Outs,
2079  LLVMContext &Context) const {
2080  // Replacing returns with sret/stack usage doesn't make sense for shaders.
2081  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2082  // for shaders. Vector types should be explicitly handled by CC.
2083  if (AMDGPU::isEntryFunctionCC(CallConv))
2084  return true;
2085 
2087  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2088  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2089 }
2090 
2091 SDValue
2093  bool isVarArg,
2094  const SmallVectorImpl<ISD::OutputArg> &Outs,
2095  const SmallVectorImpl<SDValue> &OutVals,
2096  const SDLoc &DL, SelectionDAG &DAG) const {
2097  MachineFunction &MF = DAG.getMachineFunction();
2099 
2100  if (AMDGPU::isKernel(CallConv)) {
2101  return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2102  OutVals, DL, DAG);
2103  }
2104 
2105  bool IsShader = AMDGPU::isShader(CallConv);
2106 
2107  Info->setIfReturnsVoid(Outs.empty());
2108  bool IsWaveEnd = Info->returnsVoid() && IsShader;
2109 
2110  // CCValAssign - represent the assignment of the return value to a location.
2113 
2114  // CCState - Info about the registers and stack slots.
2115  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2116  *DAG.getContext());
2117 
2118  // Analyze outgoing return values.
2119  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2120 
2121  SDValue Flag;
2122  SmallVector<SDValue, 48> RetOps;
2123  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2124 
2125  // Add return address for callable functions.
2126  if (!Info->isEntryFunction()) {
2128  SDValue ReturnAddrReg = CreateLiveInRegister(
2129  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2130 
2131  // FIXME: Should be able to use a vreg here, but need a way to prevent it
2132  // from being allocated to a CSR.
2133 
2134  SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2135  MVT::i64);
2136 
2137  Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
2138  Flag = Chain.getValue(1);
2139 
2140  RetOps.push_back(PhysReturnAddrReg);
2141  }
2142 
2143  // Copy the result values into the output registers.
2144  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2145  ++I, ++RealRVLocIdx) {
2146  CCValAssign &VA = RVLocs[I];
2147  assert(VA.isRegLoc() && "Can only return in registers!");
2148  // TODO: Partially return in registers if return values don't fit.
2149  SDValue Arg = OutVals[RealRVLocIdx];
2150 
2151  // Copied from other backends.
2152  switch (VA.getLocInfo()) {
2153  case CCValAssign::Full:
2154  break;
2155  case CCValAssign::BCvt:
2156  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2157  break;
2158  case CCValAssign::SExt:
2159  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2160  break;
2161  case CCValAssign::ZExt:
2162  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2163  break;
2164  case CCValAssign::AExt:
2165  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2166  break;
2167  default:
2168  llvm_unreachable("Unknown loc info!");
2169  }
2170 
2171  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2172  Flag = Chain.getValue(1);
2173  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2174  }
2175 
2176  // FIXME: Does sret work properly?
2177  if (!Info->isEntryFunction()) {
2178  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2179  const MCPhysReg *I =
2181  if (I) {
2182  for (; *I; ++I) {
2183  if (AMDGPU::SReg_64RegClass.contains(*I))
2184  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2185  else if (AMDGPU::SReg_32RegClass.contains(*I))
2186  RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2187  else
2188  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2189  }
2190  }
2191  }
2192 
2193  // Update chain and glue.
2194  RetOps[0] = Chain;
2195  if (Flag.getNode())
2196  RetOps.push_back(Flag);
2197 
2198  unsigned Opc = AMDGPUISD::ENDPGM;
2199  if (!IsWaveEnd)
2201  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2202 }
2203 
2205  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2206  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2207  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2208  SDValue ThisVal) const {
2209  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2210 
2211  // Assign locations to each value returned by this call.
2213  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2214  *DAG.getContext());
2215  CCInfo.AnalyzeCallResult(Ins, RetCC);
2216 
2217  // Copy all of the result registers out of their specified physreg.
2218  for (unsigned i = 0; i != RVLocs.size(); ++i) {
2219  CCValAssign VA = RVLocs[i];
2220  SDValue Val;
2221 
2222  if (VA.isRegLoc()) {
2223  Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2224  Chain = Val.getValue(1);
2225  InFlag = Val.getValue(2);
2226  } else if (VA.isMemLoc()) {
2227  report_fatal_error("TODO: return values in memory");
2228  } else
2229  llvm_unreachable("unknown argument location type");
2230 
2231  switch (VA.getLocInfo()) {
2232  case CCValAssign::Full:
2233  break;
2234  case CCValAssign::BCvt:
2235  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2236  break;
2237  case CCValAssign::ZExt:
2238  Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2239  DAG.getValueType(VA.getValVT()));
2240  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2241  break;
2242  case CCValAssign::SExt:
2243  Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2244  DAG.getValueType(VA.getValVT()));
2245  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2246  break;
2247  case CCValAssign::AExt:
2248  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2249  break;
2250  default:
2251  llvm_unreachable("Unknown loc info!");
2252  }
2253 
2254  InVals.push_back(Val);
2255  }
2256 
2257  return Chain;
2258 }
2259 
2260 // Add code to pass the special inputs required by the features in use,
2261 // separate from the explicit user arguments present in the IR.
2263  CallLoweringInfo &CLI,
2264  CCState &CCInfo,
2265  const SIMachineFunctionInfo &Info,
2266  SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2267  SmallVectorImpl<SDValue> &MemOpChains,
2268  SDValue Chain) const {
2269  // If we don't have a call site, this was a call inserted by
2270  // legalization. These can never use special inputs.
2271  if (!CLI.CS)
2272  return;
2273 
2274  const Function *CalleeFunc = CLI.CS.getCalledFunction();
2275  assert(CalleeFunc);
2276 
2277  SelectionDAG &DAG = CLI.DAG;
2278  const SDLoc &DL = CLI.DL;
2279 
2280  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2281 
2282  auto &ArgUsageInfo =
2284  const AMDGPUFunctionArgInfo &CalleeArgInfo
2285  = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2286 
2287  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2288 
2289  // TODO: Unify with private memory register handling. This is complicated by
2290  // the fact that at least in kernels, the incoming argument is not necessarily
2291  // in the same location as the outgoing one.
2304  };
2305 
2306  for (auto InputID : InputRegs) {
2307  const ArgDescriptor *OutgoingArg;
2308  const TargetRegisterClass *ArgRC;
2309 
2310  std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2311  if (!OutgoingArg)
2312  continue;
2313 
2314  const ArgDescriptor *IncomingArg;
2315  const TargetRegisterClass *IncomingArgRC;
2316  std::tie(IncomingArg, IncomingArgRC)
2317  = CallerArgInfo.getPreloadedValue(InputID);
2318  assert(IncomingArgRC == ArgRC);
2319 
2320  // All special arguments are ints for now.
2321  EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2322  SDValue InputReg;
2323 
2324  if (IncomingArg) {
2325  InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2326  } else {
2327  // The implicit arg ptr is special because it doesn't have a corresponding
2328  // input for kernels, and is computed from the kernarg segment pointer.
2329  assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2330  InputReg = getImplicitArgPtr(DAG, DL);
2331  }
2332 
2333  if (OutgoingArg->isRegister()) {
2334  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2335  } else {
2336  unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
2337  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2338  SpecialArgOffset);
2339  MemOpChains.push_back(ArgStore);
2340  }
2341  }
2342 }
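// As an illustrative sketch (hedged; this IR is not taken from the file): for
//
//   define void @callee() {
//     %p = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
//     ret void
//   }
//   define amdgpu_kernel void @kernel() {
//     call void @callee()
//     ret void
//   }
//
// the implicit argument pointer needed by @callee is not an explicit IR
// argument, so the loop above forwards it: into the callee's expected
// register via RegsToPass, or through a reserved stack slot via
// CCInfo.AllocateStack / storeStackInputValue when no register is assigned.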
2343 
2345  return CC == CallingConv::Fast;
2346 }
2347 
2348 /// Return true if we might ever do TCO for calls with this calling convention.
2350  switch (CC) {
2351  case CallingConv::C:
2352  return true;
2353  default:
2354  return canGuaranteeTCO(CC);
2355  }
2356 }
2357 
2359  SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2360  const SmallVectorImpl<ISD::OutputArg> &Outs,
2361  const SmallVectorImpl<SDValue> &OutVals,
2362  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2363  if (!mayTailCallThisCC(CalleeCC))
2364  return false;
2365 
2366  MachineFunction &MF = DAG.getMachineFunction();
2367  const Function &CallerF = MF.getFunction();
2368  CallingConv::ID CallerCC = CallerF.getCallingConv();
2370  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2371 
2372  // Kernels aren't callable, and don't have a live-in return address, so it
2373  // doesn't make sense to do a tail call from an entry function.
2374  if (!CallerPreserved)
2375  return false;
2376 
2377  bool CCMatch = CallerCC == CalleeCC;
2378 
2380  if (canGuaranteeTCO(CalleeCC) && CCMatch)
2381  return true;
2382  return false;
2383  }
2384 
2385  // TODO: Can we handle var args?
2386  if (IsVarArg)
2387  return false;
2388 
2389  for (const Argument &Arg : CallerF.args()) {
2390  if (Arg.hasByValAttr())
2391  return false;
2392  }
2393 
2394  LLVMContext &Ctx = *DAG.getContext();
2395 
2396  // Check that the call results are passed in the same way.
2397  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2398  CCAssignFnForCall(CalleeCC, IsVarArg),
2399  CCAssignFnForCall(CallerCC, IsVarArg)))
2400  return false;
2401 
2402  // The callee has to preserve all registers the caller needs to preserve.
2403  if (!CCMatch) {
2404  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2405  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2406  return false;
2407  }
2408 
2409  // Nothing more to check if the callee is taking no arguments.
2410  if (Outs.empty())
2411  return true;
2412 
2414  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2415 
2416  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2417 
2418  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2419  // If the stack arguments for this call do not fit into our own save area,
2420  // then the call cannot be made a tail call.
2421  // TODO: Is this really necessary?
2422  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2423  return false;
2424 
2425  const MachineRegisterInfo &MRI = MF.getRegInfo();
2426  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2427 }
2428 
2430  if (!CI->isTailCall())
2431  return false;
2432 
2433  const Function *ParentFn = CI->getParent()->getParent();
2434  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2435  return false;
2436 
2437  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2438  return (Attr.getValueAsString() != "true");
2439 }
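// For illustration (hedged; this IR is not taken from the file): a call such as
//
//   define float @caller(float %x) {
//     %r = tail call float @callee(float %x)
//     ret float %r
//   }
//
// passes the checks above as long as @caller is not an entry function and is
// not marked "disable-tail-calls"="true"; the same `tail call` inside an
// amdgpu_kernel is rejected by the isEntryFunctionCC test.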
2440 
2441 // The wave scratch offset register is used as the global base pointer.
2443  SmallVectorImpl<SDValue> &InVals) const {
2444  SelectionDAG &DAG = CLI.DAG;
2445  const SDLoc &DL = CLI.DL;
2447  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2449  SDValue Chain = CLI.Chain;
2450  SDValue Callee = CLI.Callee;
2451  bool &IsTailCall = CLI.IsTailCall;
2452  CallingConv::ID CallConv = CLI.CallConv;
2453  bool IsVarArg = CLI.IsVarArg;
2454  bool IsSibCall = false;
2455  bool IsThisReturn = false;
2456  MachineFunction &MF = DAG.getMachineFunction();
2457 
2458  if (IsVarArg) {
2459  return lowerUnhandledCall(CLI, InVals,
2460  "unsupported call to variadic function ");
2461  }
2462 
2463  if (!CLI.CS.getInstruction())
2464  report_fatal_error("unsupported libcall legalization");
2465 
2466  if (!CLI.CS.getCalledFunction()) {
2467  return lowerUnhandledCall(CLI, InVals,
2468  "unsupported indirect call to function ");
2469  }
2470 
2471  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2472  return lowerUnhandledCall(CLI, InVals,
2473  "unsupported required tail call to function ");
2474  }
2475 
2477  // Note the issue is with the CC of the calling function, not of the call
2478  // itself.
2479  return lowerUnhandledCall(CLI, InVals,
2480  "unsupported call from graphics shader of function ");
2481  }
2482 
2484  if (IsTailCall) {
2485  IsTailCall = isEligibleForTailCallOptimization(
2486  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2487  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2488  report_fatal_error("failed to perform tail call elimination on a call "
2489  "site marked musttail");
2490  }
2491 
2492  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2493 
2494  // A sibling call is one where we're under the usual C ABI and not planning
2495  // to change that but can still do a tail call:
2496  if (!TailCallOpt && IsTailCall)
2497  IsSibCall = true;
2498 
2499  if (IsTailCall)
2500  ++NumTailCalls;
2501  }
2502 
2504 
2505  // Analyze operands of the call, assigning locations to each operand.
2507  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2508  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
2509 
2510  // The first 4 bytes are reserved for the callee's emergency stack slot.
2511  CCInfo.AllocateStack(4, 4);
2512 
2513  CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2514 
2515  // Get a count of how many bytes are to be pushed on the stack.
2516  unsigned NumBytes = CCInfo.getNextStackOffset();
2517 
2518  if (IsSibCall) {
2519  // Since we're not changing the ABI to make this a tail call, the memory
2520  // operands are already available in the caller's incoming argument space.
2521  NumBytes = 0;
2522  }
2523 
2524  // FPDiff is the byte offset of the call's argument area from the callee's.
2525  // Stores to callee stack arguments will be placed in FixedStackSlots offset
2526  // by this amount for a tail call. In a sibling call it must be 0 because the
2527  // caller will deallocate the entire stack and the callee still expects its
2528  // arguments to begin at SP+0. Completely unused for non-tail calls.
2529  int32_t FPDiff = 0;
2530  MachineFrameInfo &MFI = MF.getFrameInfo();
2532 
2533  SDValue CallerSavedFP;
2534 
2535  // Adjust the stack pointer for the new arguments...
2536  // These operations are automatically eliminated by the prolog/epilog pass
2537  if (!IsSibCall) {
2538  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
2539 
2540  unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2541 
2542  // In the HSA case, this should be an identity copy.
2543  SDValue ScratchRSrcReg
2544  = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2545  RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2546 
2547  // TODO: Don't hardcode these registers; get them from the callee function.
2548  SDValue ScratchWaveOffsetReg
2549  = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2550  RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
2551 
2552  if (!Info->isEntryFunction()) {
2553  // Avoid clobbering this function's FP value. In the current convention the
2554  // callee will overwrite it, so save/restore it around the call site.
2555  CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2556  Info->getFrameOffsetReg(), MVT::i32);
2557  }
2558  }
2559 
2560  SmallVector<SDValue, 8> MemOpChains;
2561  MVT PtrVT = MVT::i32;
2562 
2563  // Walk the register/memloc assignments, inserting copies/loads.
2564  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2565  ++i, ++realArgIdx) {
2566  CCValAssign &VA = ArgLocs[i];
2567  SDValue Arg = OutVals[realArgIdx];
2568 
2569  // Promote the value if needed.
2570  switch (VA.getLocInfo()) {
2571  case CCValAssign::Full:
2572  break;
2573  case CCValAssign::BCvt:
2574  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2575  break;
2576  case CCValAssign::ZExt:
2577  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2578  break;
2579  case CCValAssign::SExt:
2580  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2581  break;
2582  case CCValAssign::AExt:
2583  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2584  break;
2585  case CCValAssign::FPExt:
2586  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2587  break;
2588  default:
2589  llvm_unreachable("Unknown loc info!");
2590  }
2591 
2592  if (VA.isRegLoc()) {
2593  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2594  } else {
2595  assert(VA.isMemLoc());
2596 
2597  SDValue DstAddr;
2598  MachinePointerInfo DstInfo;
2599 
2600  unsigned LocMemOffset = VA.getLocMemOffset();
2601  int32_t Offset = LocMemOffset;
2602 
2603  SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
2604  unsigned Align = 0;
2605 
2606  if (IsTailCall) {
2607  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2608  unsigned OpSize = Flags.isByVal() ?
2609  Flags.getByValSize() : VA.getValVT().getStoreSize();
2610 
2611  // FIXME: We can do better than the minimum required byval alignment.
2612  Align = Flags.isByVal() ? Flags.getByValAlign() :
2613  MinAlign(Subtarget->getStackAlignment(), Offset);
2614 
2615  Offset = Offset + FPDiff;
2616  int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2617 
2618  DstAddr = DAG.getFrameIndex(FI, PtrVT);
2619  DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2620 
2621  // Make sure any stack arguments overlapping with where we're storing
2622  // are loaded before this eventual operation. Otherwise they'll be
2623  // clobbered.
2624 
2625  // FIXME: Why is this really necessary? This seems to just result in a
2626  // lot of code to copy the stack arguments and write them back to the same
2627  // locations, which are supposed to be immutable?
2628  Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2629  } else {
2630  DstAddr = PtrOff;
2631  DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
2632  Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
2633  }
2634 
2635  if (Outs[i].Flags.isByVal()) {
2636  SDValue SizeNode =
2637  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2638  SDValue Cpy = DAG.getMemcpy(
2639  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2640  /*isVol = */ false, /*AlwaysInline = */ true,
2641  /*isTailCall = */ false, DstInfo,
2644 
2645  MemOpChains.push_back(Cpy);
2646  } else {
2647  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
2648  MemOpChains.push_back(Store);
2649  }
2650  }
2651  }
2652 
2653  // Copy special input registers after user input arguments.
2654  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
2655 
2656  if (!MemOpChains.empty())
2657  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2658 
2659  // Build a sequence of copy-to-reg nodes chained together with token chain
2660  // and flag operands which copy the outgoing args into the appropriate regs.
2661  SDValue InFlag;
2662  for (auto &RegToPass : RegsToPass) {
2663  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2664  RegToPass.second, InFlag);
2665  InFlag = Chain.getValue(1);
2666  }
2667 
2668 
2669  SDValue PhysReturnAddrReg;
2670  if (IsTailCall) {
2671  // Since the return is being combined with the call, we need to pass on the
2672  // return address.
2673 
2675  SDValue ReturnAddrReg = CreateLiveInRegister(
2676  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2677 
2678  PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2679  MVT::i64);
2680  Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2681  InFlag = Chain.getValue(1);
2682  }
2683 
2684  // We don't usually want to end the call-sequence here because we would tidy
2685  // the frame up *after* the call. However, in the ABI-changing tail-call case
2686  // we've carefully laid out the parameters so that when SP is reset they'll be
2687  // in the correct location.
2688  if (IsTailCall && !IsSibCall) {
2689  Chain = DAG.getCALLSEQ_END(Chain,
2690  DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2691  DAG.getTargetConstant(0, DL, MVT::i32),
2692  InFlag, DL);
2693  InFlag = Chain.getValue(1);
2694  }
2695 
2696  std::vector<SDValue> Ops;
2697  Ops.push_back(Chain);
2698  Ops.push_back(Callee);
2699 
2700  if (IsTailCall) {
2701  // Each tail call may have to adjust the stack by a different amount, so
2702  // this information must travel along with the operation for eventual
2703  // consumption by emitEpilogue.
2704  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
2705 
2706  Ops.push_back(PhysReturnAddrReg);
2707  }
2708 
2709  // Add argument registers to the end of the list so that they are known live
2710  // into the call.
2711  for (auto &RegToPass : RegsToPass) {
2712  Ops.push_back(DAG.getRegister(RegToPass.first,
2713  RegToPass.second.getValueType()));
2714  }
2715 
2716  // Add a register mask operand representing the call-preserved registers.
2717 
2718  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
2719  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2720  assert(Mask && "Missing call preserved mask for calling convention");
2721  Ops.push_back(DAG.getRegisterMask(Mask));
2722 
2723  if (InFlag.getNode())
2724  Ops.push_back(InFlag);
2725 
2726  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2727 
2728  // If we're doing a tail call, use a TC_RETURN here rather than an
2729  // actual call instruction.
2730  if (IsTailCall) {
2731  MFI.setHasTailCall();
2732  return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
2733  }
2734 
2735  // Returns a chain and a flag for retval copy to use.
2736  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2737  Chain = Call.getValue(0);
2738  InFlag = Call.getValue(1);
2739 
2740  if (CallerSavedFP) {
2741  SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2742  Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2743  InFlag = Chain.getValue(1);
2744  }
2745 
2746  uint64_t CalleePopBytes = NumBytes;
2747  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
2748  DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2749  InFlag, DL);
2750  if (!Ins.empty())
2751  InFlag = Chain.getValue(1);
2752 
2753  // Handle result values, copying them out of physregs into vregs that we
2754  // return.
2755  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2756  InVals, IsThisReturn,
2757  IsThisReturn ? OutVals[0] : SDValue());
2758 }
2759 
2760 unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2761  SelectionDAG &DAG) const {
2762  unsigned Reg = StringSwitch<unsigned>(RegName)
2763  .Case("m0", AMDGPU::M0)
2764  .Case("exec", AMDGPU::EXEC)
2765  .Case("exec_lo", AMDGPU::EXEC_LO)
2766  .Case("exec_hi", AMDGPU::EXEC_HI)
2767  .Case("flat_scratch", AMDGPU::FLAT_SCR)
2768  .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2769  .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2770  .Default(AMDGPU::NoRegister);
2771 
2772  if (Reg == AMDGPU::NoRegister) {
2773  report_fatal_error(Twine("invalid register name \""
2774  + StringRef(RegName) + "\"."));
2775 
2776  }
2777 
2778  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2779  Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2780  report_fatal_error(Twine("invalid register \""
2781  + StringRef(RegName) + "\" for subtarget."));
2782  }
2783 
2784  switch (Reg) {
2785  case AMDGPU::M0:
2786  case AMDGPU::EXEC_LO:
2787  case AMDGPU::EXEC_HI:
2788  case AMDGPU::FLAT_SCR_LO:
2789  case AMDGPU::FLAT_SCR_HI:
2790  if (VT.getSizeInBits() == 32)
2791  return Reg;
2792  break;
2793  case AMDGPU::EXEC:
2794  case AMDGPU::FLAT_SCR:
2795  if (VT.getSizeInBits() == 64)
2796  return Reg;
2797  break;
2798  default:
2799  llvm_unreachable("missing register type checking");
2800  }
2801 
2802  report_fatal_error(Twine("invalid type for register \""
2803  + StringRef(RegName) + "\"."));
2804 }
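// For illustration (an assumed use, not taken from this file): this is reached
// for named-register reads such as
//
//   %exec = call i64 @llvm.read_register.i64(metadata !0)
//   !0 = !{!"exec"}
//
// which resolves "exec" to AMDGPU::EXEC for a 64-bit type. Requesting "exec"
// with a 32-bit type, or any flat_scratch register on SOUTHERN_ISLANDS,
// takes one of the report_fatal_error paths above instead.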
2805 
2806 // If kill is not the last instruction, split the block so kill is always a
2807 // proper terminator.
2809  MachineBasicBlock *BB) const {
2810  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2811 
2812  MachineBasicBlock::iterator SplitPoint(&MI);
2813  ++SplitPoint;
2814 
2815  if (SplitPoint == BB->end()) {
2816  // Don't bother with a new block.
2818  return BB;
2819  }
2820 
2821  MachineFunction *MF = BB->getParent();
2822  MachineBasicBlock *SplitBB
2824 
2825  MF->insert(++MachineFunction::iterator(BB), SplitBB);
2826  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2827 
2828  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
2829  BB->addSuccessor(SplitBB);
2830 
2832  return SplitBB;
2833 }
2834 
2835 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2836 // wavefront. If the value is uniform and just happens to be in a VGPR, this
2837 // will only do one iteration. In the worst case, this will loop 64 times.
2838 //
2839 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
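// A rough sketch of the emitted loop (illustrative only; register names are
// placeholders, not actual allocation):
//
//   loop:
//     v_readfirstlane_b32 s_cur, v_idx
//     v_cmp_eq_u32_e64    s_cond, s_cur, v_idx
//     s_and_saveexec_b64  s_saved, s_cond
//     s_mov_b32           m0, s_cur    ; or s_set_gpr_idx_on in GPR-idx mode
//     ;; the indexed access is inserted after this point
//     s_xor_b64           exec, exec, s_saved
//     s_cbranch_execnz    loop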
2841  const SIInstrInfo *TII,
2843  MachineBasicBlock &OrigBB,
2844  MachineBasicBlock &LoopBB,
2845  const DebugLoc &DL,
2846  const MachineOperand &IdxReg,
2847  unsigned InitReg,
2848  unsigned ResultReg,
2849  unsigned PhiReg,
2850  unsigned InitSaveExecReg,
2851  int Offset,
2852  bool UseGPRIdxMode,
2853  bool IsIndirectSrc) {
2854  MachineBasicBlock::iterator I = LoopBB.begin();
2855 
2856  unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2857  unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2858  unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2859  unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2860 
2861  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2862  .addReg(InitReg)
2863  .addMBB(&OrigBB)
2864  .addReg(ResultReg)
2865  .addMBB(&LoopBB);
2866 
2867  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2868  .addReg(InitSaveExecReg)
2869  .addMBB(&OrigBB)
2870  .addReg(NewExec)
2871  .addMBB(&LoopBB);
2872 
2873  // Read the next variant <- also loop target.
2874  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2875  .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2876 
2877  // Compare the just-read index value against all possible Idx values.
2878  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2879  .addReg(CurrentIdxReg)
2880  .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
2881 
2882  // Update EXEC, saving the original EXEC value into NewExec.
2883  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2884  .addReg(CondReg, RegState::Kill);
2885 
2886  MRI.setSimpleHint(NewExec, CondReg);
2887 
2888  if (UseGPRIdxMode) {
2889  unsigned IdxReg;
2890  if (Offset == 0) {
2891  IdxReg = CurrentIdxReg;
2892  } else {
2893  IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2894  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2895  .addReg(CurrentIdxReg, RegState::Kill)
2896  .addImm(Offset);
2897  }
2898  unsigned IdxMode = IsIndirectSrc ?
2900  MachineInstr *SetOn =
2901  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2902  .addReg(IdxReg, RegState::Kill)
2903  .addImm(IdxMode);
2904  SetOn->getOperand(3).setIsUndef();
2905  } else {
2906  // Move the index from CurrentIdxReg into M0.
2907  if (Offset == 0) {
2908  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2909  .addReg(CurrentIdxReg, RegState::Kill);
2910  } else {
2911  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2912  .addReg(CurrentIdxReg, RegState::Kill)
2913  .addImm(Offset);
2914  }
2915  }
2916 
2917  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
2918  MachineInstr *InsertPt =
2919  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
2920  .addReg(AMDGPU::EXEC)
2921  .addReg(NewExec);
2922 
2923  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2924  // s_cbranch_scc0?
2925 
2926  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2927  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2928  .addMBB(&LoopBB);
2929 
2930  return InsertPt->getIterator();
2931 }
2932 
2933 // This has slightly sub-optimal regalloc when the source vector is killed by
2934 // the read. The register allocator does not understand that the kill is
2935 // per-workitem, so the vector is kept alive for the whole loop and we end up
2936 // not reusing a subregister from it, using one more VGPR than necessary. This
2937 // extra register was avoided when this was expanded after register allocation.
2939  MachineBasicBlock &MBB,
2940  MachineInstr &MI,
2941  unsigned InitResultReg,
2942  unsigned PhiReg,
2943  int Offset,
2944  bool UseGPRIdxMode,
2945  bool IsIndirectSrc) {
2946  MachineFunction *MF = MBB.getParent();
2948  const DebugLoc &DL = MI.getDebugLoc();
2950 
2951  unsigned DstReg = MI.getOperand(0).getReg();
2952  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2953  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2954 
2955  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
2956 
2957  // Save the EXEC mask
2958  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
2959  .addReg(AMDGPU::EXEC);
2960 
2961  // To insert the loop we need to split the block. Move everything after this
2962  // point to a new block, and insert a new empty block between the two.
2964  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
2965  MachineFunction::iterator MBBI(MBB);
2966  ++MBBI;
2967 
2968  MF->insert(MBBI, LoopBB);
2969  MF->insert(MBBI, RemainderBB);
2970 
2971  LoopBB->addSuccessor(LoopBB);
2972  LoopBB->addSuccessor(RemainderBB);
2973 
2974  // Move the rest of the block into a new block.
2975  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
2976  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
2977 
2978  MBB.addSuccessor(LoopBB);
2979 
2980  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2981 
2982  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
2983  InitResultReg, DstReg, PhiReg, TmpExec,
2984  Offset, UseGPRIdxMode, IsIndirectSrc);
2985 
2986  MachineBasicBlock::iterator First = RemainderBB->begin();
2987  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
2988  .addReg(SaveExec);
2989 
2990  return InsPt;
2991 }
2992 
2993 // Returns subreg index, offset
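// For example (illustrative): with a 128-bit register class (4 x 32-bit
// elements), an Offset of 2 yields (sub2, 0); an out-of-bounds Offset of 5
// yields (sub0, 5), leaving the remaining offset for the dynamic index.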
2994 static std::pair<unsigned, int>
2996  const TargetRegisterClass *SuperRC,
2997  unsigned VecReg,
2998  int Offset) {
2999  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
3000 
3001  // Skip out of bounds offsets, or else we would end up using an undefined
3002  // register.
3003  if (Offset >= NumElts || Offset < 0)
3004  return std::make_pair(AMDGPU::sub0, Offset);
3005 
3006  return std::make_pair(AMDGPU::sub0 + Offset, 0);
3007 }
3008 
3009 // Return true if the index is an SGPR and was set.
3012  MachineInstr &MI,
3013  int Offset,
3014  bool UseGPRIdxMode,
3015  bool IsIndirectSrc) {
3016  MachineBasicBlock *MBB = MI.getParent();
3017  const DebugLoc &DL = MI.getDebugLoc();
3019 
3020  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3021  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3022 
3023  assert(Idx->getReg() != AMDGPU::NoRegister);
3024 
3025  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
3026  return false;
3027 
3028  if (UseGPRIdxMode) {
3029  unsigned IdxMode = IsIndirectSrc ?
3031  if (Offset == 0) {
3032  MachineInstr *SetOn =
3033  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3034  .add(*Idx)
3035  .addImm(IdxMode);
3036 
3037  SetOn->getOperand(3).setIsUndef();
3038  } else {
3039  unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3040  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
3041  .add(*Idx)
3042  .addImm(Offset);
3043  MachineInstr *SetOn =
3044  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3045  .addReg(Tmp, RegState::Kill)
3046  .addImm(IdxMode);
3047 
3048  SetOn->getOperand(3).setIsUndef();
3049  }
3050 
3051  return true;
3052  }
3053 
3054  if (Offset == 0) {
3055  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3056  .add(*Idx);
3057  } else {
3058  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3059  .add(*Idx)
3060  .addImm(Offset);
3061  }
3062 
3063  return true;
3064 }
3065 
3066 // Control flow needs to be inserted if indexing with a VGPR.
3068  MachineBasicBlock &MBB,
3069  const GCNSubtarget &ST) {
3070  const SIInstrInfo *TII = ST.getInstrInfo();
3071  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3072  MachineFunction *MF = MBB.getParent();
3074 
3075  unsigned Dst = MI.getOperand(0).getReg();
3076  unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3077  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3078 
3079  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3080 
3081  unsigned SubReg;
3082  std::tie(SubReg, Offset)
3083  = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3084 
3085  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3086 
3087  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
3089  const DebugLoc &DL = MI.getDebugLoc();
3090 
3091  if (UseGPRIdxMode) {
3092  // TODO: Look at the uses to avoid the copy. This may require rescheduling
3093  // to avoid interfering with other uses, so probably requires a new
3094  // optimization pass.
3095  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3096  .addReg(SrcReg, RegState::Undef, SubReg)
3097  .addReg(SrcReg, RegState::Implicit)
3098  .addReg(AMDGPU::M0, RegState::Implicit);
3099  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3100  } else {
3101  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3102  .addReg(SrcReg, RegState::Undef, SubReg)
3103  .addReg(SrcReg, RegState::Implicit);
3104  }
3105 
3106  MI.eraseFromParent();
3107 
3108  return &MBB;
3109  }
3110 
3111  const DebugLoc &DL = MI.getDebugLoc();
3113 
3114  unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3115  unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3116 
3117  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3118 
3119  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
3120  Offset, UseGPRIdxMode, true);
3121  MachineBasicBlock *LoopBB = InsPt->getParent();
3122 
3123  if (UseGPRIdxMode) {
3124  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3125  .addReg(SrcReg, RegState::Undef, SubReg)
3126  .addReg(SrcReg, RegState::Implicit)
3127  .addReg(AMDGPU::M0, RegState::Implicit);
3128  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3129  } else {
3130  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3131  .addReg(SrcReg, RegState::Undef, SubReg)
3132  .addReg(SrcReg, RegState::Implicit);
3133  }
3134 
3135  MI.eraseFromParent();
3136 
3137  return LoopBB;
3138 }
3139 
3140 static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
3141  const TargetRegisterClass *VecRC) {
3142  switch (TRI.getRegSizeInBits(*VecRC)) {
3143  case 32: // 4 bytes
3144  return AMDGPU::V_MOVRELD_B32_V1;
3145  case 64: // 8 bytes
3146  return AMDGPU::V_MOVRELD_B32_V2;
3147  case 128: // 16 bytes
3148  return AMDGPU::V_MOVRELD_B32_V4;
3149  case 256: // 32 bytes
3150  return AMDGPU::V_MOVRELD_B32_V8;
3151  case 512: // 64 bytes
3152  return AMDGPU::V_MOVRELD_B32_V16;
3153  default:
3154  llvm_unreachable("unsupported size for MOVRELD pseudos");
3155  }
3156 }
3157 
3159  MachineBasicBlock &MBB,
3160  const GCNSubtarget &ST) {
3161  const SIInstrInfo *TII = ST.getInstrInfo();
3162  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3163  MachineFunction *MF = MBB.getParent();
3165 
3166  unsigned Dst = MI.getOperand(0).getReg();
3167  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3168  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3169  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3170  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3171  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3172 
3173  // This can be an immediate, but will be folded later.
3174  assert(Val->getReg());
3175 
3176  unsigned SubReg;
3177  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3178  SrcVec->getReg(),
3179  Offset);
3180  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3181 
3182  if (Idx->getReg() == AMDGPU::NoRegister) {
3184  const DebugLoc &DL = MI.getDebugLoc();
3185 
3186  assert(Offset == 0);
3187 
3188  BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3189  .add(*SrcVec)
3190  .add(*Val)
3191  .addImm(SubReg);
3192 
3193  MI.eraseFromParent();
3194  return &MBB;
3195  }
3196 
3197  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
3199  const DebugLoc &DL = MI.getDebugLoc();
3200 
3201  if (UseGPRIdxMode) {
3202  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3203  .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
3204  .add(*Val)
3205  .addReg(Dst, RegState::ImplicitDefine)
3206  .addReg(SrcVec->getReg(), RegState::Implicit)
3207  .addReg(AMDGPU::M0, RegState::Implicit);
3208 
3209  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3210  } else {
3211  const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3212 
3213  BuildMI(MBB, I, DL, MovRelDesc)
3214  .addReg(Dst, RegState::Define)
3215  .addReg(SrcVec->getReg())
3216  .add(*Val)
3217  .addImm(SubReg - AMDGPU::sub0);
3218  }
3219 
3220  MI.eraseFromParent();
3221  return &MBB;
3222  }
3223 
3224  if (Val->isReg())
3225  MRI.clearKillFlags(Val->getReg());
3226 
3227  const DebugLoc &DL = MI.getDebugLoc();
3228 
3229  unsigned PhiReg = MRI.createVirtualRegister(VecRC);
3230 
3231  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
3232  Offset, UseGPRIdxMode, false);
3233  MachineBasicBlock *LoopBB = InsPt->getParent();
3234 
3235  if (UseGPRIdxMode) {
3236  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3237  .addReg(PhiReg, RegState::Undef, SubReg) // vdst
3238  .add(*Val) // src0
3240  .addReg(PhiReg, RegState::Implicit)
3241  .addReg(AMDGPU::M0, RegState::Implicit);
3242  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3243  } else {
3244  const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3245 
3246  BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
3247  .addReg(Dst, RegState::Define)
3248  .addReg(PhiReg)
3249  .add(*Val)
3250  .addImm(SubReg - AMDGPU::sub0);
3251  }
3252 
3253  MI.eraseFromParent();
3254 
3255  return LoopBB;
3256 }
3257 
3259  MachineInstr &MI, MachineBasicBlock *BB) const {
3260 
3261  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3262  MachineFunction *MF = BB->getParent();
3264 
3265  if (TII->isMIMG(MI)) {
3266  if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
3267  report_fatal_error("missing mem operand from MIMG instruction");
3268  }
3269  // Add a memoperand for mimg instructions so that they aren't assumed to
3270  // be ordered memory instructions.
3271 
3272  return BB;
3273  }
3274 
3275  switch (MI.getOpcode()) {
3276  case AMDGPU::S_ADD_U64_PSEUDO:
3277  case AMDGPU::S_SUB_U64_PSEUDO: {
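    // Roughly (illustrative), s_add_u64_pseudo dst, a, b expands here into:
    //   s_add_u32  dst.sub0, a.sub0, b.sub0
    //   s_addc_u32 dst.sub1, a.sub1, b.sub1
    //   dst = REG_SEQUENCE dst.sub0, sub0, dst.sub1, sub1
    // with s_sub_u32 / s_subb_u32 used for the subtract pseudo.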
3279  const DebugLoc &DL = MI.getDebugLoc();
3280 
3281  MachineOperand &Dest = MI.getOperand(0);
3282  MachineOperand &Src0 = MI.getOperand(1);
3283  MachineOperand &Src1 = MI.getOperand(2);
3284 
3285  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3286  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3287 
3288  MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3289  Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3290  &AMDGPU::SReg_32_XM0RegClass);
3291  MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3292  Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3293  &AMDGPU::SReg_32_XM0RegClass);
3294 
3295  MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3296  Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3297  &AMDGPU::SReg_32_XM0RegClass);
3298  MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3299  Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3300  &AMDGPU::SReg_32_XM0RegClass);
3301 
3302  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3303 
3304  unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3305  unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3306  BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3307  .add(Src0Sub0)
3308  .add(Src1Sub0);
3309  BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3310  .add(Src0Sub1)
3311  .add(Src1Sub1);
3312  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3313  .addReg(DestSub0)
3314  .addImm(AMDGPU::sub0)
3315  .addReg(DestSub1)
3316  .addImm(AMDGPU::sub1);
3317  MI.eraseFromParent();
3318  return BB;
3319  }
3320  case AMDGPU::SI_INIT_M0: {
3321  BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
3322  TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3323  .add(MI.getOperand(0));
3324  MI.eraseFromParent();
3325  return BB;
3326  }
3327  case AMDGPU::SI_INIT_EXEC:
3328  // This should be before all vector instructions.
3329  BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3330  AMDGPU::EXEC)
3331  .addImm(MI.getOperand(0).getImm());
3332  MI.eraseFromParent();
3333  return BB;
3334 
3335  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3336  // Extract the thread count from an SGPR input and set EXEC accordingly.
3337  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3338  //
3339  // S_BFE_U32 count, input, {shift, 7}
3340  // S_BFM_B64 exec, count, 0
3341  // S_CMP_EQ_U32 count, 64
3342  // S_CMOV_B64 exec, -1
3343  MachineInstr *FirstMI = &*BB->begin();
3345  unsigned InputReg = MI.getOperand(0).getReg();
3346  unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3347  bool Found = false;
3348 
3349  // Move the COPY of the input reg to the beginning, so that we can use it.
3350  for (auto I = BB->begin(); I != &MI; I++) {
3351  if (I->getOpcode() != TargetOpcode::COPY ||
3352  I->getOperand(0).getReg() != InputReg)
3353  continue;
3354 
3355  if (I == FirstMI) {
3356  FirstMI = &*++BB->begin();
3357  } else {
3358  I->removeFromParent();
3359  BB->insert(FirstMI, &*I);
3360  }
3361  Found = true;
3362  break;
3363  }
3364  assert(Found);
3365  (void)Found;
3366 
3367  // This should be before all vector instructions.
3368  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3369  .addReg(InputReg)
3370  .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3371  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3372  AMDGPU::EXEC)
3373  .addReg(CountReg)
3374  .addImm(0);
3375  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3376  .addReg(CountReg, RegState::Kill)
3377  .addImm(64);
3378  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3379  AMDGPU::EXEC)
3380  .addImm(-1);
3381  MI.eraseFromParent();
3382  return BB;
3383  }
3384 
3385  case AMDGPU::GET_GROUPSTATICSIZE: {
3386  DebugLoc DL = MI.getDebugLoc();
3387  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
3388  .add(MI.getOperand(0))
3389  .addImm(MFI->getLDSSize());
3390  MI.eraseFromParent();
3391  return BB;
3392  }
3393  case AMDGPU::SI_INDIRECT_SRC_V1:
3394  case AMDGPU::SI_INDIRECT_SRC_V2:
3395  case AMDGPU::SI_INDIRECT_SRC_V4:
3396  case AMDGPU::SI_INDIRECT_SRC_V8:
3397  case AMDGPU::SI_INDIRECT_SRC_V16:
3398  return emitIndirectSrc(MI, *BB, *getSubtarget());
3399  case AMDGPU::SI_INDIRECT_DST_V1:
3400  case AMDGPU::SI_INDIRECT_DST_V2:
3401  case AMDGPU::SI_INDIRECT_DST_V4:
3402  case AMDGPU::SI_INDIRECT_DST_V8:
3403  case AMDGPU::SI_INDIRECT_DST_V16:
3404  return emitIndirectDst(MI, *BB, *getSubtarget());
3405  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3406  case AMDGPU::SI_KILL_I1_PSEUDO:
3407  return splitKillBlock(MI, BB);
3408  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3410 
3411  unsigned Dst = MI.getOperand(0).getReg();
3412  unsigned Src0 = MI.getOperand(1).getReg();
3413  unsigned Src1 = MI.getOperand(2).getReg();
3414  const DebugLoc &DL = MI.getDebugLoc();
3415  unsigned SrcCond = MI.getOperand(3).getReg();
3416 
3417  unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3418  unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3419  unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3420 
3421  BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3422  .addReg(SrcCond);
3423  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3424  .addReg(Src0, 0, AMDGPU::sub0)
3425  .addReg(Src1, 0, AMDGPU::sub0)
3426  .addReg(SrcCondCopy);
3427  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3428  .addReg(Src0, 0, AMDGPU::sub1)
3429  .addReg(Src1, 0, AMDGPU::sub1)
3430  .addReg(SrcCondCopy);
3431 
3432  BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3433  .addReg(DstLo)
3434  .addImm(AMDGPU::sub0)
3435  .addReg(DstHi)
3436  .addImm(AMDGPU::sub1);
3437  MI.eraseFromParent();
3438  return BB;
3439  }
3440  case AMDGPU::SI_BR_UNDEF: {
3441  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3442  const DebugLoc &DL = MI.getDebugLoc();
3443  MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3444  .add(MI.getOperand(0));
3445  Br->getOperand(1).setIsUndef(true); // read undef SCC
3446  MI.eraseFromParent();
3447  return BB;
3448  }
3449  case AMDGPU::ADJCALLSTACKUP:
3450  case AMDGPU::ADJCALLSTACKDOWN: {
3452  MachineInstrBuilder MIB(*MF, &MI);
3453 
3454  // Add an implicit use of the frame offset reg to prevent the restore copy
3455  // inserted after the call from being reordered after stack operations in
3456  // the caller's frame.
3457  MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
3458  .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3459  .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
3460  return BB;
3461  }
3462  case AMDGPU::SI_CALL_ISEL:
3463  case AMDGPU::SI_TCRETURN_ISEL: {
3464  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3465  const DebugLoc &DL = MI.getDebugLoc();
3466  unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
3467 
3469  unsigned GlobalAddrReg = MI.getOperand(0).getReg();
3470  MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
3471  assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
3472 
3473  const GlobalValue *G = PCRel->getOperand(1).getGlobal();
3474 
3475  MachineInstrBuilder MIB;
3476  if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
3477  MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
3478  .add(MI.getOperand(0))
3479  .addGlobalAddress(G);
3480  } else {
3481  MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
3482  .add(MI.getOperand(0))
3483  .addGlobalAddress(G);
3484 
3485  // There is an additional imm operand for tcreturn, but it should be in the
3486  // right place already.
3487  }
3488 
3489  for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3490  MIB.add(MI.getOperand(I));
3491 
3492  MIB.cloneMemRefs(MI);
3493  MI.eraseFromParent();
3494  return BB;
3495  }
3496  default:
3498  }
3499 }
3500 
3502  return isTypeLegal(VT.getScalarType());
3503 }
3504 
3506  // This currently forces unfolding various combinations of fsub into fma with
3507  // free fneg'd operands. As long as we have fast FMA (controlled by
3508  // isFMAFasterThanFMulAndFAdd), we should perform these.
3509 
3510  // When fma is quarter rate (e.g. for f64, where add / sub are at best half
3511  // rate), most of these combines appear to be cycle neutral but save on
3512  // instruction count / code size.
3513  return true;
3514 }
3515 
3517  EVT VT) const {
3518  if (!VT.isVector()) {
3519  return MVT::i1;
3520  }
3521  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
3522 }
3523 
3525  // TODO: Should i16 always be used if legal? For now it would force VALU
3526  // shifts.
3527  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
3528 }
3529 
3530 // Answering this is somewhat tricky and depends on the specific device, since
3531 // different devices have different rates for fma and for f64 operations.
3532 //
3533 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3534 // regardless of which device (although the number of cycles differs between
3535 // devices), so it is always profitable for f64.
3536 //
3537 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3538 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
3539 // which we can always do even without fused FP ops since it returns the same
3540 // result as the separate operations and since it is always full
3541 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3542 // however does not support denormals, so we do report fma as faster if we have
3543 // a fast fma device and require denormals.
3544 //
3546  VT = VT.getScalarType();
3547 
3548  switch (VT.getSimpleVT().SimpleTy) {
3549  case MVT::f32: {
3550  // This is as fast on some subtargets. However, we always have full rate f32
3551  // mad available, which returns the same result as the separate operations and
3552  // which we should prefer over fma. We can't use mad if we want to support
3553  // denormals, so only report fma as faster in that case.
3554  if (Subtarget->hasFP32Denormals())
3555  return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
3556 
3557  // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3558  return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
3559  }
3560  case MVT::f64:
3561  return true;
3562  case MVT::f16:
3563  return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
3564  default:
3565  break;
3566  }
3567 
3568  return false;
3569 }
3570 
3571 //===----------------------------------------------------------------------===//
3572 // Custom DAG Lowering Operations
3573 //===----------------------------------------------------------------------===//
3574 
3575 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3576 // wider vector type is legal.
3578  SelectionDAG &DAG) const {
3579  unsigned Opc = Op.getOpcode();
3580  EVT VT = Op.getValueType();
3581  assert(VT == MVT::v4f16);
3582 
3583  SDValue Lo, Hi;
3584  std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
3585 
3586  SDLoc SL(Op);
3587  SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
3588  Op->getFlags());
3589  SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
3590  Op->getFlags());
3591 
3592  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3593 }
3594 
3595 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3596 // wider vector type is legal.
3598  SelectionDAG &DAG) const {
3599  unsigned Opc = Op.getOpcode();
3600  EVT VT = Op.getValueType();
3601  assert(VT == MVT::v4i16 || VT == MVT::v4f16);
3602 
3603  SDValue Lo0, Hi0;
3604  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
3605  SDValue Lo1, Hi1;
3606  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
3607 
3608  SDLoc SL(Op);
3609 
3610  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
3611  Op->getFlags());
3612  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
3613  Op->getFlags());
3614 
3615  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3616 }
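// Illustrative example (hedged): an fadd on v4f16 is split here into two
// v2f16 fadds on the low and high halves and recombined, roughly
//   (fadd v4f16 a, b)
//     -> (concat_vectors (fadd v2f16 a.lo, b.lo), (fadd v2f16 a.hi, b.hi))
// which, per the comment above, avoids LegalizeDAG fully scalarizing the
// wider operation.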
3617 
3619  switch (Op.getOpcode()) {
3620  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
3621  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
3622  case ISD::LOAD: {
3623  SDValue Result = LowerLOAD(Op, DAG);
3624  assert((!Result.getNode() ||
3625  Result.getNode()->getNumValues() == 2) &&
3626  "Load should return a value and a chain");
3627  return Result;
3628  }
3629 
3630  case ISD::FSIN:
3631  case ISD::FCOS:
3632  return LowerTrig(Op, DAG);
3633  case ISD::SELECT: return LowerSELECT(Op, DAG);
3634  case ISD::FDIV: return LowerFDIV(Op, DAG);
3635  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
3636  case ISD::STORE: return LowerSTORE(Op, DAG);
3637  case ISD::GlobalAddress: {
3638  MachineFunction &MF = DAG.getMachineFunction();
3640  return LowerGlobalAddress(MFI, Op, DAG);
3641  }
3642  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3643  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
3644  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
3645  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
3647  return lowerINSERT_VECTOR_ELT(Op, DAG);
3649  return lowerEXTRACT_VECTOR_ELT(Op, DAG);
3650  case ISD::BUILD_VECTOR:
3651  return lowerBUILD_VECTOR(Op, DAG);
3652  case ISD::FP_ROUND:
3653  return lowerFP_ROUND(Op, DAG);
3654  case ISD::TRAP:
3655  return lowerTRAP(Op, DAG);
3656  case ISD::DEBUGTRAP:
3657  return lowerDEBUGTRAP(Op, DAG);
3658  case ISD::FABS:
3659  case ISD::FNEG:
3660  case ISD::FCANONICALIZE:
3661  return splitUnaryVectorOp(Op, DAG);
3662  case ISD::FMINNUM:
3663  case ISD::FMAXNUM:
3664  return lowerFMINNUM_FMAXNUM(Op, DAG);
3665  case ISD::SHL:
3666  case ISD::SRA:
3667  case ISD::SRL:
3668  case ISD::ADD:
3669  case ISD::SUB:
3670  case ISD::MUL:
3671  case ISD::SMIN:
3672  case ISD::SMAX:
3673  case ISD::UMIN:
3674  case ISD::UMAX:
3675  case ISD::FADD:
3676  case ISD::FMUL:
3677  case ISD::FMINNUM_IEEE:
3678  case ISD::FMAXNUM_IEEE:
3679  return splitBinaryVectorOp(Op, DAG);
3680  }
3681  return SDValue();
3682 }
3683 
3685  const SDLoc &DL,
3686  SelectionDAG &DAG, bool Unpacked) {
3687  if (!LoadVT.isVector())
3688  return Result;
3689 
3690  if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3691  // Truncate to v2i16/v4i16.
3692  EVT IntLoadVT = LoadVT.changeTypeToInteger();
3693 
3694  // Work around the legalizer not scalarizing the truncate after vector op
3695  // legalization by not creating an intermediate vector truncate.
3697  DAG.ExtractVectorElements(Result, Elts);
3698  for (SDValue &Elt : Elts)
3699  Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3700 
3701  Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3702 
3703  // Bitcast to original type (v2f16/v4f16).
3704  return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3705  }
3706 
3707  // Cast back to the original packed type.
3708  return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3709 }
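// Illustrative example (hedged): on an unpacked-D16 subtarget a d16 load that
// produces v2i32 <0x0000AAAA, 0x0000BBBB> is rebuilt above as
// v2i16 <0xAAAA, 0xBBBB> by truncating each element, then bitcast back to the
// requested v2f16; on packed subtargets only the final bitcast is performed.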
3710 
3711 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3712  MemSDNode *M,
3713  SelectionDAG &DAG,
3714  ArrayRef<SDValue> Ops,
3715  bool IsIntrinsic) const {
3716  SDLoc DL(M);
3717 
3718  bool Unpacked = Subtarget->hasUnpackedD16VMem();
3719  EVT LoadVT = M->getValueType(0);
3720 
3721  EVT EquivLoadVT = LoadVT;
3722  if (Unpacked && LoadVT.isVector()) {
3723  EquivLoadVT = LoadVT.isVector() ?
3725  LoadVT.getVectorNumElements()) : LoadVT;
3726  }
3727 
3728  // Change from v4f16/v2f16 to EquivLoadVT.
3729  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3730 
3731  SDValue Load
3732  = DAG.getMemIntrinsicNode(
3733  IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3734  VTList, Ops, M->getMemoryVT(),
3735  M->getMemOperand());
3736  if (!Unpacked) // Just adjusted the opcode.
3737  return Load;
3738 
3739  SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
3740 
3741  return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
3742 }
3743 
3745  SDNode *N, SelectionDAG &DAG) {
3746  EVT VT = N->getValueType(0);
3747  const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
3748  if (!CD)
3749  return DAG.getUNDEF(VT);
3750 
3751  int CondCode = CD->getSExtValue();
3752  if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
3753  CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
3754  return DAG.getUNDEF(VT);
3755 
3756  ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
3757 
3758 
3759  SDValue LHS = N->getOperand(1);
3760  SDValue RHS = N->getOperand(2);
3761 
3762  SDLoc DL(N);
3763 
3764  EVT CmpVT = LHS.getValueType();
3765  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
3766  unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
3768  LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
3769  RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
3770  }
3771 
3772  ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
3773 
3774  return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
3775  DAG.getCondCode(CCOpcode));
3776 }
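// For illustration (hedged; intrinsic name mangling omitted): a call such as
//   %mask = call i64 @llvm.amdgcn.icmp(i32 %a, i32 %b, i32 32)
// with 32 == ICmpInst::ICMP_EQ is lowered above to an AMDGPUISD::SETCC node
// producing a lane-mask result, while an out-of-range condition-code operand
// yields undef.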
3777 
3779  SDNode *N, SelectionDAG &DAG) {
3780  EVT VT = N->getValueType(0);
3781  const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
3782  if (!CD)
3783  return DAG.getUNDEF(VT);
3784 
3785  int CondCode = CD->getSExtValue();
3786  if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
3787  CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
3788  return DAG.getUNDEF(VT);
3789  }
3790 
3791  SDValue Src0 = N->getOperand(1);
3792  SDValue Src1 = N->getOperand(2);
3793  EVT CmpVT = Src0.getValueType();
3794  SDLoc SL(N);
3795 
3796  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
3797  Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
3798  Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
3799  }
3800 
3801  FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
3802  ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
3803  return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
3804  Src1, DAG.getCondCode(CCOpcode));
3805 }
3806 
3809  SelectionDAG &DAG) const {
3810  switch (N->getOpcode()) {
3811  case ISD::INSERT_VECTOR_ELT: {
3812  if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3813  Results.push_back(Res);
3814  return;
3815  }
3816  case ISD::EXTRACT_VECTOR_ELT: {
3817  if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3818  Results.push_back(Res);
3819  return;
3820  }
3821  case ISD::INTRINSIC_WO_CHAIN: {
3822  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3823  switch (IID) {
3824  case Intrinsic::amdgcn_cvt_pkrtz: {
3825  SDValue Src0 = N->getOperand(1);
3826  SDValue Src1 = N->getOperand(2);
3827  SDLoc SL(N);
3829  Src0, Src1);
3830  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3831  return;
3832  }
3833  case Intrinsic::amdgcn_cvt_pknorm_i16:
3834  case Intrinsic::amdgcn_cvt_pknorm_u16:
3835  case Intrinsic::amdgcn_cvt_pk_i16:
3836  case Intrinsic::amdgcn_cvt_pk_u16: {
3837  SDValue Src0 = N->getOperand(1);
3838  SDValue Src1 = N->getOperand(2);
3839  SDLoc SL(N);
3840  unsigned Opcode;
3841 
3842  if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3843  Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
3844  else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3845  Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
3846  else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3847  Opcode = AMDGPUISD::CVT_PK_I16_I32;
3848  else
3849  Opcode = AMDGPUISD::CVT_PK_U16_U32;
3850 
3851  EVT VT = N->getValueType(0);
3852  if (isTypeLegal(VT))
3853  Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
3854  else {
3855  SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3856  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3857  }
3858  return;
3859  }
3860  }
3861  break;
3862  }
3863  case ISD::INTRINSIC_W_CHAIN: {
3864  if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
3865  Results.push_back(Res);
3866  Results.push_back(Res.getValue(1));
3867  return;
3868  }
3869 
3870  break;
3871  }
3872  case ISD::SELECT: {
3873  SDLoc SL(N);
3874  EVT VT = N->getValueType(0);
3875  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3876  SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3877  SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3878 
3879  EVT SelectVT = NewVT;
3880  if (NewVT.bitsLT(MVT::i32)) {
3881  LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3882  RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3883  SelectVT = MVT::i32;
3884  }
3885 
3886  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3887  N->getOperand(0), LHS, RHS);
3888 
3889  if (NewVT != SelectVT)
3890  NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3891  Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3892  return;
3893  }
3894  case ISD::FNEG: {
3895  if (N->getValueType(0) != MVT::v2f16)
3896  break;
3897 
3898  SDLoc SL(N);
3899  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3900 
3901  SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
3902  BC,
3903  DAG.getConstant(0x80008000, SL, MVT::i32));
3904  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3905  return;
3906  }
3907  case ISD::FABS: {
3908  if (N->getValueType(0) != MVT::v2f16)
3909  break;
3910 
3911  SDLoc SL(N);
3912  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3913 
3914  SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
3915  BC,
3916  DAG.getConstant(0x7fff7fff, SL, MVT::i32));
3917  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3918  return;
3919  }
3920  default:
3921  break;
3922  }
3923 }
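// Worked example for the FNEG/FABS cases above (values illustrative): the
// v2f16 vector <1.0, -1.0> is the packed i32 0xBC003C00 (element 0 in the
// low half). XOR with 0x80008000 flips bit 15 of each half, giving
// 0x3C00BC00 == <-1.0, 1.0>, while AND with 0x7fff7fff clears those bits,
// giving 0x3C003C00 == <1.0, 1.0>.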
3924 
3925 /// Helper function for LowerBRCOND
3926 static SDNode *findUser(SDValue Value, unsigned Opcode) {
3927 
3928  SDNode *Parent = Value.getNode();
3929  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3930  I != E; ++I) {
3931 
3932  if (I.getUse().get() != Value)
3933  continue;
3934 
3935  if (I->getOpcode() == Opcode)
3936  return *I;
3937  }
3938  return nullptr;
3939 }
3940 
3941 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
3942  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
3943  switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
3944  case Intrinsic::amdgcn_if:
3945  return AMDGPUISD::IF;
3946  case Intrinsic::amdgcn_else:
3947  return AMDGPUISD::ELSE;
3948  case Intrinsic::amdgcn_loop:
3949  return AMDGPUISD::LOOP;
3950  case Intrinsic::amdgcn_end_cf:
3951  llvm_unreachable("should not occur");
3952  default:
3953  return 0;
3954  }
3955  }
3956 
3957  // break, if_break, else_break are all only used as inputs to loop, not
3958  // directly as branch conditions.
3959  return 0;
3960 }
3961 
3962 void SITargetLowering::createDebuggerPrologueStackObjects(
3963  MachineFunction &MF) const {
3964  // Create stack objects that are used for emitting debugger prologue.
3965  //
3966  // Debugger prologue writes work group IDs and work item IDs to scratch memory
3967  // at a fixed location in the following format:
3968  // offset 0: work group ID x
3969  // offset 4: work group ID y
3970  // offset 8: work group ID z
3971  // offset 16: work item ID x
3972  // offset 20: work item ID y
3973  // offset 24: work item ID z
3974  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3975  int ObjectIdx = 0;
3976 
3977  // For each dimension:
3978  for (unsigned i = 0; i < 3; ++i) {
3979  // Create fixed stack object for work group ID.
3980  ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
3981  Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
3982  // Create fixed stack object for work item ID.
3983  ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
3984  Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
3985  }
3986 }
3987 
3988 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
3989  const Triple &TT = getTargetMachine().getTargetTriple();
3990  return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3991  GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
3992  AMDGPU::shouldEmitConstantsToTextSection(TT);
3993 }
3994 
3995 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
3996  return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
3997  GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3998  GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
3999  !shouldEmitFixup(GV) &&
4000  !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
4001 }
4002 
4003 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
4004  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
4005 }
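// Taken together (summary only): constant-address globals that are emitted
// into the text section are addressed with a direct fixup, global- or
// constant-address globals that are not known to be DSO-local go through
// the GOT, and everything else is addressed PC-relative.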
4006 
4007 /// This transforms the control flow intrinsics to get the branch destination as
4008 /// the last parameter, and also switches the branch target with BR if the need arises.
4009 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
4010  SelectionDAG &DAG) const {
4011  SDLoc DL(BRCOND);
4012 
4013  SDNode *Intr = BRCOND.getOperand(1).getNode();
4014  SDValue Target = BRCOND.getOperand(2);
4015  SDNode *BR = nullptr;
4016  SDNode *SetCC = nullptr;
4017 
4018  if (Intr->getOpcode() == ISD::SETCC) {
4019  // As long as we negate the condition everything is fine
4020  SetCC = Intr;
4021  Intr = SetCC->getOperand(0).getNode();
4022 
4023  } else {
4024  // Get the target from BR if we don't negate the condition
4025  BR = findUser(BRCOND, ISD::BR);
4026  Target = BR->getOperand(1);
4027  }
4028 
4029  // FIXME: This changes the types of the intrinsics instead of introducing new
4030  // nodes with the correct types.
4031  // e.g. llvm.amdgcn.loop
4032 
4033  // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
4034  // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
4035 
4036  unsigned CFNode = isCFIntrinsic(Intr);
4037  if (CFNode == 0) {
4038  // This is a uniform branch so we don't need to legalize.
4039  return BRCOND;
4040  }
4041 
4042  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
4043  Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
4044 
4045  assert(!SetCC ||
4046  (SetCC->getConstantOperandVal(1) == 1 &&
4047  cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
4048  ISD::SETNE));
4049 
4050  // operands of the new intrinsic call
4051  SmallVector<SDValue, 4> Ops;
4052  if (HaveChain)
4053  Ops.push_back(BRCOND.getOperand(0));
4054 
4055  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
4056  Ops.push_back(Target);
4057 
4058  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
4059 
4060  // build the new intrinsic call
4061  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
4062 
4063  if (!HaveChain) {
4064  SDValue Ops[] = {
4065  SDValue(Result, 0),
4066  BRCOND.getOperand(0)
4067  };
4068 
4069  Result = DAG.getMergeValues(Ops, DL).getNode();
4070  }
4071 
4072  if (BR) {
4073  // Give the branch instruction our target
4074  SDValue Ops[] = {
4075  BR->getOperand(0),
4076  BRCOND.getOperand(2)
4077  };
4078  SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
4079  DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
4080  BR = NewBR.getNode();
4081  }
4082 
4083  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
4084 
4085  // Copy the intrinsic results to registers
4086  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
4087  SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
4088  if (!CopyToReg)
4089  continue;
4090 
4091  Chain = DAG.getCopyToReg(
4092  Chain, DL,
4093  CopyToReg->getOperand(1),
4094  SDValue(Result, i - 1),
4095  SDValue());
4096 
4097  DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
4098  }
4099 
4100  // Remove the old intrinsic from the chain
4101  DAG.ReplaceAllUsesOfValueWith(
4102  SDValue(Intr, Intr->getNumValues() - 1),
4103  Intr->getOperand(0));
4104 
4105  return Chain;
4106 }
4107 
4108 SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
4109  SDValue Op,
4110  const SDLoc &DL,
4111  EVT VT) const {
4112  return Op.getValueType().bitsLE(VT) ?
4113  DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
4114  DAG.getNode(ISD::FTRUNC, DL, VT, Op);
4115 }
4116 
4117 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
4118  assert(Op.getValueType() == MVT::f16 &&
4119  "Do not know how to custom lower FP_ROUND for non-f16 type");
4120 
4121  SDValue Src = Op.getOperand(0);
4122  EVT SrcVT = Src.getValueType();
4123  if (SrcVT != MVT::f64)
4124  return Op;
4125 
4126  SDLoc DL(Op);
4127 
4128  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
4129  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
4130  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
4131 }
4132 
4133 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
4134  SelectionDAG &DAG) const {
4135  EVT VT = Op.getValueType();
4136  bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
4137 
4138  // FIXME: Assert during selection that this is only selected for
4139  // ieee_mode. Currently a combine can produce the ieee version for non-ieee
4140  // mode functions, but this happens to be OK since it's only done in cases
4141  // where it is known there are no sNaNs.
4142  if (IsIEEEMode)
4143  return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
4144 
4145  if (VT == MVT::v4f16)
4146  return splitBinaryVectorOp(Op, DAG);
4147  return Op;
4148 }
4149 
4150 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
4151  SDLoc SL(Op);
4152  SDValue Chain = Op.getOperand(0);
4153 
4154  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4155  !Subtarget->isTrapHandlerEnabled())
4156  return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
4157 
4158  MachineFunction &MF = DAG.getMachineFunction();
4159  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4160  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4161  assert(UserSGPR != AMDGPU::NoRegister);
4162  SDValue QueuePtr = CreateLiveInRegister(
4163  DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4164  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
4165  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
4166  QueuePtr, SDValue());
4167  SDValue Ops[] = {
4168  ToReg,
4169  DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
4170  SGPR01,
4171  ToReg.getValue(1)
4172  };
4173  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4174 }
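// Rough shape of the result: without an enabled HSA trap handler the trap
// simply terminates the wave via AMDGPUISD::ENDPGM; with one, the queue
// pointer user SGPR is copied into SGPR0_SGPR1 and an AMDGPUISD::TRAP node
// carrying the LLVM trap ID is emitted, which is selected to s_trap.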
4175 
4176 SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
4177  SDLoc SL(Op);
4178  SDValue Chain = Op.getOperand(0);
4179  MachineFunction &MF = DAG.getMachineFunction();
4180 
4181  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4182  !Subtarget->isTrapHandlerEnabled()) {
4183  DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
4184  "debugtrap handler not supported",
4185  Op.getDebugLoc(),
4186  DS_Warning);
4187  LLVMContext &Ctx = MF.getFunction().getContext();
4188  Ctx.diagnose(NoTrap);
4189  return Chain;
4190  }
4191 
4192  SDValue Ops[] = {
4193  Chain,
4194  DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
4195  };
4196  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4197 }
4198 
4199 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
4200  SelectionDAG &DAG) const {
4201  // FIXME: Use inline constants (src_{shared, private}_base) instead.
4202  if (Subtarget->hasApertureRegs()) {
4203  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
4204  AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
4205  AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
4206  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
4207  AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
4208  AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
4209  unsigned Encoding =
4210  AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
4211  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
4212  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
4213 
4214  SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
4215  SDValue ApertureReg = SDValue(
4216  DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
4217  SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
4218  return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
4219  }
4220 
4221  MachineFunction &MF = DAG.getMachineFunction();
4222  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4223  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4224  assert(UserSGPR != AMDGPU::NoRegister);
4225 
4226  SDValue QueuePtr = CreateLiveInRegister(
4227  DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4228 
4229  // Offset into amd_queue_t for group_segment_aperture_base_hi /
4230  // private_segment_aperture_base_hi.
4231  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
4232 
4233  SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
4234 
4235  // TODO: Use custom target PseudoSourceValue.
4236  // TODO: We should use the value from the IR intrinsic call, but it might not
4237  // be available, and how do we get it?
4238  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
4239  AMDGPUAS::CONSTANT_ADDRESS));
4240 
4241  MachinePointerInfo PtrInfo(V, StructOffset);
4242  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
4243  MinAlign(64, StructOffset),
4244  MachineMemOperand::MODereferenceable |
4245  MachineMemOperand::MOInvariant);
4246 }
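// Illustrative numbers: with aperture registers the aperture base is read
// with s_getreg_b32 from the MEM_BASES hardware register and shifted left
// by WidthM1 + 1 bits to form the high half of the segment base; without
// them it is loaded from the amd_queue_t at offset 0x40 (group segment) or
// 0x44 (private segment) relative to the queue pointer.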
4247 
4248 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4249  SelectionDAG &DAG) const {
4250  SDLoc SL(Op);
4251  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4252 
4253  SDValue Src = ASC->getOperand(0);
4254  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4255 
4256  const AMDGPUTargetMachine &TM =
4257  static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4258 
4259  // flat -> local/private
4260  if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4261  unsigned DestAS = ASC->getDestAddressSpace();
4262 
4263  if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
4264  DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
4265  unsigned NullVal = TM.getNullPointerValue(DestAS);
4266  SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4267  SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4268  SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4269 
4270  return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4271  NonNull, Ptr, SegmentNullPtr);
4272  }
4273  }
4274 
4275  // local/private -> flat
4276  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4277  unsigned SrcAS = ASC->getSrcAddressSpace();
4278 
4279  if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
4280  SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
4281  unsigned NullVal = TM.getNullPointerValue(SrcAS);
4282  SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4283 
4284  SDValue NonNull
4285  = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4286 
4287  SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
4288  SDValue CvtPtr
4289  = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4290 
4291  return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4292  DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4293  FlatNullPtr);
4294  }
4295  }
4296 
4297  // global <-> flat are no-ops and never emitted.
4298 
4299  const MachineFunction &MF = DAG.getMachineFunction();
4300  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
4301  MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
4302  DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4303 
4304  return DAG.getUNDEF(ASC->getValueType(0));
4305 }
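// Sketch of the two directions above: flat -> local/private compares the
// source against the 64-bit flat null, truncates the pointer to 32 bits,
// and selects the segment null when the source was null; local/private ->
// flat pairs the 32-bit pointer with the segment aperture as (lo, hi),
// bitcasts the pair to i64, and selects the flat null for a null input.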
4306 
4307 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4308  SelectionDAG &DAG) const {
4309  SDValue Vec = Op.getOperand(0);
4310  SDValue InsVal = Op.getOperand(1);
4311  SDValue Idx = Op.getOperand(2);
4312  EVT VecVT = Vec.getValueType();
4313  EVT EltVT = VecVT.getVectorElementType();
4314  unsigned VecSize = VecVT.getSizeInBits();
4315  unsigned EltSize = EltVT.getSizeInBits();
4316 
4317 
4318  assert(VecSize <= 64);
4319 
4320  unsigned NumElts = VecVT.getVectorNumElements();
4321  SDLoc SL(Op);
4322  auto KIdx = dyn_cast<ConstantSDNode>(Idx);
4323 
4324  if (NumElts == 4 && EltSize == 16 && KIdx) {
4325  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
4326 
4327  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4328  DAG.getConstant(0, SL, MVT::i32));
4329  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4330  DAG.getConstant(1, SL, MVT::i32));
4331 
4332  SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
4333  SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
4334 
4335  unsigned Idx = KIdx->getZExtValue();
4336  bool InsertLo = Idx < 2;
4337  SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
4338  InsertLo ? LoVec : HiVec,
4339  DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
4340  DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
4341 
4342  InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
4343 
4344  SDValue Concat = InsertLo ?
4345  DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
4346  DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
4347 
4348  return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
4349  }
4350 
4351  if (isa<ConstantSDNode>(Idx))
4352  return SDValue();
4353 
4354  MVT IntVT = MVT::getIntegerVT(VecSize);
4355 
4356  // Avoid stack access for dynamic indexing.
4357  SDValue Val = InsVal;
4358  if (InsVal.getValueType() == MVT::f16)
4359  Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
4360 
4361  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4362  SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
4363 
4364  assert(isPowerOf2_32(EltSize));
4365  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4366 
4367  // Convert vector index to bit-index.
4368  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4369 
4370  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4371  SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
4372  DAG.getConstant(0xffff, SL, IntVT),
4373  ScaledIdx);
4374 
4375  SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
4376  SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
4377  DAG.getNOT(SL, BFM, IntVT), BCVec);
4378 
4379  SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
4380  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
4381 }
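// For the dynamic-index path above, the intent is a BFI-style blend
// (illustrative): the vector is viewed as a VecSize-bit integer, the index
// is scaled to a bit offset (Idx * EltSize), and the mask
// 0xffff << bitoffset selects the destination element's bits so the new
// value can be merged without going through a stack slot.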
4382 
4383 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4384  SelectionDAG &DAG) const {
4385  SDLoc SL(Op);
4386 
4387  EVT ResultVT = Op.getValueType();
4388  SDValue Vec = Op.getOperand(0);
4389  SDValue Idx = Op.getOperand(1);
4390  EVT VecVT = Vec.getValueType();
4391  unsigned VecSize = VecVT.getSizeInBits();
4392  EVT EltVT = VecVT.getVectorElementType();
4393  assert(VecSize <= 64);
4394 
4395  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4396 
4397  // Make sure we do any optimizations that will make it easier to fold
4398  // source modifiers before obscuring it with bit operations.
4399 
4400  // XXX - Why doesn't this get called when vector_shuffle is expanded?
4401  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4402  return Combined;
4403 
4404  unsigned EltSize = EltVT.getSizeInBits();
4405  assert(isPowerOf2_32(EltSize));
4406 
4407  MVT IntVT = MVT::getIntegerVT(VecSize);
4408  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4409 
4410  // Convert vector index to bit-index (* EltSize)
4411  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4412 
4413  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4414  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
4415 
4416  if (ResultVT == MVT::f16) {
4417  SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
4418  return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4419  }
4420 
4421  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
4422 }
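// Example (illustrative): extracting element 1 of a v2f16 bitcasts the
// vector to i32, shifts right by 1 * 16 bits, truncates to i16 and bitcasts
// back to f16; for other result types the shifted value is any-extended or
// truncated to the requested width.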
4423 
4424 SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
4425  SelectionDAG &DAG) const {
4426  SDLoc SL(Op);
4427  EVT VT = Op.getValueType();
4428 
4429  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4430  EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
4431 
4432  // Turn into pair of packed build_vectors.
4433  // TODO: Special case for constants that can be materialized with s_mov_b64.
4434  SDValue Lo = DAG.getBuildVector(HalfVT, SL,
4435  { Op.getOperand(0), Op.getOperand(1) });
4436  SDValue Hi = DAG.getBuildVector(HalfVT, SL,
4437  { Op.getOperand(2), Op.getOperand(3) });
4438 
4439  SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
4440  SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
4441 
4442  SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
4443  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
4444  }
4445 
4446  assert(VT == MVT::v2f16 || VT == MVT::v2i16);
4447  assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
4448 
4449