LLVM 9.0.0svn
SIISelLowering.cpp
1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Custom DAG lowering for SI
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // Provide M_PI.
16 #define _USE_MATH_DEFINES
17 #endif
18 
19 #include "SIISelLowering.h"
20 #include "AMDGPU.h"
21 #include "AMDGPUSubtarget.h"
22 #include "AMDGPUTargetMachine.h"
23 #include "SIDefines.h"
24 #include "SIInstrInfo.h"
25 #include "SIMachineFunctionInfo.h"
26 #include "SIRegisterInfo.h"
28 #include "Utils/AMDGPUBaseInfo.h"
29 #include "llvm/ADT/APFloat.h"
30 #include "llvm/ADT/APInt.h"
31 #include "llvm/ADT/ArrayRef.h"
32 #include "llvm/ADT/BitVector.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
35 #include "llvm/ADT/StringRef.h"
36 #include "llvm/ADT/StringSwitch.h"
37 #include "llvm/ADT/Twine.h"
38 #include "llvm/CodeGen/Analysis.h"
56 #include "llvm/IR/Constants.h"
57 #include "llvm/IR/DataLayout.h"
58 #include "llvm/IR/DebugLoc.h"
59 #include "llvm/IR/DerivedTypes.h"
60 #include "llvm/IR/DiagnosticInfo.h"
61 #include "llvm/IR/Function.h"
62 #include "llvm/IR/GlobalValue.h"
63 #include "llvm/IR/InstrTypes.h"
64 #include "llvm/IR/Instruction.h"
65 #include "llvm/IR/Instructions.h"
66 #include "llvm/IR/IntrinsicInst.h"
67 #include "llvm/IR/Type.h"
68 #include "llvm/Support/Casting.h"
69 #include "llvm/Support/CodeGen.h"
71 #include "llvm/Support/Compiler.h"
73 #include "llvm/Support/KnownBits.h"
77 #include <cassert>
78 #include <cmath>
79 #include <cstdint>
80 #include <iterator>
81 #include <tuple>
82 #include <utility>
83 #include <vector>
84 
85 using namespace llvm;
86 
87 #define DEBUG_TYPE "si-lower"
88 
89 STATISTIC(NumTailCalls, "Number of tail calls");
90 
92  "amdgpu-vgpr-index-mode",
93  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
94  cl::init(false));
95 
97  "amdgpu-frame-index-zero-bits",
98  cl::desc("High bits of frame index assumed to be zero"),
99  cl::init(5),
101 
102 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
103  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
104  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
105  if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
106  return AMDGPU::SGPR0 + Reg;
107  }
108  }
109  llvm_unreachable("Cannot allocate sgpr");
110 }
111 
113  const GCNSubtarget &STI)
114  : AMDGPUTargetLowering(TM, STI),
115  Subtarget(&STI) {
116  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
117  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
118 
119  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
120  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
121 
122  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
123  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
124  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
125 
126  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
127  addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
128 
129  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
130  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
131 
132  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
133  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
134 
135  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
136  addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
137 
138  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
139  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
140 
141  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
142  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
143 
144  if (Subtarget->has16BitInsts()) {
145  addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
146  addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
147 
148  // Unless there are also VOP3P operations, no operations are really legal.
149  addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
150  addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
151  addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
152  addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
153  }
154 
156 
157  // We need to custom lower vector stores from local memory
166 
175 
186 
189 
194 
200 
205 
208 
216 
224 
231 
238 
245 
248 
251 
255 
256 #if 0
259 #endif
260 
261  // We only support LOAD/STORE and vector manipulation ops for vectors
262  // with > 4 elements.
265  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
266  switch (Op) {
267  case ISD::LOAD:
268  case ISD::STORE:
269  case ISD::BUILD_VECTOR:
270  case ISD::BITCAST:
276  break;
277  case ISD::CONCAT_VECTORS:
279  break;
280  default:
282  break;
283  }
284  }
285  }
286 
288 
289  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
290  // is expanded to avoid having two separate loops in case the index is a VGPR.
291 
292  // Most operations are naturally 32-bit vector operations. We only support
293  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
294  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
297 
300 
303 
306  }
307 
312 
315 
316  // Avoid stack access for these.
317  // TODO: Generalize to more vector types.
322 
328 
332 
337 
338  // Deal with vec3 vector operations when widened to vec4.
343 
344  // Deal with vec5 vector operations when widened to vec8.
349 
350  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
351  // and output demarshalling
354 
355  // We can't return success/failure, only the old value,
356  // let LLVM add the comparison
359 
360  if (Subtarget->hasFlatAddressSpace()) {
363  }
364 
367 
368  // On SI this is s_memtime; on VI it is s_memrealtime.
372 
373  if (Subtarget->has16BitInsts()) {
377  }
378 
379  // v_mad_f32 does not support denormals according to some sources.
380  if (!Subtarget->hasFP32Denormals())
382 
383  if (!Subtarget->hasBFI()) {
384  // fcopysign can be done in a single instruction with BFI.
387  }
388 
389  if (!Subtarget->hasBCNT(32))
391 
392  if (!Subtarget->hasBCNT(64))
394 
395  if (Subtarget->hasFFBH())
397 
398  if (Subtarget->hasFFBL())
400 
401  // We only really have 32-bit BFE instructions (and 16-bit on VI).
402  //
403  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
404  // effort to match them now. We want this to be false for i64 cases when the
405  // extraction isn't restricted to the upper or lower half. Ideally we would
406  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
407  // span the midpoint are probably relatively rare, so don't worry about them
408  // for now.
409  if (Subtarget->hasBFE())
410  setHasExtractBitsInsn(true);
411 
416 
417 
418  // These are really only legal for ieee_mode functions. We should be avoiding
419  // them for functions that don't have ieee_mode enabled, so just say they are
420  // legal.
425 
426 
427  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
431  } else {
436  }
437 
439 
444 
445  if (Subtarget->has16BitInsts()) {
447 
450 
453 
456 
459 
464 
467 
473 
475 
477 
479 
481 
486 
491 
492  // F16 - Constant Actions.
494 
495  // F16 - Load/Store Actions.
500 
501  // F16 - VOP1 Actions.
510 
511  // F16 - VOP2 Actions.
514 
516 
517  // F16 - VOP3 Actions.
519  if (!Subtarget->hasFP16Denormals())
521 
522  for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
523  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
524  switch (Op) {
525  case ISD::LOAD:
526  case ISD::STORE:
527  case ISD::BUILD_VECTOR:
528  case ISD::BITCAST:
534  break;
535  case ISD::CONCAT_VECTORS:
537  break;
538  default:
540  break;
541  }
542  }
543  }
544 
545  // XXX - Do these do anything? Vector constants turn into build_vector.
548 
551 
556 
561 
568 
573 
578 
583 
587 
588  if (!Subtarget->hasVOP3PInsts()) {
591  }
592 
594  // This isn't really legal, but this avoids the legalizer unrolling it (and
595  // allows matching fneg (fabs x) patterns)
597 
602 
605 
608  }
609 
610  if (Subtarget->hasVOP3PInsts()) {
621 
625 
628 
630 
633 
640 
645 
648 
651 
655 
659  }
660 
663 
664  if (Subtarget->has16BitInsts()) {
669  } else {
670  // Legalization hack.
673 
676  }
677 
680  }
681 
709 
710  // All memory operations. Some folding on the pointer operand is done to help
711  // matching the constant offsets in the addressing modes.
730 
732 
733  // SI at least has hardware support for floating point exceptions, but no way
734  // of using or handling them is implemented. They are also optional in OpenCL
735  // (Section 7.3)
737 }
738 
740  return Subtarget;
741 }
742 
743 //===----------------------------------------------------------------------===//
744 // TargetLowering queries
745 //===----------------------------------------------------------------------===//
746 
747 // v_mad_mix* support a conversion from f16 to f32.
748 //
749 // There is one special case, when denormals are enabled, where this would
750 // still be OK to use, but we don't currently handle it.
752  EVT DestVT, EVT SrcVT) const {
753  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
754  (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
755  DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
756  SrcVT.getScalarType() == MVT::f16;
757 }
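// Illustrative example (hedged, not from the upstream file): with the mad-mix /
// fma-mix instructions available and FP32 denormals disabled, a pattern such as
//   %x.ext = fpext half %x to float
//   %r     = call float @llvm.fma.f32(float %x.ext, float %y, float %z)
// can keep the f16->f32 conversion folded into the mixed-precision
// multiply-add (selected to something like v_mad_mix_f32 / v_fma_mix_f32),
// because this hook reports the extension as foldable for f16 sources that
// produce f32 results.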
758 
760  // SI has some legal vector types, but no legal vector operations. Say no
761  // shuffles are legal in order to prefer scalarizing some vector operations.
762  return false;
763 }
764 
766  CallingConv::ID CC,
767  EVT VT) const {
768  // TODO: Consider splitting all arguments into 32-bit pieces.
769  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
770  EVT ScalarVT = VT.getScalarType();
771  unsigned Size = ScalarVT.getSizeInBits();
772  if (Size == 32)
773  return ScalarVT.getSimpleVT();
774 
775  if (Size == 64)
776  return MVT::i32;
777 
778  if (Size == 16 && Subtarget->has16BitInsts())
779  return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
780  }
781 
782  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
783 }
784 
786  CallingConv::ID CC,
787  EVT VT) const {
788  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
789  unsigned NumElts = VT.getVectorNumElements();
790  EVT ScalarVT = VT.getScalarType();
791  unsigned Size = ScalarVT.getSizeInBits();
792 
793  if (Size == 32)
794  return NumElts;
795 
796  if (Size == 64)
797  return 2 * NumElts;
798 
799  if (Size == 16 && Subtarget->has16BitInsts())
800  return (VT.getVectorNumElements() + 1) / 2;
801  }
802 
803  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
804 }
805 
808  EVT VT, EVT &IntermediateVT,
809  unsigned &NumIntermediates, MVT &RegisterVT) const {
810  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
811  unsigned NumElts = VT.getVectorNumElements();
812  EVT ScalarVT = VT.getScalarType();
813  unsigned Size = ScalarVT.getSizeInBits();
814  if (Size == 32) {
815  RegisterVT = ScalarVT.getSimpleVT();
816  IntermediateVT = RegisterVT;
817  NumIntermediates = NumElts;
818  return NumIntermediates;
819  }
820 
821  if (Size == 64) {
822  RegisterVT = MVT::i32;
823  IntermediateVT = RegisterVT;
824  NumIntermediates = 2 * NumElts;
825  return NumIntermediates;
826  }
827 
828  // FIXME: We should fix the ABI to be the same on targets without 16-bit
829  // support, but unless we can properly handle 3-vectors, it will still be
830  // inconsistent.
831  if (Size == 16 && Subtarget->has16BitInsts()) {
832  RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
833  IntermediateVT = RegisterVT;
834  NumIntermediates = (NumElts + 1) / 2;
835  return NumIntermediates;
836  }
837  }
838 
840  Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
841 }
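// Worked example (illustrative): for a non-kernel calling convention, a v3f64
// argument has a 64-bit scalar type, so it is broken into RegisterVT = i32
// with NumIntermediates = 2 * 3 = 6 registers. A v3f16 argument on a
// subtarget with 16-bit instructions is instead packed into
// (3 + 1) / 2 = 2 v2f16 registers.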
842 
844  // Only limited forms of aggregate type currently expected.
845  assert(Ty->isStructTy() && "Expected struct type");
846 
847 
848  Type *ElementType = nullptr;
849  unsigned NumElts;
850  if (Ty->getContainedType(0)->isVectorTy()) {
851  VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
852  ElementType = VecComponent->getElementType();
853  NumElts = VecComponent->getNumElements();
854  } else {
855  ElementType = Ty->getContainedType(0);
856  NumElts = 1;
857  }
858 
859  assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");
860 
861  // Calculate the size of the memVT type from the aggregate
862  unsigned Pow2Elts = 0;
863  unsigned ElementSize;
864  switch (ElementType->getTypeID()) {
865  default:
866  llvm_unreachable("Unknown type!");
867  case Type::IntegerTyID:
868  ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
869  break;
870  case Type::HalfTyID:
871  ElementSize = 16;
872  break;
873  case Type::FloatTyID:
874  ElementSize = 32;
875  break;
876  }
877  unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
878  Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
879 
880  return MVT::getVectorVT(MVT::getVT(ElementType, false),
881  Pow2Elts);
882 }
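// Worked example (illustrative): for an aggregate return type
// { <3 x half>, i32 }, ElementType is half (ElementSize = 16), NumElts = 3
// and AdditionalElts = 2, so Pow2Elts = 1 << Log2_32_Ceil(3 + 2) = 8 and the
// computed memVT is v8f16. For { <2 x float>, i32 }, ElementSize = 32 and
// AdditionalElts = 1, giving Pow2Elts = 1 << Log2_32_Ceil(3) = 4, i.e. v4f32.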
883 
885  const CallInst &CI,
886  MachineFunction &MF,
887  unsigned IntrID) const {
888  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
889  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
891  (Intrinsic::ID)IntrID);
892  if (Attr.hasFnAttribute(Attribute::ReadNone))
893  return false;
894 
896 
897  if (RsrcIntr->IsImage) {
898  Info.ptrVal = MFI->getImagePSV(
900  CI.getArgOperand(RsrcIntr->RsrcArg));
901  Info.align = 0;
902  } else {
903  Info.ptrVal = MFI->getBufferPSV(
905  CI.getArgOperand(RsrcIntr->RsrcArg));
906  }
907 
909  if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
911  Info.memVT = MVT::getVT(CI.getType(), true);
912  if (Info.memVT == MVT::Other) {
913  // Some intrinsics return an aggregate type - special case to work out
914  // the correct memVT
915  Info.memVT = memVTFromAggregate(CI.getType());
916  }
918  } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
919  Info.opc = ISD::INTRINSIC_VOID;
920  Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
922  } else {
923  // Atomic
925  Info.memVT = MVT::getVT(CI.getType());
929 
930  // XXX - Should this be volatile without known ordering?
932  }
933  return true;
934  }
935 
936  switch (IntrID) {
937  case Intrinsic::amdgcn_atomic_inc:
938  case Intrinsic::amdgcn_atomic_dec:
939  case Intrinsic::amdgcn_ds_ordered_add:
940  case Intrinsic::amdgcn_ds_ordered_swap:
941  case Intrinsic::amdgcn_ds_fadd:
942  case Intrinsic::amdgcn_ds_fmin:
943  case Intrinsic::amdgcn_ds_fmax: {
945  Info.memVT = MVT::getVT(CI.getType());
946  Info.ptrVal = CI.getOperand(0);
947  Info.align = 0;
949 
950  const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
951  if (!Vol->isZero())
953 
954  return true;
955  }
956  case Intrinsic::amdgcn_ds_append:
957  case Intrinsic::amdgcn_ds_consume: {
959  Info.memVT = MVT::getVT(CI.getType());
960  Info.ptrVal = CI.getOperand(0);
961  Info.align = 0;
963 
964  const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
965  if (!Vol->isZero())
967 
968  return true;
969  }
970  default:
971  return false;
972  }
973 }
974 
977  Type *&AccessTy) const {
978  switch (II->getIntrinsicID()) {
979  case Intrinsic::amdgcn_atomic_inc:
980  case Intrinsic::amdgcn_atomic_dec:
981  case Intrinsic::amdgcn_ds_ordered_add:
982  case Intrinsic::amdgcn_ds_ordered_swap:
983  case Intrinsic::amdgcn_ds_fadd:
984  case Intrinsic::amdgcn_ds_fmin:
985  case Intrinsic::amdgcn_ds_fmax: {
986  Value *Ptr = II->getArgOperand(0);
987  AccessTy = II->getType();
988  Ops.push_back(Ptr);
989  return true;
990  }
991  default:
992  return false;
993  }
994 }
995 
996 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
997  if (!Subtarget->hasFlatInstOffsets()) {
998  // Flat instructions do not have offsets, and only have the register
999  // address.
1000  return AM.BaseOffs == 0 && AM.Scale == 0;
1001  }
1002 
1003  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
1004  // the sign bit is ignored and is treated as a 12-bit unsigned offset.
1005 
1006  // Just r + i
1007  return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
1008 }
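// Illustrative examples: with flat instruction offsets available, an
// addressing mode of {BaseReg, BaseOffs = 4092, Scale = 0} is accepted
// because isUInt<12>(4092) holds, while BaseOffs = 4096 is rejected and the
// offset has to be produced separately (e.g. with an explicit add). Without
// flat offsets, only the plain register address (BaseOffs == 0, Scale == 0)
// is legal.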
1009 
1011  if (Subtarget->hasFlatGlobalInsts())
1012  return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
1013 
1014  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1015  // Assume that we will use FLAT for all global memory accesses
1016  // on VI.
1017  // FIXME: This assumption is currently wrong. On VI we still use
1018  // MUBUF instructions for the r + i addressing mode. As currently
1019  // implemented, the MUBUF instructions only work on buffer < 4GB.
1020  // It may be possible to support > 4GB buffers with MUBUF instructions,
1021  // by setting the stride value in the resource descriptor which would
1022  // increase the size limit to (stride * 4GB). However, this is risky,
1023  // because it has never been validated.
1024  return isLegalFlatAddressingMode(AM);
1025  }
1026 
1027  return isLegalMUBUFAddressingMode(AM);
1028 }
1029 
1030 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1031  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1032  // additionally can do r + r + i with addr64. 32-bit has more addressing
1033  // mode options. Depending on the resource constant, it can also do
1034  // (i64 r0) + (i32 r1) * (i14 i).
1035  //
1036  // Private arrays end up using a scratch buffer most of the time, so also
1037  // assume those use MUBUF instructions. Scratch loads / stores are currently
1038  // implemented as mubuf instructions with offen bit set, so slightly
1039  // different than the normal addr64.
1040  if (!isUInt<12>(AM.BaseOffs))
1041  return false;
1042 
1043  // FIXME: Since we can split immediate into soffset and immediate offset,
1044  // would it make sense to allow any immediate?
1045 
1046  switch (AM.Scale) {
1047  case 0: // r + i or just i, depending on HasBaseReg.
1048  return true;
1049  case 1:
1050  return true; // We have r + r or r + i.
1051  case 2:
1052  if (AM.HasBaseReg) {
1053  // Reject 2 * r + r.
1054  return false;
1055  }
1056 
1057  // Allow 2 * r as r + r
1058  // Or 2 * r + i is allowed as r + r + i.
1059  return true;
1060  default: // Don't allow n * r
1061  return false;
1062  }
1063 }
1064 
1066  const AddrMode &AM, Type *Ty,
1067  unsigned AS, Instruction *I) const {
1068  // No global is ever allowed as a base.
1069  if (AM.BaseGV)
1070  return false;
1071 
1072  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1073  return isLegalGlobalAddressingMode(AM);
1074 
1075  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1078  // If the offset isn't a multiple of 4, it probably isn't going to be
1079  // correctly aligned.
1080  // FIXME: Can we get the real alignment here?
1081  if (AM.BaseOffs % 4 != 0)
1082  return isLegalMUBUFAddressingMode(AM);
1083 
1084  // There are no SMRD extloads, so if we have to do a small type access we
1085  // will use a MUBUF load.
1086  // FIXME?: We also need to do this if unaligned, but we don't know the
1087  // alignment here.
1088  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1089  return isLegalGlobalAddressingMode(AM);
1090 
1091  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1092  // SMRD instructions have an 8-bit, dword offset on SI.
1093  if (!isUInt<8>(AM.BaseOffs / 4))
1094  return false;
1095  } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1096  // On CI+, this can also be a 32-bit literal constant offset. If it fits
1097  // in 8-bits, it can use a smaller encoding.
1098  if (!isUInt<32>(AM.BaseOffs / 4))
1099  return false;
1100  } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
1101  // On VI, these use the SMEM format and the offset is a 20-bit byte offset.
1102  if (!isUInt<20>(AM.BaseOffs))
1103  return false;
1104  } else
1105  llvm_unreachable("unhandled generation");
1106 
1107  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1108  return true;
1109 
1110  if (AM.Scale == 1 && AM.HasBaseReg)
1111  return true;
1112 
1113  return false;
1114 
1115  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1116  return isLegalMUBUFAddressingMode(AM);
1117  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1118  AS == AMDGPUAS::REGION_ADDRESS) {
1119  // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1120  // field.
1121  // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1122  // an 8-bit dword offset but we don't know the alignment here.
1123  if (!isUInt<16>(AM.BaseOffs))
1124  return false;
1125 
1126  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1127  return true;
1128 
1129  if (AM.Scale == 1 && AM.HasBaseReg)
1130  return true;
1131 
1132  return false;
1133  } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1135  // For an unknown address space, this usually means that this is for some
1136  // reason being used for pure arithmetic, and not based on some addressing
1137  // computation. We don't have instructions that compute pointers with any
1138  // addressing modes, so treat them as having no offset like flat
1139  // instructions.
1140  return isLegalFlatAddressingMode(AM);
1141  } else {
1142  llvm_unreachable("unhandled address space");
1143  }
1144 }
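// Illustrative examples for constant-address (SMRD) offsets: a dword-aligned
// BaseOffs of 1020 encodes as the dword offset 1020 / 4 = 255, which fits the
// 8-bit field on SI; 1024 (dword offset 256) does not, so that mode is
// rejected on SI but accepted on CI+ (32-bit literal offset) and on VI
// (20-bit byte offset).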
1145 
1146 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1147  const SelectionDAG &DAG) const {
1148  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1149  return (MemVT.getSizeInBits() <= 4 * 32);
1150  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1151  unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1152  return (MemVT.getSizeInBits() <= MaxPrivateBits);
1153  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
1154  return (MemVT.getSizeInBits() <= 2 * 32);
1155  }
1156  return true;
1157 }
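// Illustrative example: in the global or flat address space, stores may be
// merged as long as the combined type is at most 4 * 32 = 128 bits (e.g. four
// adjacent i32 stores becoming one v4i32 store), while local (LDS) merging
// stops at 2 * 32 = 64 bits and private merging is bounded by
// 8 * getMaxPrivateElementSize() bits.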
1158 
1160  unsigned AddrSpace,
1161  unsigned Align,
1162  bool *IsFast) const {
1163  if (IsFast)
1164  *IsFast = false;
1165 
1166  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1167  // which isn't a simple VT.
1168  // Until MVT is extended to handle this, simply check for the size and
1169  // rely on the condition below: allow accesses if the size is a multiple of 4.
1170  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1171  VT.getStoreSize() > 16)) {
1172  return false;
1173  }
1174 
1175  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1176  AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1177  // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1178  // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1179  // with adjacent offsets.
1180  bool AlignedBy4 = (Align % 4 == 0);
1181  if (IsFast)
1182  *IsFast = AlignedBy4;
1183 
1184  return AlignedBy4;
1185  }
1186 
1187  // FIXME: We have to be conservative here and assume that flat operations
1188  // will access scratch. If we had access to the IR function, then we
1189  // could determine if any private memory was used in the function.
1190  if (!Subtarget->hasUnalignedScratchAccess() &&
1191  (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1192  AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
1193  bool AlignedBy4 = Align >= 4;
1194  if (IsFast)
1195  *IsFast = AlignedBy4;
1196 
1197  return AlignedBy4;
1198  }
1199 
1200  if (Subtarget->hasUnalignedBufferAccess()) {
1201  // If we have a uniform constant load, it still requires using a slow
1202  // buffer instruction if unaligned.
1203  if (IsFast) {
1204  *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1205  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1206  (Align % 4 == 0) : true;
1207  }
1208 
1209  return true;
1210  }
1211 
1212  // Smaller-than-dword values must be aligned.
1213  if (VT.bitsLT(MVT::i32))
1214  return false;
1215 
1216  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1217  // byte-address are ignored, thus forcing Dword alignment.
1218  // This applies to private, global, and constant memory.
1219  if (IsFast)
1220  *IsFast = true;
1221 
1222  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1223 }
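// Illustrative examples: an 8-byte LDS access that is only 4-byte aligned is
// still reported as fast, since it can be selected as ds_read2_b32 /
// ds_write2_b32 with adjacent offsets, whereas a 2-byte-aligned i32 access to
// private or flat memory is rejected unless the subtarget supports unaligned
// scratch access.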
1224 
1225 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
1226  unsigned SrcAlign, bool IsMemset,
1227  bool ZeroMemset,
1228  bool MemcpyStrSrc,
1229  MachineFunction &MF) const {
1230  // FIXME: Should account for address space here.
1231 
1232  // The default fallback uses the private pointer size as a guess for a type to
1233  // use. Make sure we switch these to 64-bit accesses.
1234 
1235  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
1236  return MVT::v4i32;
1237 
1238  if (Size >= 8 && DstAlign >= 4)
1239  return MVT::v2i32;
1240 
1241  // Use the default.
1242  return MVT::Other;
1243 }
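// Illustrative example: a 32-byte memcpy with a 4-byte-aligned destination is
// expanded using MVT::v4i32 (roughly two 16-byte operations) rather than the
// default private-pointer-sized type, a 12-byte copy with the same alignment
// uses MVT::v2i32, and anything smaller or less aligned falls back to
// MVT::Other (the generic default).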
1244 
1245 static bool isFlatGlobalAddrSpace(unsigned AS) {
1246  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
1247  AS == AMDGPUAS::FLAT_ADDRESS ||
1250 }
1251 
1253  unsigned DestAS) const {
1254  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
1255 }
1256 
1258  const MemSDNode *MemNode = cast<MemSDNode>(N);
1259  const Value *Ptr = MemNode->getMemOperand()->getValue();
1260  const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
1261  return I && I->getMetadata("amdgpu.noclobber");
1262 }
1263 
1265  unsigned DestAS) const {
1266  // Flat -> private/local is a simple truncate.
1267  // Flat -> global is no-op
1268  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1269  return true;
1270 
1271  return isNoopAddrSpaceCast(SrcAS, DestAS);
1272 }
1273 
1275  const MemSDNode *MemNode = cast<MemSDNode>(N);
1276 
1277  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1278 }
1279 
1282  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
1283  return TypeSplitVector;
1284 
1286 }
1287 
1289  Type *Ty) const {
1290  // FIXME: Could be smarter if called for vector constants.
1291  return true;
1292 }
1293 
1295  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1296  switch (Op) {
1297  case ISD::LOAD:
1298  case ISD::STORE:
1299 
1300  // These operations are done with 32-bit instructions anyway.
1301  case ISD::AND:
1302  case ISD::OR:
1303  case ISD::XOR:
1304  case ISD::SELECT:
1305  // TODO: Extensions?
1306  return true;
1307  default:
1308  return false;
1309  }
1310  }
1311 
1312  // SimplifySetCC uses this function to determine whether or not it should
1313  // create setcc with i1 operands. We don't have instructions for i1 setcc.
1314  if (VT == MVT::i1 && Op == ISD::SETCC)
1315  return false;
1316 
1317  return TargetLowering::isTypeDesirableForOp(Op, VT);
1318 }
1319 
1320 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1321  const SDLoc &SL,
1322  SDValue Chain,
1323  uint64_t Offset) const {
1324  const DataLayout &DL = DAG.getDataLayout();
1325  MachineFunction &MF = DAG.getMachineFunction();
1327 
1328  const ArgDescriptor *InputPtrReg;
1329  const TargetRegisterClass *RC;
1330 
1331  std::tie(InputPtrReg, RC)
1333 
1336  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1337  MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1338 
1339  return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
1340 }
1341 
1342 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1343  const SDLoc &SL) const {
1344  uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1345  FIRST_IMPLICIT);
1346  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1347 }
1348 
1349 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1350  const SDLoc &SL, SDValue Val,
1351  bool Signed,
1352  const ISD::InputArg *Arg) const {
1353  // First, if it is a widened vector, narrow it.
1354  if (VT.isVector() &&
1355  VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1356  EVT NarrowedVT =
1358  VT.getVectorNumElements());
1359  Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1360  DAG.getConstant(0, SL, MVT::i32));
1361  }
1362 
1363  // Then convert the vector elements or scalar value.
1364  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1365  VT.bitsLT(MemVT)) {
1366  unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1367  Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1368  }
1369 
1370  if (MemVT.isFloatingPoint())
1371  Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
1372  else if (Signed)
1373  Val = DAG.getSExtOrTrunc(Val, SL, VT);
1374  else
1375  Val = DAG.getZExtOrTrunc(Val, SL, VT);
1376 
1377  return Val;
1378 }
1379 
1380 SDValue SITargetLowering::lowerKernargMemParameter(
1381  SelectionDAG &DAG, EVT VT, EVT MemVT,
1382  const SDLoc &SL, SDValue Chain,
1383  uint64_t Offset, unsigned Align, bool Signed,
1384  const ISD::InputArg *Arg) const {
1385  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
1387  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1388 
1389  // Try to avoid using an extload by loading earlier than the argument address,
1390  // and extracting the relevant bits. The load should hopefully be merged with
1391  // the previous argument.
1392  if (MemVT.getStoreSize() < 4 && Align < 4) {
1393  // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1394  int64_t AlignDownOffset = alignDown(Offset, 4);
1395  int64_t OffsetDiff = Offset - AlignDownOffset;
1396 
1397  EVT IntVT = MemVT.changeTypeToInteger();
1398 
1399  // TODO: If we passed in the base kernel offset we could have a better
1400  // alignment than 4, but we don't really need it.
1401  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1402  SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1405 
1406  SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1407  SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1408 
1409  SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1410  ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1411  ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1412 
1413 
1414  return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1415  }
1416 
1417  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1418  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
1421 
1422  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1423  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1424 }
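// Worked example (illustrative): an i16 kernel argument at byte offset 6 has
// a 2-byte store size and Align == 2, so rather than emitting a 2-byte
// extending load, the code loads the dword at AlignDownOffset = 4, shifts it
// right by OffsetDiff * 8 = 16 bits and truncates to i16; that dword load can
// then be merged with the load for the argument at offset 4.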
1425 
1426 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1427  const SDLoc &SL, SDValue Chain,
1428  const ISD::InputArg &Arg) const {
1429  MachineFunction &MF = DAG.getMachineFunction();
1430  MachineFrameInfo &MFI = MF.getFrameInfo();
1431 
1432  if (Arg.Flags.isByVal()) {
1433  unsigned Size = Arg.Flags.getByValSize();
1434  int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1435  return DAG.getFrameIndex(FrameIdx, MVT::i32);
1436  }
1437 
1438  unsigned ArgOffset = VA.getLocMemOffset();
1439  unsigned ArgSize = VA.getValVT().getStoreSize();
1440 
1441  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1442 
1443  // Create load nodes to retrieve arguments from the stack.
1444  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1445  SDValue ArgValue;
1446 
1447  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
1449  MVT MemVT = VA.getValVT();
1450 
1451  switch (VA.getLocInfo()) {
1452  default:
1453  break;
1454  case CCValAssign::BCvt:
1455  MemVT = VA.getLocVT();
1456  break;
1457  case CCValAssign::SExt:
1458  ExtType = ISD::SEXTLOAD;
1459  break;
1460  case CCValAssign::ZExt:
1461  ExtType = ISD::ZEXTLOAD;
1462  break;
1463  case CCValAssign::AExt:
1464  ExtType = ISD::EXTLOAD;
1465  break;
1466  }
1467 
1468  ArgValue = DAG.getExtLoad(
1469  ExtType, SL, VA.getLocVT(), Chain, FIN,
1471  MemVT);
1472  return ArgValue;
1473 }
1474 
1475 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1476  const SIMachineFunctionInfo &MFI,
1477  EVT VT,
1479  const ArgDescriptor *Reg;
1480  const TargetRegisterClass *RC;
1481 
1482  std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1483  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1484 }
1485 
1487  CallingConv::ID CallConv,
1489  BitVector &Skipped,
1490  FunctionType *FType,
1492  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1493  const ISD::InputArg *Arg = &Ins[I];
1494 
1495  assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1496  "vector type argument should have been split");
1497 
1498  // First check if it's a PS input addr.
1499  if (CallConv == CallingConv::AMDGPU_PS &&
1500  !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
1501 
1502  bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1503 
1504  // Inconveniently only the first part of the split is marked as isSplit,
1505  // so skip to the end. We only want to increment PSInputNum once for the
1506  // entire split argument.
1507  if (Arg->Flags.isSplit()) {
1508  while (!Arg->Flags.isSplitEnd()) {
1509  assert(!Arg->VT.isVector() &&
1510  "unexpected vector split in ps argument type");
1511  if (!SkipArg)
1512  Splits.push_back(*Arg);
1513  Arg = &Ins[++I];
1514  }
1515  }
1516 
1517  if (SkipArg) {
1518  // We can safely skip PS inputs.
1519  Skipped.set(Arg->getOrigArgIndex());
1520  ++PSInputNum;
1521  continue;
1522  }
1523 
1524  Info->markPSInputAllocated(PSInputNum);
1525  if (Arg->Used)
1526  Info->markPSInputEnabled(PSInputNum);
1527 
1528  ++PSInputNum;
1529  }
1530 
1531  Splits.push_back(*Arg);
1532  }
1533 }
1534 
1535 // Allocate special inputs passed in VGPRs.
1537  MachineFunction &MF,
1538  const SIRegisterInfo &TRI,
1540  if (Info.hasWorkItemIDX()) {
1541  unsigned Reg = AMDGPU::VGPR0;
1542  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1543 
1544  CCInfo.AllocateReg(Reg);
1546  }
1547 
1548  if (Info.hasWorkItemIDY()) {
1549  unsigned Reg = AMDGPU::VGPR1;
1550  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1551 
1552  CCInfo.AllocateReg(Reg);
1554  }
1555 
1556  if (Info.hasWorkItemIDZ()) {
1557  unsigned Reg = AMDGPU::VGPR2;
1558  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1559 
1560  CCInfo.AllocateReg(Reg);
1562  }
1563 }
1564 
1565 // Try to allocate a VGPR at the end of the argument list, or if no argument
1566 // VGPRs are left, allocate a stack slot.
1568  ArrayRef<MCPhysReg> ArgVGPRs
1569  = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1570  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1571  if (RegIdx == ArgVGPRs.size()) {
1572  // Spill to stack required.
1573  int64_t Offset = CCInfo.AllocateStack(4, 4);
1574 
1575  return ArgDescriptor::createStack(Offset);
1576  }
1577 
1578  unsigned Reg = ArgVGPRs[RegIdx];
1579  Reg = CCInfo.AllocateReg(Reg);
1580  assert(Reg != AMDGPU::NoRegister);
1581 
1582  MachineFunction &MF = CCInfo.getMachineFunction();
1583  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1584  return ArgDescriptor::createRegister(Reg);
1585 }
1586 
1588  const TargetRegisterClass *RC,
1589  unsigned NumArgRegs) {
1590  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1591  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1592  if (RegIdx == ArgSGPRs.size())
1593  report_fatal_error("ran out of SGPRs for arguments");
1594 
1595  unsigned Reg = ArgSGPRs[RegIdx];
1596  Reg = CCInfo.AllocateReg(Reg);
1597  assert(Reg != AMDGPU::NoRegister);
1598 
1599  MachineFunction &MF = CCInfo.getMachineFunction();
1600  MF.addLiveIn(Reg, RC);
1601  return ArgDescriptor::createRegister(Reg);
1602 }
1603 
1605  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1606 }
1607 
1609  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1610 }
1611 
1613  MachineFunction &MF,
1614  const SIRegisterInfo &TRI,
1616  if (Info.hasWorkItemIDX())
1617  Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
1618 
1619  if (Info.hasWorkItemIDY())
1620  Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
1621 
1622  if (Info.hasWorkItemIDZ())
1623  Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1624 }
1625 
1627  MachineFunction &MF,
1628  const SIRegisterInfo &TRI,
1630  auto &ArgInfo = Info.getArgInfo();
1631 
1632  // TODO: Unify handling with private memory pointers.
1633 
1634  if (Info.hasDispatchPtr())
1635  ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1636 
1637  if (Info.hasQueuePtr())
1638  ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1639 
1640  if (Info.hasKernargSegmentPtr())
1641  ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1642 
1643  if (Info.hasDispatchID())
1644  ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1645 
1646  // flat_scratch_init is not applicable for non-kernel functions.
1647 
1648  if (Info.hasWorkGroupIDX())
1649  ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1650 
1651  if (Info.hasWorkGroupIDY())
1652  ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1653 
1654  if (Info.hasWorkGroupIDZ())
1655  ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
1656 
1657  if (Info.hasImplicitArgPtr())
1658  ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
1659 }
1660 
1661 // Allocate special inputs passed in user SGPRs.
1662 static void allocateHSAUserSGPRs(CCState &CCInfo,
1663  MachineFunction &MF,
1664  const SIRegisterInfo &TRI,
1666  if (Info.hasImplicitBufferPtr()) {
1667  unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1668  MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1669  CCInfo.AllocateReg(ImplicitBufferPtrReg);
1670  }
1671 
1672  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1673  if (Info.hasPrivateSegmentBuffer()) {
1674  unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1675  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1676  CCInfo.AllocateReg(PrivateSegmentBufferReg);
1677  }
1678 
1679  if (Info.hasDispatchPtr()) {
1680  unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1681  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1682  CCInfo.AllocateReg(DispatchPtrReg);
1683  }
1684 
1685  if (Info.hasQueuePtr()) {
1686  unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1687  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1688  CCInfo.AllocateReg(QueuePtrReg);
1689  }
1690 
1691  if (Info.hasKernargSegmentPtr()) {
1692  unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1693  MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1694  CCInfo.AllocateReg(InputPtrReg);
1695  }
1696 
1697  if (Info.hasDispatchID()) {
1698  unsigned DispatchIDReg = Info.addDispatchID(TRI);
1699  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1700  CCInfo.AllocateReg(DispatchIDReg);
1701  }
1702 
1703  if (Info.hasFlatScratchInit()) {
1704  unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1705  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1706  CCInfo.AllocateReg(FlatScratchInitReg);
1707  }
1708 
1709  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1710  // these from the dispatch pointer.
1711 }
1712 
1713 // Allocate special input registers that are initialized per-wave.
1714 static void allocateSystemSGPRs(CCState &CCInfo,
1715  MachineFunction &MF,
1717  CallingConv::ID CallConv,
1718  bool IsShader) {
1719  if (Info.hasWorkGroupIDX()) {
1720  unsigned Reg = Info.addWorkGroupIDX();
1721  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1722  CCInfo.AllocateReg(Reg);
1723  }
1724 
1725  if (Info.hasWorkGroupIDY()) {
1726  unsigned Reg = Info.addWorkGroupIDY();
1727  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1728  CCInfo.AllocateReg(Reg);
1729  }
1730 
1731  if (Info.hasWorkGroupIDZ()) {
1732  unsigned Reg = Info.addWorkGroupIDZ();
1733  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1734  CCInfo.AllocateReg(Reg);
1735  }
1736 
1737  if (Info.hasWorkGroupInfo()) {
1738  unsigned Reg = Info.addWorkGroupInfo();
1739  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1740  CCInfo.AllocateReg(Reg);
1741  }
1742 
1743  if (Info.hasPrivateSegmentWaveByteOffset()) {
1744  // Scratch wave offset passed in system SGPR.
1745  unsigned PrivateSegmentWaveByteOffsetReg;
1746 
1747  if (IsShader) {
1748  PrivateSegmentWaveByteOffsetReg =
1750 
1751  // This is true if the scratch wave byte offset doesn't have a fixed
1752  // location.
1753  if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1754  PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1755  Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1756  }
1757  } else
1758  PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1759 
1760  MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1761  CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1762  }
1763 }
1764 
1766  MachineFunction &MF,
1767  const SIRegisterInfo &TRI,
1769  // Now that we've figured out where the scratch register inputs are, see if
1770  // we should reserve the arguments and use them directly.
1771  MachineFrameInfo &MFI = MF.getFrameInfo();
1772  bool HasStackObjects = MFI.hasStackObjects();
1773 
1774  // Record that we know we have non-spill stack objects so we don't need to
1775  // check all stack objects later.
1776  if (HasStackObjects)
1777  Info.setHasNonSpillStackObjects(true);
1778 
1779  // Everything live out of a block is spilled with fast regalloc, so it's
1780  // almost certain that spilling will be required.
1781  if (TM.getOptLevel() == CodeGenOpt::None)
1782  HasStackObjects = true;
1783 
1784  // For now assume stack access is needed in any callee functions, so we need
1785  // the scratch registers to pass in.
1786  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1787 
1788  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1789  if (ST.isAmdHsaOrMesa(MF.getFunction())) {
1790  if (RequiresStackAccess) {
1791  // If we have stack objects, we unquestionably need the private buffer
1792  // resource. For the Code Object V2 ABI, this will be the first 4 user
1793  // SGPR inputs. We can reserve those and use them directly.
1794 
1795  unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1797  Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1798 
1799  if (MFI.hasCalls()) {
1800  // If we have calls, we need to keep the frame register in a register
1801  // that won't be clobbered by a call, so ensure it is copied somewhere.
1802 
1803  // This is not a problem for the scratch wave offset, because the same
1804  // registers are reserved in all functions.
1805 
1806  // FIXME: Nothing is really ensuring this is a call preserved register,
1807  // it's just selected from the end so it happens to be.
1808  unsigned ReservedOffsetReg
1810  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1811  } else {
1812  unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1814  Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1815  }
1816  } else {
1817  unsigned ReservedBufferReg
1819  unsigned ReservedOffsetReg
1821 
1822  // We tentatively reserve the last registers (skipping the last two
1823  // which may contain VCC). After register allocation, we'll replace
1824  // these with the ones immediately after those which were really
1825  // allocated. In the prologue copies will be inserted from the argument
1826  // to these reserved registers.
1827  Info.setScratchRSrcReg(ReservedBufferReg);
1828  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1829  }
1830  } else {
1831  unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1832 
1833  // Without HSA, relocations are used for the scratch pointer and the
1834  // buffer resource setup is always inserted in the prologue. Scratch wave
1835  // offset is still in an input SGPR.
1836  Info.setScratchRSrcReg(ReservedBufferReg);
1837 
1838  if (HasStackObjects && !MFI.hasCalls()) {
1839  unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1841  Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1842  } else {
1843  unsigned ReservedOffsetReg
1845  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1846  }
1847  }
1848 }
1849 
1852  return !Info->isEntryFunction();
1853 }
1854 
1856 
1857 }
1858 
1860  MachineBasicBlock *Entry,
1861  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1863 
1864  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1865  if (!IStart)
1866  return;
1867 
1868  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1869  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1870  MachineBasicBlock::iterator MBBI = Entry->begin();
1871  for (const MCPhysReg *I = IStart; *I; ++I) {
1872  const TargetRegisterClass *RC = nullptr;
1873  if (AMDGPU::SReg_64RegClass.contains(*I))
1874  RC = &AMDGPU::SGPR_64RegClass;
1875  else if (AMDGPU::SReg_32RegClass.contains(*I))
1876  RC = &AMDGPU::SGPR_32RegClass;
1877  else
1878  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1879 
1880  unsigned NewVR = MRI->createVirtualRegister(RC);
1881  // Create copy from CSR to a virtual register.
1882  Entry->addLiveIn(*I);
1883  BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1884  .addReg(*I);
1885 
1886  // Insert the copy-back instructions right before the terminator.
1887  for (auto *Exit : Exits)
1888  BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1889  TII->get(TargetOpcode::COPY), *I)
1890  .addReg(NewVR);
1891  }
1892 }
1893 
1895  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1896  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1897  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1899 
1900  MachineFunction &MF = DAG.getMachineFunction();
1901  const Function &Fn = MF.getFunction();
1902  FunctionType *FType = MF.getFunction().getFunctionType();
1904 
1905  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
1906  DiagnosticInfoUnsupported NoGraphicsHSA(
1907  Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
1908  DAG.getContext()->diagnose(NoGraphicsHSA);
1909  return DAG.getEntryNode();
1910  }
1911 
1914  BitVector Skipped(Ins.size());
1915  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1916  *DAG.getContext());
1917 
1918  bool IsShader = AMDGPU::isShader(CallConv);
1919  bool IsKernel = AMDGPU::isKernel(CallConv);
1920  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
1921 
1922  if (!IsEntryFunc) {
1923  // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1924  // this when allocating argument fixed offsets.
1925  CCInfo.AllocateStack(4, 4);
1926  }
1927 
1928  if (IsShader) {
1929  processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1930 
1931  // At least one interpolation mode must be enabled or else the GPU will
1932  // hang.
1933  //
1934  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1935  // set PSInputAddr, the user wants to enable some bits after the compilation
1936  // based on run-time states. Since we can't know what the final PSInputEna
1937  // will look like, so we shouldn't do anything here and the user should take
1938  // responsibility for the correct programming.
1939  //
1940  // Otherwise, the following restrictions apply:
1941  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1942  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1943  // enabled too.
1944  if (CallConv == CallingConv::AMDGPU_PS) {
1945  if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1946  ((Info->getPSInputAddr() & 0xF) == 0 &&
1947  Info->isPSInputAllocated(11))) {
1948  CCInfo.AllocateReg(AMDGPU::VGPR0);
1949  CCInfo.AllocateReg(AMDGPU::VGPR1);
1950  Info->markPSInputAllocated(0);
1951  Info->markPSInputEnabled(0);
1952  }
1953  if (Subtarget->isAmdPalOS()) {
1954  // For isAmdPalOS, the user does not enable some bits after compilation
1955  // based on run-time states; the register values being generated here are
1956  // the final ones set in hardware. Therefore we need to apply the
1957  // workaround to PSInputAddr and PSInputEnable together. (The case where
1958  // a bit is set in PSInputAddr but not PSInputEnable is where the
1959  // frontend set up an input arg for a particular interpolation mode, but
1960  // nothing uses that input arg. Really we should have an earlier pass
1961  // that removes such an arg.)
1962  unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1963  if ((PsInputBits & 0x7F) == 0 ||
1964  ((PsInputBits & 0xF) == 0 &&
1965  (PsInputBits >> 11 & 1)))
1966  Info->markPSInputEnabled(
1968  }
1969  }
1970 
1971  assert(!Info->hasDispatchPtr() &&
1972  !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1973  !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1974  !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1975  !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1976  !Info->hasWorkItemIDZ());
1977  } else if (IsKernel) {
1978  assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
1979  } else {
1980  Splits.append(Ins.begin(), Ins.end());
1981  }
1982 
1983  if (IsEntryFunc) {
1984  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
1985  allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
1986  }
1987 
1988  if (IsKernel) {
1989  analyzeFormalArgumentsCompute(CCInfo, Ins);
1990  } else {
1991  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1992  CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1993  }
1994 
1995  SmallVector<SDValue, 16> Chains;
1996 
1997  // FIXME: This is the minimum kernel argument alignment. We should improve
1998  // this to the maximum alignment of the arguments.
1999  //
2000  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2001  // kern arg offset.
2002  const unsigned KernelArgBaseAlign = 16;
2003 
2004  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2005  const ISD::InputArg &Arg = Ins[i];
2006  if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2007  InVals.push_back(DAG.getUNDEF(Arg.VT));
2008  continue;
2009  }
2010 
2011  CCValAssign &VA = ArgLocs[ArgIdx++];
2012  MVT VT = VA.getLocVT();
2013 
2014  if (IsEntryFunc && VA.isMemLoc()) {
2015  VT = Ins[i].VT;
2016  EVT MemVT = VA.getLocVT();
2017 
2018  const uint64_t Offset = VA.getLocMemOffset();
2019  unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
2020 
2021  SDValue Arg = lowerKernargMemParameter(
2022  DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
2023  Chains.push_back(Arg.getValue(1));
2024 
2025  auto *ParamTy =
2026  dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2027  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2028  ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2029  ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2030  // On SI local pointers are just offsets into LDS, so they are always
2031  // less than 16-bits. On CI and newer they could potentially be
2032  // real pointers, so we can't guarantee their size.
2033  Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2034  DAG.getValueType(MVT::i16));
2035  }
2036 
2037  InVals.push_back(Arg);
2038  continue;
2039  } else if (!IsEntryFunc && VA.isMemLoc()) {
2040  SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2041  InVals.push_back(Val);
2042  if (!Arg.Flags.isByVal())
2043  Chains.push_back(Val.getValue(1));
2044  continue;
2045  }
2046 
2047  assert(VA.isRegLoc() && "Parameter must be in a register!");
2048 
2049  unsigned Reg = VA.getLocReg();
2050  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
2051  EVT ValVT = VA.getValVT();
2052 
2053  Reg = MF.addLiveIn(Reg, RC);
2054  SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2055 
2056  if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
2057  // The return object should be reasonably addressable.
2058 
2059  // FIXME: This helps when the return is a real sret. If it is a
2060  // automatically inserted sret (i.e. CanLowerReturn returns false), an
2061  // extra copy is inserted in SelectionDAGBuilder which obscures this.
2062  unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
2063  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2064  DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2065  }
2066 
2067  // If this is an 8 or 16-bit value, it is really passed promoted
2068  // to 32 bits. Insert an assert[sz]ext to capture this, then
2069  // truncate to the right size.
2070  switch (VA.getLocInfo()) {
2071  case CCValAssign::Full:
2072  break;
2073  case CCValAssign::BCvt:
2074  Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2075  break;
2076  case CCValAssign::SExt:
2077  Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2078  DAG.getValueType(ValVT));
2079  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2080  break;
2081  case CCValAssign::ZExt:
2082  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2083  DAG.getValueType(ValVT));
2084  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2085  break;
2086  case CCValAssign::AExt:
2087  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2088  break;
2089  default:
2090  llvm_unreachable("Unknown loc info!");
2091  }
2092 
2093  InVals.push_back(Val);
2094  }
2095 
2096  if (!IsEntryFunc) {
2097  // Special inputs come after user arguments.
2098  allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
2099  }
2100 
2101  // Start adding system SGPRs.
2102  if (IsEntryFunc) {
2103  allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
2104  } else {
2105  CCInfo.AllocateReg(Info->getScratchRSrcReg());
2106  CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
2107  CCInfo.AllocateReg(Info->getFrameOffsetReg());
2108  allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2109  }
2110 
2111  auto &ArgUsageInfo =
2113  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2114 
2115  unsigned StackArgSize = CCInfo.getNextStackOffset();
2116  Info->setBytesInStackArgArea(StackArgSize);
2117 
2118  return Chains.empty() ? Chain :
2119  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2120 }
2121 
2122 // TODO: If return values can't fit in registers, we should return as many as
2123 // possible in registers before passing on stack.
2125  CallingConv::ID CallConv,
2126  MachineFunction &MF, bool IsVarArg,
2127  const SmallVectorImpl<ISD::OutputArg> &Outs,
2128  LLVMContext &Context) const {
2129  // Replacing returns with sret/stack usage doesn't make sense for shaders.
2130  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2131  // for shaders. Vector types should be explicitly handled by CC.
2132  if (AMDGPU::isEntryFunctionCC(CallConv))
2133  return true;
2134 
2136  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2137  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2138 }
2139 
2140 SDValue
2142  bool isVarArg,
2143  const SmallVectorImpl<ISD::OutputArg> &Outs,
2144  const SmallVectorImpl<SDValue> &OutVals,
2145  const SDLoc &DL, SelectionDAG &DAG) const {
2146  MachineFunction &MF = DAG.getMachineFunction();
2148 
2149  if (AMDGPU::isKernel(CallConv)) {
2150  return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2151  OutVals, DL, DAG);
2152  }
2153 
2154  bool IsShader = AMDGPU::isShader(CallConv);
2155 
2156  Info->setIfReturnsVoid(Outs.empty());
2157  bool IsWaveEnd = Info->returnsVoid() && IsShader;
2158 
2159  // CCValAssign - represent the assignment of the return value to a location.
2162 
2163  // CCState - Info about the registers and stack slots.
2164  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2165  *DAG.getContext());
2166 
2167  // Analyze outgoing return values.
2168  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2169 
2170  SDValue Flag;
2171  SmallVector<SDValue, 48> RetOps;
2172  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2173 
2174  // Add return address for callable functions.
2175  if (!Info->isEntryFunction()) {
2177  SDValue ReturnAddrReg = CreateLiveInRegister(
2178  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2179 
2180  // FIXME: Should be able to use a vreg here, but need a way to prevent it
2181  // from being allocated to a CSR.
2182 
2183  SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2184  MVT::i64);
2185 
2186  Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
2187  Flag = Chain.getValue(1);
2188 
2189  RetOps.push_back(PhysReturnAddrReg);
2190  }
2191 
2192  // Copy the result values into the output registers.
2193  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2194  ++I, ++RealRVLocIdx) {
2195  CCValAssign &VA = RVLocs[I];
2196  assert(VA.isRegLoc() && "Can only return in registers!");
2197  // TODO: Partially return in registers if return values don't fit.
2198  SDValue Arg = OutVals[RealRVLocIdx];
2199 
2200  // Copied from other backends.
2201  switch (VA.getLocInfo()) {
2202  case CCValAssign::Full:
2203  break;
2204  case CCValAssign::BCvt:
2205  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2206  break;
2207  case CCValAssign::SExt:
2208  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2209  break;
2210  case CCValAssign::ZExt:
2211  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2212  break;
2213  case CCValAssign::AExt:
2214  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2215  break;
2216  default:
2217  llvm_unreachable("Unknown loc info!");
2218  }
2219 
2220  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2221  Flag = Chain.getValue(1);
2222  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2223  }
2224 
2225  // FIXME: Does sret work properly?
2226  if (!Info->isEntryFunction()) {
2227  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2228  const MCPhysReg *I =
2230  if (I) {
2231  for (; *I; ++I) {
2232  if (AMDGPU::SReg_64RegClass.contains(*I))
2233  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2234  else if (AMDGPU::SReg_32RegClass.contains(*I))
2235  RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2236  else
2237  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2238  }
2239  }
2240  }
2241 
2242  // Update chain and glue.
2243  RetOps[0] = Chain;
2244  if (Flag.getNode())
2245  RetOps.push_back(Flag);
2246 
2247  unsigned Opc = AMDGPUISD::ENDPGM;
2248  if (!IsWaveEnd)
2250  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2251 }
2252 
2254  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2255  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2256  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2257  SDValue ThisVal) const {
2258  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2259 
2260  // Assign locations to each value returned by this call.
2262  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2263  *DAG.getContext());
2264  CCInfo.AnalyzeCallResult(Ins, RetCC);
2265 
2266  // Copy all of the result registers out of their specified physreg.
2267  for (unsigned i = 0; i != RVLocs.size(); ++i) {
2268  CCValAssign VA = RVLocs[i];
2269  SDValue Val;
2270 
2271  if (VA.isRegLoc()) {
2272  Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2273  Chain = Val.getValue(1);
2274  InFlag = Val.getValue(2);
2275  } else if (VA.isMemLoc()) {
2276  report_fatal_error("TODO: return values in memory");
2277  } else
2278  llvm_unreachable("unknown argument location type");
2279 
2280  switch (VA.getLocInfo()) {
2281  case CCValAssign::Full:
2282  break;
2283  case CCValAssign::BCvt:
2284  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2285  break;
2286  case CCValAssign::ZExt:
2287  Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2288  DAG.getValueType(VA.getValVT()));
2289  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2290  break;
2291  case CCValAssign::SExt:
2292  Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2293  DAG.getValueType(VA.getValVT()));
2294  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2295  break;
2296  case CCValAssign::AExt:
2297  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2298  break;
2299  default:
2300  llvm_unreachable("Unknown loc info!");
2301  }
2302 
2303  InVals.push_back(Val);
2304  }
2305 
2306  return Chain;
2307 }
2308 
2309 // Add code to pass the special inputs required by the features in use,
2310 // separately from the explicit user arguments present in the IR.
2312  CallLoweringInfo &CLI,
2313  CCState &CCInfo,
2314  const SIMachineFunctionInfo &Info,
2315  SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2316  SmallVectorImpl<SDValue> &MemOpChains,
2317  SDValue Chain) const {
2318  // If we don't have a call site, this was a call inserted by
2319  // legalization. These can never use special inputs.
2320  if (!CLI.CS)
2321  return;
2322 
2323  const Function *CalleeFunc = CLI.CS.getCalledFunction();
2324  assert(CalleeFunc);
2325 
2326  SelectionDAG &DAG = CLI.DAG;
2327  const SDLoc &DL = CLI.DL;
2328 
2329  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2330 
2331  auto &ArgUsageInfo =
2333  const AMDGPUFunctionArgInfo &CalleeArgInfo
2334  = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2335 
2336  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2337 
2338  // TODO: Unify with private memory register handling. This is complicated by
2339  // the fact that at least in kernels, the input argument is not necessarily
2340  // in the same location as the input.
2353  };
2354 
2355  for (auto InputID : InputRegs) {
2356  const ArgDescriptor *OutgoingArg;
2357  const TargetRegisterClass *ArgRC;
2358 
2359  std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2360  if (!OutgoingArg)
2361  continue;
2362 
2363  const ArgDescriptor *IncomingArg;
2364  const TargetRegisterClass *IncomingArgRC;
2365  std::tie(IncomingArg, IncomingArgRC)
2366  = CallerArgInfo.getPreloadedValue(InputID);
2367  assert(IncomingArgRC == ArgRC);
2368 
2369  // All special arguments are ints for now.
2370  EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2371  SDValue InputReg;
2372 
2373  if (IncomingArg) {
2374  InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2375  } else {
2376  // The implicit arg ptr is special because it doesn't have a corresponding
2377  // input for kernels, and is computed from the kernarg segment pointer.
2378  assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2379  InputReg = getImplicitArgPtr(DAG, DL);
2380  }
2381 
2382  if (OutgoingArg->isRegister()) {
2383  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2384  } else {
2385  unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
2386  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2387  SpecialArgOffset);
2388  MemOpChains.push_back(ArgStore);
2389  }
2390  }
2391 }
2392 
2394  return CC == CallingConv::Fast;
2395 }
2396 
2397 /// Return true if we might ever do TCO for calls with this calling convention.
2399  switch (CC) {
2400  case CallingConv::C:
2401  return true;
2402  default:
2403  return canGuaranteeTCO(CC);
2404  }
2405 }
2406 
2408  SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2409  const SmallVectorImpl<ISD::OutputArg> &Outs,
2410  const SmallVectorImpl<SDValue> &OutVals,
2411  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2412  if (!mayTailCallThisCC(CalleeCC))
2413  return false;
2414 
2415  MachineFunction &MF = DAG.getMachineFunction();
2416  const Function &CallerF = MF.getFunction();
2417  CallingConv::ID CallerCC = CallerF.getCallingConv();
2419  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2420 
2421  // Kernels aren't callable, and don't have a live-in return address, so it
2422  // doesn't make sense to do a tail call with entry functions.
2423  if (!CallerPreserved)
2424  return false;
2425 
2426  bool CCMatch = CallerCC == CalleeCC;
2427 
2429  if (canGuaranteeTCO(CalleeCC) && CCMatch)
2430  return true;
2431  return false;
2432  }
2433 
2434  // TODO: Can we handle var args?
2435  if (IsVarArg)
2436  return false;
2437 
2438  for (const Argument &Arg : CallerF.args()) {
2439  if (Arg.hasByValAttr())
2440  return false;
2441  }
2442 
2443  LLVMContext &Ctx = *DAG.getContext();
2444 
2445  // Check that the call results are passed in the same way.
2446  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2447  CCAssignFnForCall(CalleeCC, IsVarArg),
2448  CCAssignFnForCall(CallerCC, IsVarArg)))
2449  return false;
2450 
2451  // The callee has to preserve all registers the caller needs to preserve.
2452  if (!CCMatch) {
2453  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2454  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2455  return false;
2456  }
2457 
2458  // Nothing more to check if the callee is taking no arguments.
2459  if (Outs.empty())
2460  return true;
2461 
2463  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2464 
2465  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2466 
2467  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2468  // If the stack arguments for this call do not fit into our own save area then
2469  // the call cannot be made tail.
2470  // TODO: Is this really necessary?
2471  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2472  return false;
2473 
2474  const MachineRegisterInfo &MRI = MF.getRegInfo();
2475  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2476 }
2477 
2479  if (!CI->isTailCall())
2480  return false;
2481 
2482  const Function *ParentFn = CI->getParent()->getParent();
2483  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2484  return false;
2485 
2486  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2487  return (Attr.getValueAsString() != "true");
2488 }
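// For illustration only (an assumption about typical IR, not code from this
// file): the check above reads the standard "disable-tail-calls" function
// attribute, which in textual IR looks like
//   define void @caller() #0 { ... }
//   attributes #0 = { "disable-tail-calls"="true" }
// With the attribute set to "true", this hook returns false and no tail call
// is emitted for call sites in that function.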
2489 
2490 // The wave scratch offset register is used as the global base pointer.
2492  SmallVectorImpl<SDValue> &InVals) const {
2493  SelectionDAG &DAG = CLI.DAG;
2494  const SDLoc &DL = CLI.DL;
2496  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2498  SDValue Chain = CLI.Chain;
2499  SDValue Callee = CLI.Callee;
2500  bool &IsTailCall = CLI.IsTailCall;
2501  CallingConv::ID CallConv = CLI.CallConv;
2502  bool IsVarArg = CLI.IsVarArg;
2503  bool IsSibCall = false;
2504  bool IsThisReturn = false;
2505  MachineFunction &MF = DAG.getMachineFunction();
2506 
2507  if (IsVarArg) {
2508  return lowerUnhandledCall(CLI, InVals,
2509  "unsupported call to variadic function ");
2510  }
2511 
2512  if (!CLI.CS.getInstruction())
2513  report_fatal_error("unsupported libcall legalization");
2514 
2515  if (!CLI.CS.getCalledFunction()) {
2516  return lowerUnhandledCall(CLI, InVals,
2517  "unsupported indirect call to function ");
2518  }
2519 
2520  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2521  return lowerUnhandledCall(CLI, InVals,
2522  "unsupported required tail call to function ");
2523  }
2524 
2526  // Note the issue is with the CC of the calling function, not of the call
2527  // itself.
2528  return lowerUnhandledCall(CLI, InVals,
2529  "unsupported call from graphics shader of function ");
2530  }
2531 
2532  // The first 4 bytes are reserved for the callee's emergency stack slot.
2533  if (IsTailCall) {
2534  IsTailCall = isEligibleForTailCallOptimization(
2535  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2536  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2537  report_fatal_error("failed to perform tail call elimination on a call "
2538  "site marked musttail");
2539  }
2540 
2541  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2542 
2543  // A sibling call is one where we're under the usual C ABI and not planning
2544  // to change that but can still do a tail call:
2545  if (!TailCallOpt && IsTailCall)
2546  IsSibCall = true;
2547 
2548  if (IsTailCall)
2549  ++NumTailCalls;
2550  }
2551 
2553 
2554  // Analyze operands of the call, assigning locations to each operand.
2556  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2557  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
2558 
2559  // The first 4 bytes are reserved for the callee's emergency stack slot.
2560  CCInfo.AllocateStack(4, 4);
2561 
2562  CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2563 
2564  // Get a count of how many bytes are to be pushed on the stack.
2565  unsigned NumBytes = CCInfo.getNextStackOffset();
2566 
2567  if (IsSibCall) {
2568  // Since we're not changing the ABI to make this a tail call, the memory
2569  // operands are already available in the caller's incoming argument space.
2570  NumBytes = 0;
2571  }
2572 
2573  // FPDiff is the byte offset of the call's argument area from the callee's.
2574  // Stores to callee stack arguments will be placed in FixedStackSlots offset
2575  // by this amount for a tail call. In a sibling call it must be 0 because the
2576  // caller will deallocate the entire stack and the callee still expects its
2577  // arguments to begin at SP+0. Completely unused for non-tail calls.
2578  int32_t FPDiff = 0;
2579  MachineFrameInfo &MFI = MF.getFrameInfo();
2581 
2582  SDValue CallerSavedFP;
2583 
2584  // Adjust the stack pointer for the new arguments...
2585  // These operations are automatically eliminated by the prolog/epilog pass
2586  if (!IsSibCall) {
2587  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
2588 
2589  unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2590 
2591  // In the HSA case, this should be an identity copy.
2592  SDValue ScratchRSrcReg
2593  = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2594  RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2595 
2596  // TODO: Don't hardcode these registers and get from the callee function.
2597  SDValue ScratchWaveOffsetReg
2598  = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2599  RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
2600 
2601  if (!Info->isEntryFunction()) {
2602  // Avoid clobbering this function's FP value. In the current convention the
2603  // callee will overwrite it, so save/restore it around the call site.
2604  CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2605  Info->getFrameOffsetReg(), MVT::i32);
2606  }
2607  }
2608 
2609  SmallVector<SDValue, 8> MemOpChains;
2610  MVT PtrVT = MVT::i32;
2611 
2612  // Walk the register/memloc assignments, inserting copies/loads.
2613  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2614  ++i, ++realArgIdx) {
2615  CCValAssign &VA = ArgLocs[i];
2616  SDValue Arg = OutVals[realArgIdx];
2617 
2618  // Promote the value if needed.
2619  switch (VA.getLocInfo()) {
2620  case CCValAssign::Full:
2621  break;
2622  case CCValAssign::BCvt:
2623  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2624  break;
2625  case CCValAssign::ZExt:
2626  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2627  break;
2628  case CCValAssign::SExt:
2629  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2630  break;
2631  case CCValAssign::AExt:
2632  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2633  break;
2634  case CCValAssign::FPExt:
2635  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2636  break;
2637  default:
2638  llvm_unreachable("Unknown loc info!");
2639  }
2640 
2641  if (VA.isRegLoc()) {
2642  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2643  } else {
2644  assert(VA.isMemLoc());
2645 
2646  SDValue DstAddr;
2647  MachinePointerInfo DstInfo;
2648 
2649  unsigned LocMemOffset = VA.getLocMemOffset();
2650  int32_t Offset = LocMemOffset;
2651 
2652  SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
2653  unsigned Align = 0;
2654 
2655  if (IsTailCall) {
2656  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2657  unsigned OpSize = Flags.isByVal() ?
2658  Flags.getByValSize() : VA.getValVT().getStoreSize();
2659 
2660  // FIXME: We can have better than the minimum byval required alignment.
2661  Align = Flags.isByVal() ? Flags.getByValAlign() :
2662  MinAlign(Subtarget->getStackAlignment(), Offset);
2663 
2664  Offset = Offset + FPDiff;
2665  int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2666 
2667  DstAddr = DAG.getFrameIndex(FI, PtrVT);
2668  DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2669 
2670  // Make sure any stack arguments overlapping with where we're storing
2671  // are loaded before this eventual operation. Otherwise they'll be
2672  // clobbered.
2673 
2674  // FIXME: Why is this really necessary? This seems to just result in a
2675  // lot of code to copy the stack and write them back to the same
2676  // locations, which are supposed to be immutable?
2677  Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2678  } else {
2679  DstAddr = PtrOff;
2680  DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
2681  Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
2682  }
2683 
2684  if (Outs[i].Flags.isByVal()) {
2685  SDValue SizeNode =
2686  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2687  SDValue Cpy = DAG.getMemcpy(
2688  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2689  /*isVol = */ false, /*AlwaysInline = */ true,
2690  /*isTailCall = */ false, DstInfo,
2693 
2694  MemOpChains.push_back(Cpy);
2695  } else {
2696  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
2697  MemOpChains.push_back(Store);
2698  }
2699  }
2700  }
2701 
2702  // Copy special input registers after user input arguments.
2703  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
2704 
2705  if (!MemOpChains.empty())
2706  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2707 
2708  // Build a sequence of copy-to-reg nodes chained together with token chain
2709  // and flag operands which copy the outgoing args into the appropriate regs.
2710  SDValue InFlag;
2711  for (auto &RegToPass : RegsToPass) {
2712  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2713  RegToPass.second, InFlag);
2714  InFlag = Chain.getValue(1);
2715  }
2716 
2717 
2718  SDValue PhysReturnAddrReg;
2719  if (IsTailCall) {
2720  // Since the return is being combined with the call, we need to pass on the
2721  // return address.
2722 
2724  SDValue ReturnAddrReg = CreateLiveInRegister(
2725  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2726 
2727  PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2728  MVT::i64);
2729  Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2730  InFlag = Chain.getValue(1);
2731  }
2732 
2733  // We don't usually want to end the call sequence here because we would tidy
2734  // the frame up *after* the call. However, in the ABI-changing tail-call case
2735  // we've carefully laid out the parameters so that when sp is reset they'll be
2736  // in the correct location.
2737  if (IsTailCall && !IsSibCall) {
2738  Chain = DAG.getCALLSEQ_END(Chain,
2739  DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2740  DAG.getTargetConstant(0, DL, MVT::i32),
2741  InFlag, DL);
2742  InFlag = Chain.getValue(1);
2743  }
2744 
2745  std::vector<SDValue> Ops;
2746  Ops.push_back(Chain);
2747  Ops.push_back(Callee);
2748  // Add a redundant copy of the callee global which will not be legalized, as
2749  // we need direct access to the callee later.
2750  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
2751  const GlobalValue *GV = GSD->getGlobal();
2752  Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
2753 
2754  if (IsTailCall) {
2755  // Each tail call may have to adjust the stack by a different amount, so
2756  // this information must travel along with the operation for eventual
2757  // consumption by emitEpilogue.
2758  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
2759 
2760  Ops.push_back(PhysReturnAddrReg);
2761  }
2762 
2763  // Add argument registers to the end of the list so that they are known live
2764  // into the call.
2765  for (auto &RegToPass : RegsToPass) {
2766  Ops.push_back(DAG.getRegister(RegToPass.first,
2767  RegToPass.second.getValueType()));
2768  }
2769 
2770  // Add a register mask operand representing the call-preserved registers.
2771 
2772  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
2773  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2774  assert(Mask && "Missing call preserved mask for calling convention");
2775  Ops.push_back(DAG.getRegisterMask(Mask));
2776 
2777  if (InFlag.getNode())
2778  Ops.push_back(InFlag);
2779 
2780  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2781 
2782  // If we're doing a tail call, use a TC_RETURN here rather than an
2783  // actual call instruction.
2784  if (IsTailCall) {
2785  MFI.setHasTailCall();
2786  return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
2787  }
2788 
2789  // Returns a chain and a flag for retval copy to use.
2790  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2791  Chain = Call.getValue(0);
2792  InFlag = Call.getValue(1);
2793 
2794  if (CallerSavedFP) {
2795  SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2796  Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2797  InFlag = Chain.getValue(1);
2798  }
2799 
2800  uint64_t CalleePopBytes = NumBytes;
2801  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
2802  DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2803  InFlag, DL);
2804  if (!Ins.empty())
2805  InFlag = Chain.getValue(1);
2806 
2807  // Handle result values, copying them out of physregs into vregs that we
2808  // return.
2809  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2810  InVals, IsThisReturn,
2811  IsThisReturn ? OutVals[0] : SDValue());
2812 }
2813 
2814 unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2815  SelectionDAG &DAG) const {
2816  unsigned Reg = StringSwitch<unsigned>(RegName)
2817  .Case("m0", AMDGPU::M0)
2818  .Case("exec", AMDGPU::EXEC)
2819  .Case("exec_lo", AMDGPU::EXEC_LO)
2820  .Case("exec_hi", AMDGPU::EXEC_HI)
2821  .Case("flat_scratch", AMDGPU::FLAT_SCR)
2822  .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2823  .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2824  .Default(AMDGPU::NoRegister);
2825 
2826  if (Reg == AMDGPU::NoRegister) {
2827  report_fatal_error(Twine("invalid register name \""
2828  + StringRef(RegName) + "\"."));
2829 
2830  }
2831 
2832  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2833  Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2834  report_fatal_error(Twine("invalid register \""
2835  + StringRef(RegName) + "\" for subtarget."));
2836  }
2837 
2838  switch (Reg) {
2839  case AMDGPU::M0:
2840  case AMDGPU::EXEC_LO:
2841  case AMDGPU::EXEC_HI:
2842  case AMDGPU::FLAT_SCR_LO:
2843  case AMDGPU::FLAT_SCR_HI:
2844  if (VT.getSizeInBits() == 32)
2845  return Reg;
2846  break;
2847  case AMDGPU::EXEC:
2848  case AMDGPU::FLAT_SCR:
2849  if (VT.getSizeInBits() == 64)
2850  return Reg;
2851  break;
2852  default:
2853  llvm_unreachable("missing register type checking");
2854  }
2855 
2856  report_fatal_error(Twine("invalid type for register \""
2857  + StringRef(RegName) + "\"."));
2858 }
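// Usage sketch (the IR below is an assumption about a typical caller, not code
// from this file): getRegisterByName services the named-register intrinsics,
// so IR such as
//   %exec = call i64 @llvm.read_register.i64(metadata !0)
//   !0 = !{!"exec"}
// resolves "exec" to AMDGPU::EXEC here, while an unknown name or a value of
// the wrong width reaches one of the report_fatal_error calls above.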
2859 
2860 // If kill is not the last instruction, split the block so kill is always a
2861 // proper terminator.
2863  MachineBasicBlock *BB) const {
2864  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2865 
2866  MachineBasicBlock::iterator SplitPoint(&MI);
2867  ++SplitPoint;
2868 
2869  if (SplitPoint == BB->end()) {
2870  // Don't bother with a new block.
2872  return BB;
2873  }
2874 
2875  MachineFunction *MF = BB->getParent();
2876  MachineBasicBlock *SplitBB
2878 
2879  MF->insert(++MachineFunction::iterator(BB), SplitBB);
2880  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2881 
2882  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
2883  BB->addSuccessor(SplitBB);
2884 
2886  return SplitBB;
2887 }
2888 
2889 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2890 // wavefront. If the value is uniform and just happens to be in a VGPR, this
2891 // will only do one iteration. In the worst case, this will loop 64 times.
2892 //
2893 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
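//
// Sketch of the loop built below (illustrative; register names are
// placeholders, shown for the M0 path with Offset == 0):
//   loop:
//     s_cur  = V_READFIRSTLANE_B32 idx
//     s_cond = V_CMP_EQ_U32 s_cur, idx
//     s_save = S_AND_SAVEEXEC_B64 s_cond  ; exec &= s_cond, old exec in s_save
//     m0     = S_MOV_B32 s_cur
//     ...callers insert the indirect access here...
//     exec   = S_XOR_B64 exec, s_save     ; clear the lanes just handled
//     S_CBRANCH_EXECNZ loop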
2895  const SIInstrInfo *TII,
2897  MachineBasicBlock &OrigBB,
2898  MachineBasicBlock &LoopBB,
2899  const DebugLoc &DL,
2900  const MachineOperand &IdxReg,
2901  unsigned InitReg,
2902  unsigned ResultReg,
2903  unsigned PhiReg,
2904  unsigned InitSaveExecReg,
2905  int Offset,
2906  bool UseGPRIdxMode,
2907  bool IsIndirectSrc) {
2908  MachineBasicBlock::iterator I = LoopBB.begin();
2909 
2910  unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2911  unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2912  unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2913  unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2914 
2915  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2916  .addReg(InitReg)
2917  .addMBB(&OrigBB)
2918  .addReg(ResultReg)
2919  .addMBB(&LoopBB);
2920 
2921  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2922  .addReg(InitSaveExecReg)
2923  .addMBB(&OrigBB)
2924  .addReg(NewExec)
2925  .addMBB(&LoopBB);
2926 
2927  // Read the next variant <- also loop target.
2928  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2929  .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2930 
2931  // Compare the value just read against the per-lane index.
2932  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2933  .addReg(CurrentIdxReg)
2934  .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
2935 
2936  // Update EXEC, save the original EXEC value to VCC.
2937  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2938  .addReg(CondReg, RegState::Kill);
2939 
2940  MRI.setSimpleHint(NewExec, CondReg);
2941 
2942  if (UseGPRIdxMode) {
2943  unsigned IdxReg;
2944  if (Offset == 0) {
2945  IdxReg = CurrentIdxReg;
2946  } else {
2947  IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2948  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2949  .addReg(CurrentIdxReg, RegState::Kill)
2950  .addImm(Offset);
2951  }
2952  unsigned IdxMode = IsIndirectSrc ?
2954  MachineInstr *SetOn =
2955  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2956  .addReg(IdxReg, RegState::Kill)
2957  .addImm(IdxMode);
2958  SetOn->getOperand(3).setIsUndef();
2959  } else {
2960  // Move the current index into M0.
2961  if (Offset == 0) {
2962  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2963  .addReg(CurrentIdxReg, RegState::Kill);
2964  } else {
2965  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2966  .addReg(CurrentIdxReg, RegState::Kill)
2967  .addImm(Offset);
2968  }
2969  }
2970 
2971  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
2972  MachineInstr *InsertPt =
2973  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
2974  .addReg(AMDGPU::EXEC)
2975  .addReg(NewExec);
2976 
2977  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2978  // s_cbranch_scc0?
2979 
2980  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2981  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2982  .addMBB(&LoopBB);
2983 
2984  return InsertPt->getIterator();
2985 }
2986 
2987 // This has slightly sub-optimal regalloc when the source vector is killed by
2988 // the read. The register allocator does not understand that the kill is
2989 // per-workitem, so the source vector is kept alive for the whole loop and we
2990 // end up not reusing a subregister from it, using one more VGPR than necessary.
2991 // The extra VGPR was avoided when this was expanded after register allocation.
2993  MachineBasicBlock &MBB,
2994  MachineInstr &MI,
2995  unsigned InitResultReg,
2996  unsigned PhiReg,
2997  int Offset,
2998  bool UseGPRIdxMode,
2999  bool IsIndirectSrc) {
3000  MachineFunction *MF = MBB.getParent();
3002  const DebugLoc &DL = MI.getDebugLoc();
3004 
3005  unsigned DstReg = MI.getOperand(0).getReg();
3006  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3007  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3008 
3009  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3010 
3011  // Save the EXEC mask
3012  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
3013  .addReg(AMDGPU::EXEC);
3014 
3015  // To insert the loop we need to split the block. Move everything after this
3016  // point to a new block, and insert a new empty block between the two.
3018  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
3019  MachineFunction::iterator MBBI(MBB);
3020  ++MBBI;
3021 
3022  MF->insert(MBBI, LoopBB);
3023  MF->insert(MBBI, RemainderBB);
3024 
3025  LoopBB->addSuccessor(LoopBB);
3026  LoopBB->addSuccessor(RemainderBB);
3027 
3028  // Move the rest of the block into a new block.
3029  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3030  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3031 
3032  MBB.addSuccessor(LoopBB);
3033 
3034  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3035 
3036  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
3037  InitResultReg, DstReg, PhiReg, TmpExec,
3038  Offset, UseGPRIdxMode, IsIndirectSrc);
3039 
3040  MachineBasicBlock::iterator First = RemainderBB->begin();
3041  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
3042  .addReg(SaveExec);
3043 
3044  return InsPt;
3045 }
3046 
3047 // Returns subreg index, offset
3048 static std::pair<unsigned, int>
3050  const TargetRegisterClass *SuperRC,
3051  unsigned VecReg,
3052  int Offset) {
3053  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
3054 
3055  // Skip out of bounds offsets, or else we would end up using an undefined
3056  // register.
3057  if (Offset >= NumElts || Offset < 0)
3058  return std::make_pair(AMDGPU::sub0, Offset);
3059 
3060  return std::make_pair(AMDGPU::sub0 + Offset, 0);
3061 }
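// Worked example (illustrative): with a 128-bit super register class, NumElts
// is 4, so a constant Offset of 2 yields (AMDGPU::sub2, 0) and the access
// becomes a plain subregister use, while an out-of-range Offset of 7 yields
// (AMDGPU::sub0, 7) and the leftover offset is later folded into M0 (or the
// GPR index) by the callers.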
3062 
3063 // Return true if the index is an SGPR and was set.
3066  MachineInstr &MI,
3067  int Offset,
3068  bool UseGPRIdxMode,
3069  bool IsIndirectSrc) {
3070  MachineBasicBlock *MBB = MI.getParent();
3071  const DebugLoc &DL = MI.getDebugLoc();
3073 
3074  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3075  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3076 
3077  assert(Idx->getReg() != AMDGPU::NoRegister);
3078 
3079  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
3080  return false;
3081 
3082  if (UseGPRIdxMode) {
3083  unsigned IdxMode = IsIndirectSrc ?
3085  if (Offset == 0) {
3086  MachineInstr *SetOn =
3087  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3088  .add(*Idx)
3089  .addImm(IdxMode);
3090 
3091  SetOn->getOperand(3).setIsUndef();
3092  } else {
3093  unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3094  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
3095  .add(*Idx)
3096  .addImm(Offset);
3097  MachineInstr *SetOn =
3098  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3099  .addReg(Tmp, RegState::Kill)
3100  .addImm(IdxMode);
3101 
3102  SetOn->getOperand(3).setIsUndef();
3103  }
3104 
3105  return true;
3106  }
3107 
3108  if (Offset == 0) {
3109  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3110  .add(*Idx);
3111  } else {
3112  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3113  .add(*Idx)
3114  .addImm(Offset);
3115  }
3116 
3117  return true;
3118 }
3119 
3120 // Control flow needs to be inserted if indexing with a VGPR.
3122  MachineBasicBlock &MBB,
3123  const GCNSubtarget &ST) {
3124  const SIInstrInfo *TII = ST.getInstrInfo();
3125  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3126  MachineFunction *MF = MBB.getParent();
3128 
3129  unsigned Dst = MI.getOperand(0).getReg();
3130  unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3131  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3132 
3133  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3134 
3135  unsigned SubReg;
3136  std::tie(SubReg, Offset)
3137  = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3138 
3139  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3140 
3141  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
3143  const DebugLoc &DL = MI.getDebugLoc();
3144 
3145  if (UseGPRIdxMode) {
3146  // TODO: Look at the uses to avoid the copy. This may require rescheduling
3147  // to avoid interfering with other uses, so probably requires a new
3148  // optimization pass.
3149  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3150  .addReg(SrcReg, RegState::Undef, SubReg)
3151  .addReg(SrcReg, RegState::Implicit)
3152  .addReg(AMDGPU::M0, RegState::Implicit);
3153  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3154  } else {
3155  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3156  .addReg(SrcReg, RegState::Undef, SubReg)
3157  .addReg(SrcReg, RegState::Implicit);
3158  }
3159 
3160  MI.eraseFromParent();
3161 
3162  return &MBB;
3163  }
3164 
3165  const DebugLoc &DL = MI.getDebugLoc();
3167 
3168  unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3169  unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3170 
3171  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3172 
3173  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
3174  Offset, UseGPRIdxMode, true);
3175  MachineBasicBlock *LoopBB = InsPt->getParent();
3176 
3177  if (UseGPRIdxMode) {
3178  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3179  .addReg(SrcReg, RegState::Undef, SubReg)
3180  .addReg(SrcReg, RegState::Implicit)
3181  .addReg(AMDGPU::M0, RegState::Implicit);
3182  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3183  } else {
3184  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3185  .addReg(SrcReg, RegState::Undef, SubReg)
3186  .addReg(SrcReg, RegState::Implicit);
3187  }
3188 
3189  MI.eraseFromParent();
3190 
3191  return LoopBB;
3192 }
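// Summary of the two paths above (a sketch, not additional lowering): when
// setM0ToIndexFromSGPR succeeds, the index is uniform and already in an SGPR,
// so a single V_MOVRELS_B32 (or a V_MOV_B32 under GPR index mode) reads the
// element directly; otherwise loadM0FromVGPR builds the waterfall loop and the
// same move is emitted inside the loop body, once per unique index value.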
3193 
3194 static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
3195  const TargetRegisterClass *VecRC) {
3196  switch (TRI.getRegSizeInBits(*VecRC)) {
3197  case 32: // 4 bytes
3198  return AMDGPU::V_MOVRELD_B32_V1;
3199  case 64: // 8 bytes
3200  return AMDGPU::V_MOVRELD_B32_V2;
3201  case 128: // 16 bytes
3202  return AMDGPU::V_MOVRELD_B32_V4;
3203  case 256: // 32 bytes
3204  return AMDGPU::V_MOVRELD_B32_V8;
3205  case 512: // 64 bytes
3206  return AMDGPU::V_MOVRELD_B32_V16;
3207  default:
3208  llvm_unreachable("unsupported size for MOVRELD pseudos");
3209  }
3210 }
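// Example (illustrative): an indirect write into a 128-bit vector register
// class, e.g. a v4f32 value, maps to AMDGPU::V_MOVRELD_B32_V4 here; the
// callers below feed it the destination, the source vector, the value to
// insert, and the subregister offset relative to sub0.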
3211 
3213  MachineBasicBlock &MBB,
3214  const GCNSubtarget &ST) {
3215  const SIInstrInfo *TII = ST.getInstrInfo();
3216  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3217  MachineFunction *MF = MBB.getParent();
3219 
3220  unsigned Dst = MI.getOperand(0).getReg();
3221  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3222  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3223  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3224  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3225  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3226 
3227  // This can be an immediate, but will be folded later.
3228  assert(Val->getReg());
3229 
3230  unsigned SubReg;
3231  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3232  SrcVec->getReg(),
3233  Offset);
3234  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3235 
3236  if (Idx->getReg() == AMDGPU::NoRegister) {
3238  const DebugLoc &DL = MI.getDebugLoc();
3239 
3240  assert(Offset == 0);
3241 
3242  BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3243  .add(*SrcVec)
3244  .add(*Val)
3245  .addImm(SubReg);
3246 
3247  MI.eraseFromParent();
3248  return &MBB;
3249  }
3250 
3251  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
3253  const DebugLoc &DL = MI.getDebugLoc();
3254 
3255  if (UseGPRIdxMode) {
3256  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3257  .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
3258  .add(*Val)
3259  .addReg(Dst, RegState::ImplicitDefine)
3260  .addReg(SrcVec->getReg(), RegState::Implicit)
3261  .addReg(AMDGPU::M0, RegState::Implicit);
3262 
3263  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3264  } else {
3265  const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3266 
3267  BuildMI(MBB, I, DL, MovRelDesc)
3268  .addReg(Dst, RegState::Define)
3269  .addReg(SrcVec->getReg())
3270  .add(*Val)
3271  .addImm(SubReg - AMDGPU::sub0);
3272  }
3273 
3274  MI.eraseFromParent();
3275  return &MBB;
3276  }
3277 
3278  if (Val->isReg())
3279  MRI.clearKillFlags(Val->getReg());
3280 
3281  const DebugLoc &DL = MI.getDebugLoc();
3282 
3283  unsigned PhiReg = MRI.createVirtualRegister(VecRC);
3284 
3285  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
3286  Offset, UseGPRIdxMode, false);
3287  MachineBasicBlock *LoopBB = InsPt->getParent();
3288 
3289  if (UseGPRIdxMode) {
3290  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3291  .addReg(PhiReg, RegState::Undef, SubReg) // vdst
3292  .add(*Val) // src0
3294  .addReg(PhiReg, RegState::Implicit)
3295  .addReg(AMDGPU::M0, RegState::Implicit);
3296  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3297  } else {
3298  const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3299 
3300  BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
3301  .addReg(Dst, RegState::Define)
3302  .addReg(PhiReg)
3303  .add(*Val)
3304  .addImm(SubReg - AMDGPU::sub0);
3305  }
3306 
3307  MI.eraseFromParent();
3308 
3309  return LoopBB;
3310 }
3311 
3313  MachineInstr &MI, MachineBasicBlock *BB) const {
3314 
3315  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3316  MachineFunction *MF = BB->getParent();
3318 
3319  if (TII->isMIMG(MI)) {
3320  if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
3321  report_fatal_error("missing mem operand from MIMG instruction");
3322  }
3323  // Add a memoperand for mimg instructions so that they aren't assumed to
3324  // be ordered memory instructions.
3325 
3326  return BB;
3327  }
3328 
3329  switch (MI.getOpcode()) {
3330  case AMDGPU::S_ADD_U64_PSEUDO:
3331  case AMDGPU::S_SUB_U64_PSEUDO: {
3333  const DebugLoc &DL = MI.getDebugLoc();
3334 
3335  MachineOperand &Dest = MI.getOperand(0);
3336  MachineOperand &Src0 = MI.getOperand(1);
3337  MachineOperand &Src1 = MI.getOperand(2);
3338 
3339  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3340  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3341 
3342  MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3343  Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3344  &AMDGPU::SReg_32_XM0RegClass);
3345  MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3346  Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3347  &AMDGPU::SReg_32_XM0RegClass);
3348 
3349  MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3350  Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3351  &AMDGPU::SReg_32_XM0RegClass);
3352  MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3353  Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3354  &AMDGPU::SReg_32_XM0RegClass);
3355 
3356  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3357 
3358  unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3359  unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3360  BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3361  .add(Src0Sub0)
3362  .add(Src1Sub0);
3363  BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3364  .add(Src0Sub1)
3365  .add(Src1Sub1);
3366  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3367  .addReg(DestSub0)
3368  .addImm(AMDGPU::sub0)
3369  .addReg(DestSub1)
3370  .addImm(AMDGPU::sub1);
3371  MI.eraseFromParent();
3372  return BB;
3373  }
3374  case AMDGPU::SI_INIT_M0: {
3375  BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
3376  TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3377  .add(MI.getOperand(0));
3378  MI.eraseFromParent();
3379  return BB;
3380  }
3381  case AMDGPU::SI_INIT_EXEC:
3382  // This should be before all vector instructions.
3383  BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3384  AMDGPU::EXEC)
3385  .addImm(MI.getOperand(0).getImm());
3386  MI.eraseFromParent();
3387  return BB;
3388 
3389  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3390  // Extract the thread count from an SGPR input and set EXEC accordingly.
3391  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3392  //
3393  // S_BFE_U32 count, input, {shift, 7}
3394  // S_BFM_B64 exec, count, 0
3395  // S_CMP_EQ_U32 count, 64
3396  // S_CMOV_B64 exec, -1
3397  MachineInstr *FirstMI = &*BB->begin();
3399  unsigned InputReg = MI.getOperand(0).getReg();
3400  unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3401  bool Found = false;
3402 
3403  // Move the COPY of the input reg to the beginning, so that we can use it.
3404  for (auto I = BB->begin(); I != &MI; I++) {
3405  if (I->getOpcode() != TargetOpcode::COPY ||
3406  I->getOperand(0).getReg() != InputReg)
3407  continue;
3408 
3409  if (I == FirstMI) {
3410  FirstMI = &*++BB->begin();
3411  } else {
3412  I->removeFromParent();
3413  BB->insert(FirstMI, &*I);
3414  }
3415  Found = true;
3416  break;
3417  }
3418  assert(Found);
3419  (void)Found;
3420 
3421  // This should be before all vector instructions.
3422  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3423  .addReg(InputReg)
3424  .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3425  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3426  AMDGPU::EXEC)
3427  .addReg(CountReg)
3428  .addImm(0);
3429  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3430  .addReg(CountReg, RegState::Kill)
3431  .addImm(64);
3432  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3433  AMDGPU::EXEC)
3434  .addImm(-1);
3435  MI.eraseFromParent();
3436  return BB;
3437  }
3438 
3439  case AMDGPU::GET_GROUPSTATICSIZE: {
3440  DebugLoc DL = MI.getDebugLoc();
3441  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
3442  .add(MI.getOperand(0))
3443  .addImm(MFI->getLDSSize());
3444  MI.eraseFromParent();
3445  return BB;
3446  }
3447  case AMDGPU::SI_INDIRECT_SRC_V1:
3448  case AMDGPU::SI_INDIRECT_SRC_V2:
3449  case AMDGPU::SI_INDIRECT_SRC_V4:
3450  case AMDGPU::SI_INDIRECT_SRC_V8:
3451  case AMDGPU::SI_INDIRECT_SRC_V16:
3452  return emitIndirectSrc(MI, *BB, *getSubtarget());
3453  case AMDGPU::SI_INDIRECT_DST_V1:
3454  case AMDGPU::SI_INDIRECT_DST_V2:
3455  case AMDGPU::SI_INDIRECT_DST_V4:
3456  case AMDGPU::SI_INDIRECT_DST_V8:
3457  case AMDGPU::SI_INDIRECT_DST_V16:
3458  return emitIndirectDst(MI, *BB, *getSubtarget());
3459  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3460  case AMDGPU::SI_KILL_I1_PSEUDO:
3461  return splitKillBlock(MI, BB);
3462  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3464 
3465  unsigned Dst = MI.getOperand(0).getReg();
3466  unsigned Src0 = MI.getOperand(1).getReg();
3467  unsigned Src1 = MI.getOperand(2).getReg();
3468  const DebugLoc &DL = MI.getDebugLoc();
3469  unsigned SrcCond = MI.getOperand(3).getReg();
3470 
3471  unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3472  unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3473  unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3474 
3475  BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3476  .addReg(SrcCond);
3477  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3478  .addImm(0)
3479  .addReg(Src0, 0, AMDGPU::sub0)
3480  .addImm(0)
3481  .addReg(Src1, 0, AMDGPU::sub0)
3482  .addReg(SrcCondCopy);
3483  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3484  .addImm(0)
3485  .addReg(Src0, 0, AMDGPU::sub1)
3486  .addImm(0)
3487  .addReg(Src1, 0, AMDGPU::sub1)
3488  .addReg(SrcCondCopy);
3489 
3490  BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3491  .addReg(DstLo)
3492  .addImm(AMDGPU::sub0)
3493  .addReg(DstHi)
3494  .addImm(AMDGPU::sub1);
3495  MI.eraseFromParent();
3496  return BB;
3497  }
3498  case AMDGPU::SI_BR_UNDEF: {
3499  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3500  const DebugLoc &DL = MI.getDebugLoc();
3501  MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3502  .add(MI.getOperand(0));
3503  Br->getOperand(1).setIsUndef(true); // read undef SCC
3504  MI.eraseFromParent();
3505  return BB;
3506  }
3507  case AMDGPU::ADJCALLSTACKUP:
3508  case AMDGPU::ADJCALLSTACKDOWN: {
3510  MachineInstrBuilder MIB(*MF, &MI);
3511 
3512  // Add an implicit use of the frame offset reg to prevent the restore copy
3513  // inserted after the call from being reordered after stack operations in
3514  // the caller's frame.
3515  MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
3516  .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3517  .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
3518  return BB;
3519  }
3520  case AMDGPU::SI_CALL_ISEL: {
3521  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3522  const DebugLoc &DL = MI.getDebugLoc();
3523 
3524  unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
3525 
3526  MachineInstrBuilder MIB;
3527  MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
3528 
3529  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3530  MIB.add(MI.getOperand(I));
3531 
3532  MIB.cloneMemRefs(MI);
3533  MI.eraseFromParent();
3534  return BB;
3535  }
3536  default:
3538  }
3539 }
3540 
3542  return isTypeLegal(VT.getScalarType());
3543 }
3544 
3546  // This currently forces unfolding various combinations of fsub into fma with
3547  // free fneg'd operands. As long as we have fast FMA (controlled by
3548  // isFMAFasterThanFMulAndFAdd), we should perform these.
3549 
3550  // Even when fma is quarter rate, as for f64 where add / sub are at best half
3551  // rate, most of these combines appear to be cycle neutral but save on
3552  // instruction count / code size.
3553  return true;
3554 }
3555 
3557  EVT VT) const {
3558  if (!VT.isVector()) {
3559  return MVT::i1;
3560  }
3561  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
3562 }
3563 
3565  // TODO: Should i16 be used always if legal? For now it would force VALU
3566  // shifts.
3567  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
3568 }
3569 
3570 // Answering this is somewhat tricky and depends on the specific device, since
3571 // different devices have different rates for fma and for f64 operations.
3572 //
3573 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3574 // regardless of which device (although the number of cycles differs between
3575 // devices), so it is always profitable for f64.
3576 //
3577 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3578 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
3579 // which we can always do even without fused FP ops since it returns the same
3580 // result as the separate operations and since it is always full
3581 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3582 // however does not support denormals, so we do report fma as faster if we have
3583 // a fast fma device and require denormals.
3584 //
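// Compact view of the result computed below (derived from the switch; this is
// a summary, not an additional rule):
//   f64                      -> true (fma and mul run at the same rate)
//   f32 with fp32 denormals  -> fast FMA or DL instructions present
//   f32 without denormals    -> fast FMA and DL instructions both present
//   f16                      -> 16-bit instructions and fp16 denormals present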
3586  VT = VT.getScalarType();
3587 
3588  switch (VT.getSimpleVT().SimpleTy) {
3589  case MVT::f32: {
3590  // This is as fast on some subtargets. However, we always have full-rate f32
3591  // mad available, which returns the same result as the separate operations and
3592  // which we should prefer over fma. Mad can't be used if we need to support
3593  // denormals, so only report fma as faster in that case.
3594  if (Subtarget->hasFP32Denormals())
3595  return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
3596 
3597  // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3598  return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
3599  }
3600  case MVT::f64:
3601  return true;
3602  case MVT::f16:
3603  return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
3604  default:
3605  break;
3606  }
3607 
3608  return false;
3609 }
3610 
3611 //===----------------------------------------------------------------------===//
3612 // Custom DAG Lowering Operations
3613 //===----------------------------------------------------------------------===//
3614 
3615 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3616 // wider vector type is legal.
3618  SelectionDAG &DAG) const {
3619  unsigned Opc = Op.getOpcode();
3620  EVT VT = Op.getValueType();
3621  assert(VT == MVT::v4f16);
3622 
3623  SDValue Lo, Hi;
3624  std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
3625 
3626  SDLoc SL(Op);
3627  SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
3628  Op->getFlags());
3629  SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
3630  Op->getFlags());
3631 
3632  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3633 }
3634 
3635 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3636 // wider vector type is legal.
3638  SelectionDAG &DAG) const {
3639  unsigned Opc = Op.getOpcode();
3640  EVT VT = Op.getValueType();
3641  assert(VT == MVT::v4i16 || VT == MVT::v4f16);
3642 
3643  SDValue Lo0, Hi0;
3644  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
3645  SDValue Lo1, Hi1;
3646  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
3647 
3648  SDLoc SL(Op);
3649 
3650  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
3651  Op->getFlags());
3652  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
3653  Op->getFlags());
3654 
3655  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3656 }
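// Illustrative example (an assumed input, not from a test): a v4f16 FADD
// arriving here is split into two v2f16 FADDs on the low and high halves, and
// the results are rejoined with CONCAT_VECTORS, so LegalizeDAG never gets a
// chance to scalarize the wide operation.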
3657 
3659  switch (Op.getOpcode()) {
3660  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
3661  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
3662  case ISD::LOAD: {
3663  SDValue Result = LowerLOAD(Op, DAG);
3664  assert((!Result.getNode() ||
3665  Result.getNode()->getNumValues() == 2) &&
3666  "Load should return a value and a chain");
3667  return Result;
3668  }
3669 
3670  case ISD::FSIN:
3671  case ISD::FCOS:
3672  return LowerTrig(Op, DAG);
3673  case ISD::SELECT: return LowerSELECT(Op, DAG);
3674  case ISD::FDIV: return LowerFDIV(Op, DAG);
3675  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
3676  case ISD::STORE: return LowerSTORE(Op, DAG);
3677  case ISD::GlobalAddress: {
3678  MachineFunction &MF = DAG.getMachineFunction();
3680  return LowerGlobalAddress(MFI, Op, DAG);
3681  }
3682  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3683  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
3684  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
3685  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
3687  return lowerINSERT_VECTOR_ELT(Op, DAG);
3689  return lowerEXTRACT_VECTOR_ELT(Op, DAG);
3690  case ISD::BUILD_VECTOR:
3691  return lowerBUILD_VECTOR(Op, DAG);
3692  case ISD::FP_ROUND:
3693  return lowerFP_ROUND(Op, DAG);
3694  case ISD::TRAP:
3695  return lowerTRAP(Op, DAG);
3696  case ISD::DEBUGTRAP:
3697  return lowerDEBUGTRAP(Op, DAG);
3698  case ISD::FABS:
3699  case ISD::FNEG:
3700  case ISD::FCANONICALIZE:
3701  return splitUnaryVectorOp(Op, DAG);
3702  case ISD::FMINNUM:
3703  case ISD::FMAXNUM:
3704  return lowerFMINNUM_FMAXNUM(Op, DAG);
3705  case ISD::SHL:
3706  case ISD::SRA:
3707  case ISD::SRL:
3708  case ISD::ADD:
3709  case ISD::SUB:
3710  case ISD::MUL:
3711  case ISD::SMIN:
3712  case ISD::SMAX:
3713  case ISD::UMIN:
3714  case ISD::UMAX:
3715  case ISD::FADD:
3716  case ISD::FMUL:
3717  case ISD::FMINNUM_IEEE:
3718  case ISD::FMAXNUM_IEEE:
3719  return splitBinaryVectorOp(Op, DAG);
3720  }
3721  return SDValue();
3722 }
3723 
3725  const SDLoc &DL,
3726  SelectionDAG &DAG, bool Unpacked) {
3727  if (!LoadVT.isVector())
3728  return Result;
3729 
3730  if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3731  // Truncate to v2i16/v4i16.
3732  EVT IntLoadVT = LoadVT.changeTypeToInteger();
3733 
3734  // Work around the legalizer not scalarizing truncate after vector op
3735  // legalization by not creating an intermediate vector trunc.
3737  DAG.ExtractVectorElements(Result, Elts);
3738  for (SDValue &Elt : Elts)
3739  Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3740 
3741  Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3742 
3743  // Bitcast to original type (v2f16/v4f16).
3744  return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3745  }
3746 
3747  // Cast back to the original packed type.
3748  return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3749 }
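// Sketch of the unpacked case above (illustrative): with unpacked D16 VMEM, a
// v4f16 load comes back as v4i32 with one half-float in the low 16 bits of
// each element; the code truncates every element to i16, rebuilds a v4i16 with
// getBuildVector, and bitcasts the result back to v4f16.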
3750 
3751 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3752  MemSDNode *M,
3753  SelectionDAG &DAG,
3754  ArrayRef<SDValue> Ops,
3755  bool IsIntrinsic) const {
3756  SDLoc DL(M);
3757 
3758  bool Unpacked = Subtarget->hasUnpackedD16VMem();
3759  EVT LoadVT = M->getValueType(0);
3760 
3761  EVT EquivLoadVT = LoadVT;
3762  if (Unpacked && LoadVT.isVector()) {
3763  EquivLoadVT = LoadVT.isVector() ?
3765  LoadVT.getVectorNumElements()) : LoadVT;
3766  }
3767 
3768  // Change from v4f16/v2f16 to EquivLoadVT.
3769  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3770 
3771  SDValue Load
3772  = DAG.getMemIntrinsicNode(
3773  IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3774  VTList, Ops, M->getMemoryVT(),
3775  M->getMemOperand());
3776  if (!Unpacked) // Just adjusted the opcode.
3777  return Load;
3778 
3779  SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
3780 
3781  return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
3782 }
3783 
3785  SDNode *N, SelectionDAG &DAG) {
3786  EVT VT = N->getValueType(0);
3787  const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
3788  int CondCode = CD->getSExtValue();
3789  if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
3790  CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
3791  return DAG.getUNDEF(VT);
3792 
3793  ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
3794 
3795 
3796  SDValue LHS = N->getOperand(1);
3797  SDValue RHS = N->getOperand(2);
3798 
3799  SDLoc DL(N);
3800 
3801  EVT CmpVT = LHS.getValueType();
3802  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
3803  unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
3805  LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
3806  RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
3807  }
3808 
3809  ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
3810 
3811  return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
3812  DAG.getCondCode(CCOpcode));
3813 }
3814 
3816  SDNode *N, SelectionDAG &DAG) {
3817  EVT VT = N->getValueType(0);
3818  const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
3819 
3820  int CondCode = CD->getSExtValue();
3821  if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
3822  CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
3823  return DAG.getUNDEF(VT);
3824  }
3825 
3826  SDValue Src0 = N->getOperand(1);
3827  SDValue Src1 = N->getOperand(2);
3828  EVT CmpVT = Src0.getValueType();
3829  SDLoc SL(N);
3830 
3831  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
3832  Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
3833  Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
3834  }
3835 
3836  FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
3837  ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
3838  return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
3839  Src1, DAG.getCondCode(CCOpcode));
3840 }
3841 
3844  SelectionDAG &DAG) const {
3845  switch (N->getOpcode()) {
3846  case ISD::INSERT_VECTOR_ELT: {
3847  if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3848  Results.push_back(Res);
3849  return;
3850  }
3851  case ISD::EXTRACT_VECTOR_ELT: {
3852  if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3853  Results.push_back(Res);
3854  return;
3855  }
3856  case ISD::INTRINSIC_WO_CHAIN: {
3857  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3858  switch (IID) {
3859  case Intrinsic::amdgcn_cvt_pkrtz: {
3860  SDValue Src0 = N->getOperand(1);
3861  SDValue Src1 = N->getOperand(2);
3862  SDLoc SL(N);
3863  SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3864  Src0, Src1);
3865  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3866  return;
3867  }
3868  case Intrinsic::amdgcn_cvt_pknorm_i16:
3869  case Intrinsic::amdgcn_cvt_pknorm_u16:
3870  case Intrinsic::amdgcn_cvt_pk_i16:
3871  case Intrinsic::amdgcn_cvt_pk_u16: {
3872  SDValue Src0 = N->getOperand(1);
3873  SDValue Src1 = N->getOperand(2);
3874  SDLoc SL(N);
3875  unsigned Opcode;
3876 
3877  if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3878  Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
3879  else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3880  Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
3881  else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3882  Opcode = AMDGPUISD::CVT_PK_I16_I32;
3883  else
3884  Opcode = AMDGPUISD::CVT_PK_U16_U32;
3885 
3886  EVT VT = N->getValueType(0);
3887  if (isTypeLegal(VT))
3888  Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
3889  else {
3890  SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3891  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3892  }
3893  return;
3894  }
3895  }
3896  break;
3897  }
3898  case ISD::INTRINSIC_W_CHAIN: {
3899  if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
3900  Results.push_back(Res);
3901  Results.push_back(Res.getValue(1));
3902  return;
3903  }
3904 
3905  break;
3906  }
3907  case ISD::SELECT: {
3908  SDLoc SL(N);
3909  EVT VT = N->getValueType(0);
3910  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3911  SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3912  SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3913 
3914  EVT SelectVT = NewVT;
3915  if (NewVT.bitsLT(MVT::i32)) {
3916  LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3917  RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3918  SelectVT = MVT::i32;
3919  }
3920 
3921  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3922  N->getOperand(0), LHS, RHS);
3923 
3924  if (NewVT != SelectVT)
3925  NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3926  Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3927  return;
3928  }
3929  case ISD::FNEG: {
3930  if (N->getValueType(0) != MVT::v2f16)
3931  break;
3932 
3933  SDLoc SL(N);
3934  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3935 
3936  SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
3937  BC,
3938  DAG.getConstant(0x80008000, SL, MVT::i32));
3939  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3940  return;
3941  }
3942  case ISD::FABS: {
3943  if (N->getValueType(0) != MVT::v2f16)
3944  break;
3945 
3946  SDLoc SL(N);
3947  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3948 
3949  SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
3950  BC,
3951  DAG.getConstant(0x7fff7fff, SL, MVT::i32));
3952  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3953  return;
3954  }
3955  default:
3956  break;
3957  }
3958 }
3959 
3960 /// Helper function for LowerBRCOND
3961 static SDNode *findUser(SDValue Value, unsigned Opcode) {
3962 
3963  SDNode *Parent = Value.getNode();
3964  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3965  I != E; ++I) {
3966 
3967  if (I.getUse().get() != Value)
3968  continue;
3969 
3970  if (I->getOpcode() == Opcode)
3971  return *I;
3972  }
3973  return nullptr;
3974 }
3975 
3976 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
3977  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
3978  switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
3979  case Intrinsic::amdgcn_if:
3980  return AMDGPUISD::IF;
3981  case Intrinsic::amdgcn_else:
3982  return AMDGPUISD::ELSE;
3983  case Intrinsic::amdgcn_loop:
3984  return AMDGPUISD::LOOP;
3985  case Intrinsic::amdgcn_end_cf:
3986  llvm_unreachable("should not occur");
3987  default:
3988  return 0;
3989  }
3990  }
3991 
3992  // break, if_break, else_break are all only used as inputs to loop, not
3993  // directly as branch conditions.
3994  return 0;
3995 }
3996 
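// The shouldEmit* helpers below choose how a global address is materialized:
// a direct fixup for constants emitted into the text section, a GOT relocation
// for globals that cannot be assumed DSO-local, and a PC-relative relocation
// otherwise.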
3997 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
3998  const Triple &TT = getTargetMachine().getTargetTriple();
3999  return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4000  GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
4001  AMDGPU::shouldEmitConstantsToTextSection(TT);
4002 }
4003 
4004 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
4005  // FIXME: Either avoid relying on address space here or change the default
4006  // address space for functions to avoid the explicit check.
4007  return (GV->getValueType()->isFunctionTy() ||
4008  GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
4009  GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4010  GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
4011  !shouldEmitFixup(GV) &&
4012  !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
4013 }
4014 
4015 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
4016  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
4017 }
4018 
4019 /// This transforms the control flow intrinsics to get the branch destination as
4020 /// the last parameter, and also switches the branch target with BR if the need arises.
4021 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
4022  SelectionDAG &DAG) const {
4023  SDLoc DL(BRCOND);
4024 
4025  SDNode *Intr = BRCOND.getOperand(1).getNode();
4026  SDValue Target = BRCOND.getOperand(2);
4027  SDNode *BR = nullptr;
4028  SDNode *SetCC = nullptr;
4029 
4030  if (Intr->getOpcode() == ISD::SETCC) {
4031  // As long as we negate the condition everything is fine
4032  SetCC = Intr;
4033  Intr = SetCC->getOperand(0).getNode();
4034 
4035  } else {
4036  // Get the target from BR if we don't negate the condition
4037  BR = findUser(BRCOND, ISD::BR);
4038  Target = BR->getOperand(1);
4039  }
4040 
4041  // FIXME: This changes the types of the intrinsics instead of introducing new
4042  // nodes with the correct types.
4043  // e.g. llvm.amdgcn.loop
4044 
4045  // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
4046  // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
4047 
4048  unsigned CFNode = isCFIntrinsic(Intr);
4049  if (CFNode == 0) {
4050  // This is a uniform branch so we don't need to legalize.
4051  return BRCOND;
4052  }
4053 
4054  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
4055  Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
4056 
4057  assert(!SetCC ||
4058  (SetCC->getConstantOperandVal(1) == 1 &&
4059  cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
4060  ISD::SETNE));
4061 
4062  // operands of the new intrinsic call
4063  SmallVector<SDValue, 8> Ops;
4064  if (HaveChain)
4065  Ops.push_back(BRCOND.getOperand(0));
4066 
4067  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
4068  Ops.push_back(Target);
4069 
4070  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
4071 
4072  // build the new intrinsic call
4073  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
4074 
4075  if (!HaveChain) {
4076  SDValue Ops[] = {
4077  SDValue(Result, 0),
4078  BRCOND.getOperand(0)
4079  };
4080 
4081  Result = DAG.getMergeValues(Ops, DL).getNode();
4082  }
4083 
4084  if (BR) {
4085  // Give the branch instruction our target
4086  SDValue Ops[] = {
4087  BR->getOperand(0),
4088  BRCOND.getOperand(2)
4089  };
4090  SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
4091  DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
4092  BR = NewBR.getNode();
4093  }
4094 
4095  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
4096 
4097  // Copy the intrinsic results to registers
4098  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
4099  SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
4100  if (!CopyToReg)
4101  continue;
4102 
4103  Chain = DAG.getCopyToReg(
4104  Chain, DL,
4105  CopyToReg->getOperand(1),
4106  SDValue(Result, i - 1),
4107  SDValue());
4108 
4109  DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
4110  }
4111 
4112  // Remove the old intrinsic from the chain
4113  DAG.ReplaceAllUsesOfValueWith(
4114  SDValue(Intr, Intr->getNumValues() - 1),
4115  Intr->getOperand(0));
4116 
4117  return Chain;
4118 }
4119 
4120 SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
4121  SDValue Op,
4122  const SDLoc &DL,
4123  EVT VT) const {
4124  return Op.getValueType().bitsLE(VT) ?
4125  DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
4126  DAG.getNode(ISD::FTRUNC, DL, VT, Op);
4127 }
4128 
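// f64 -> f16 rounding is not done directly: FP_TO_FP16 produces the half
// bit pattern in an i32, which is then truncated to i16 and bitcast to f16.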
4129 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
4130  assert(Op.getValueType() == MVT::f16 &&
4131  "Do not know how to custom lower FP_ROUND for non-f16 type");
4132 
4133  SDValue Src = Op.getOperand(0);
4134  EVT SrcVT = Src.getValueType();
4135  if (SrcVT != MVT::f64)
4136  return Op;
4137 
4138  SDLoc DL(Op);
4139 
4140  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
4141  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
4142  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
4143 }
4144 
4145 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
4146  SelectionDAG &DAG) const {
4147  EVT VT = Op.getValueType();
4148  bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
4149 
4150  // FIXME: Assert during selection that this is only selected for
4151  // ieee_mode. Currently a combine can produce the ieee version for non-ieee
4152  // mode functions, but this happens to be OK since it's only done in cases
4153  // where no sNaN is known to be present.
4154  if (IsIEEEMode)
4155  return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
4156 
4157  if (VT == MVT::v4f16)
4158  return splitBinaryVectorOp(Op, DAG);
4159  return Op;
4160 }
4161 
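// Without the HSA trap-handler ABI (or with the handler disabled) a trap is
// lowered to ENDPGM; otherwise the queue pointer is copied into SGPR0/SGPR1,
// which the trap handler reads, and an AMDGPUISD::TRAP node is emitted.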
4162 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
4163  SDLoc SL(Op);
4164  SDValue Chain = Op.getOperand(0);
4165 
4166  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4167  !Subtarget->isTrapHandlerEnabled())
4168  return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
4169 
4170  MachineFunction &MF = DAG.getMachineFunction();
4171  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4172  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4173  assert(UserSGPR != AMDGPU::NoRegister);
4174  SDValue QueuePtr = CreateLiveInRegister(
4175  DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4176  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
4177  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
4178  QueuePtr, SDValue());
4179  SDValue Ops[] = {
4180  ToReg,
4181  DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
4182  SGPR01,
4183  ToReg.getValue(1)
4184  };
4185  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4186 }
4187 
4188 SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
4189  SDLoc SL(Op);
4190  SDValue Chain = Op.getOperand(0);
4191  MachineFunction &MF = DAG.getMachineFunction();
4192 
4193  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4194  !Subtarget->isTrapHandlerEnabled()) {
4196  "debugtrap handler not supported",
4197  Op.getDebugLoc(),
4198  DS_Warning);
4199  LLVMContext &Ctx = MF.getFunction().getContext();
4200  Ctx.diagnose(NoTrap);
4201  return Chain;
4202  }
4203 
4204  SDValue Ops[] = {
4205  Chain,
4206  DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
4207  };
4208  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4209 }
4210 
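// The aperture (high 32 bits of the 64-bit flat address range that backs LDS
// or scratch) is read either from the SRC_SHARED_BASE/SRC_PRIVATE_BASE
// hardware registers via s_getreg, or, on older subtargets, loaded from the
// amd_queue_t structure reachable through the queue pointer.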
4211 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
4212  SelectionDAG &DAG) const {
4213  // FIXME: Use inline constants (src_{shared, private}_base) instead.
4214  if (Subtarget->hasApertureRegs()) {
4215  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
4216  AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
4217  AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
4218  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
4219  AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
4220  AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
4221  unsigned Encoding =
4222  AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
4223  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
4224  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
4225 
4226  SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
4227  SDValue ApertureReg = SDValue(
4228  DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
4229  SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
4230  return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
4231  }
4232 
4233  MachineFunction &MF = DAG.getMachineFunction();
4234  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4235  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4236  assert(UserSGPR != AMDGPU::NoRegister);
4237 
4238  SDValue QueuePtr = CreateLiveInRegister(
4239  DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4240 
4241  // Offset into amd_queue_t for group_segment_aperture_base_hi /
4242  // private_segment_aperture_base_hi.
4243  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
4244 
4245  SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
4246 
4247  // TODO: Use custom target PseudoSourceValue.
4248  // TODO: We should use the value from the IR intrinsic call, but it might not
4249  // be available and how do we get it?
4250  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
4251  AMDGPUAS::CONSTANT_ADDRESS));
4252 
4253  MachinePointerInfo PtrInfo(V, StructOffset);
4254  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
4255  MinAlign(64, StructOffset),
4256  MachineMemOperand::MODereferenceable |
4257  MachineMemOperand::MOInvariant);
4258 }
4259 
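// addrspacecast between flat and local/private pointers: the aperture supplies
// the high half when widening to flat, the low 32 bits are kept when
// narrowing, and the segment/flat null values are translated explicitly so
// null comparisons keep working across the cast.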
4260 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4261  SelectionDAG &DAG) const {
4262  SDLoc SL(Op);
4263  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4264 
4265  SDValue Src = ASC->getOperand(0);
4266  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4267 
4268  const AMDGPUTargetMachine &TM =
4269  static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4270 
4271  // flat -> local/private
4272  if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4273  unsigned DestAS = ASC->getDestAddressSpace();
4274 
4275  if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
4276  DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
4277  unsigned NullVal = TM.getNullPointerValue(DestAS);
4278  SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4279  SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4280  SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4281 
4282  return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4283  NonNull, Ptr, SegmentNullPtr);
4284  }
4285  }
4286 
4287  // local/private -> flat
4288  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4289  unsigned SrcAS = ASC->getSrcAddressSpace();
4290 
4291  if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
4292  SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
4293  unsigned NullVal = TM.getNullPointerValue(SrcAS);
4294  SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4295 
4296  SDValue NonNull
4297  = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4298 
4299  SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
4300  SDValue CvtPtr
4301  = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4302 
4303  return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4304  DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4305  FlatNullPtr);
4306  }
4307  }
4308 
4309  // global <-> flat are no-ops and never emitted.
4310 
4311  const MachineFunction &MF = DAG.getMachineFunction();
4312  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
4313  MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
4314  DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4315 
4316  return DAG.getUNDEF(ASC->getValueType(0));
4317 }
4318 
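// INSERT_VECTOR_ELT on sub-dword vectors (<= 64 bits): a constant index into a
// four-element 16-bit vector is handled by splitting into two v2i16 halves; a
// dynamic index is handled below without touching the stack by building a bit
// mask from the index and merging the new element with bitwise operations.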
4319 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4320  SelectionDAG &DAG) const {
4321  SDValue Vec = Op.getOperand(0);
4322  SDValue InsVal = Op.getOperand(1);
4323  SDValue Idx = Op.getOperand(2);
4324  EVT VecVT = Vec.getValueType();
4325  EVT EltVT = VecVT.getVectorElementType();
4326  unsigned VecSize = VecVT.getSizeInBits();
4327  unsigned EltSize = EltVT.getSizeInBits();
4328 
4329 
4330  assert(VecSize <= 64);
4331 
4332  unsigned NumElts = VecVT.getVectorNumElements();
4333  SDLoc SL(Op);
4334  auto KIdx = dyn_cast<ConstantSDNode>(Idx);
4335 
4336  if (NumElts == 4 && EltSize == 16 && KIdx) {
4337  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
4338 
4339  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4340  DAG.getConstant(0, SL, MVT::i32));
4341  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4342  DAG.getConstant(1, SL, MVT::i32));
4343 
4344  SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
4345  SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
4346 
4347  unsigned Idx = KIdx->getZExtValue();
4348  bool InsertLo = Idx < 2;
4349  SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
4350  InsertLo ? LoVec : HiVec,
4351  DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
4352  DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
4353 
4354  InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
4355 
4356  SDValue Concat = InsertLo ?
4357  DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
4358  DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
4359 
4360  return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
4361  }
4362 
4363  if (isa<ConstantSDNode>(Idx))
4364  return SDValue();
4365 
4366  MVT IntVT = MVT::getIntegerVT(VecSize);
4367 
4368  // Avoid stack access for dynamic indexing.
4369  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4370 
4371  // Create a congruent vector with the target value in each element so that
4372  // the required element can be masked and ORed into the target vector.
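  // For example, inserting a 16-bit element at dynamic index i into a v4i16
  // vector becomes, on the 64-bit integer view of the vector:
  //   mask   = 0xffff << (i * 16)
  //   result = (mask & splat(val)) | (~mask & vec)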
4373  SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
4374  DAG.getSplatBuildVector(VecVT, SL, InsVal));
4375 
4376  assert(isPowerOf2_32(EltSize));
4377  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4378 
4379  // Convert vector index to bit-index.
4380  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4381 
4382  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4383  SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
4384  DAG.getConstant(0xffff, SL, IntVT),
4385  ScaledIdx);
4386 
4387  SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
4388  SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
4389  DAG.getNOT(SL, BFM, IntVT), BCVec);
4390 
4391  SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
4392  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
4393 }
4394 
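// EXTRACT_VECTOR_ELT on sub-dword vectors is likewise handled without a stack
// slot: after trying the extract-element combines, the vector is viewed as a
// single integer and shifted right by the element's bit offset.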
4395 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4396  SelectionDAG &DAG) const {
4397  SDLoc SL(Op);
4398 
4399  EVT ResultVT = Op.getValueType();
4400  SDValue Vec = Op.getOperand(0);
4401  SDValue Idx = Op.getOperand(1);
4402  EVT VecVT = Vec.getValueType();
4403  unsigned VecSize = VecVT.getSizeInBits();
4404  EVT EltVT = VecVT.getVectorElementType();
4405  assert(VecSize <= 64);
4406 
4407  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4408 
4409  // Make sure we do any optimizations that will make it easier to fold
4410  // source modifiers before obscuring it with bit operations.
4411 
4412  // XXX - Why doesn't this get called when vector_shuffle is expanded?
4413  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4414  return Combined;
4415 
4416  unsigned EltSize = EltVT.getSizeInBits();
4417  assert(isPowerOf2_32(EltSize));
4418 
4419  MVT IntVT = MVT::getIntegerVT(VecSize);
4420  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4421 
4422  // Convert vector index to bit-index (* EltSize)