//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#define AMDGPU_LOG2E_F     1.44269504088896340735992468100189214f
#define AMDGPU_LN2_F       0.693147180559945309417232121458176568f
#define AMDGPU_LN10_F      2.30258509299404568401799145468436421f
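
// These constants exist so FLOG and FLOG10 can be lowered through the
// hardware log2 instruction: ln(x) = log2(x) / log2(e) and
// log10(x) = log2(x) * (ln(2) / ln(10)); see the FLOG/FLOG10 cases in
// LowerOperation below. Illustrative check (not part of the original file):
// log2f(1000.0f) * (AMDGPU_LN2_F / AMDGPU_LN10_F) evaluates to ~3.0f.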
19 
20 #include "AMDGPUISelLowering.h"
21 #include "AMDGPU.h"
22 #include "AMDGPUCallLowering.h"
23 #include "AMDGPUFrameLowering.h"
24 #include "AMDGPUIntrinsicInfo.h"
25 #include "AMDGPURegisterInfo.h"
26 #include "AMDGPUSubtarget.h"
27 #include "AMDGPUTargetMachine.h"
29 #include "SIInstrInfo.h"
30 #include "SIMachineFunctionInfo.h"
36 #include "llvm/IR/DataLayout.h"
37 #include "llvm/IR/DiagnosticInfo.h"
38 #include "llvm/Support/KnownBits.h"
39 using namespace llvm;
40 
static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                            CCValAssign::LocInfo LocInfo,
                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
  MachineFunction &MF = State.getMachineFunction();
  AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
                                         ArgFlags.getOrigAlign());
  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
  return true;
}

static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
                           CCValAssign::LocInfo LocInfo,
                           ISD::ArgFlagsTy ArgFlags, CCState &State,
                           const TargetRegisterClass *RC,
                           unsigned NumRegs) {
  ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
  unsigned RegResult = State.AllocateReg(RegList);
  if (RegResult == AMDGPU::NoRegister)
    return false;

  State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
  return true;
}

static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
                              CCValAssign::LocInfo LocInfo,
                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
  switch (LocVT.SimpleTy) {
  case MVT::i64:
  case MVT::f64:
  case MVT::v2i32:
  case MVT::v2f32: {
    // Up to SGPR0-SGPR39
    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                          &AMDGPU::SGPR_64RegClass, 20);
  }
  default:
    return false;
  }
}

// Allocate up to VGPR31.
//
// TODO: Since there are no VGPR alignment requirements, would it be better to
// split into individual scalar registers?
static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
                              CCValAssign::LocInfo LocInfo,
                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
  switch (LocVT.SimpleTy) {
  case MVT::i64:
  case MVT::f64:
  case MVT::v2i32:
  case MVT::v2f32: {
    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                          &AMDGPU::VReg_64RegClass, 31);
  }
  case MVT::v4i32:
  case MVT::v4f32:
  case MVT::v2i64:
  case MVT::v2f64: {
    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                          &AMDGPU::VReg_128RegClass, 29);
  }
  case MVT::v8i32:
  case MVT::v8f32: {
    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                          &AMDGPU::VReg_256RegClass, 25);
  }
  case MVT::v16i32:
  case MVT::v16f32: {
    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                          &AMDGPU::VReg_512RegClass, 17);
  }
  default:
    return false;
  }
}

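// The tuple counts passed to allocateCCRegs above all follow from the 32
// allocatable VGPRs (illustrative arithmetic, not from the original file): a
// tuple spanning N 32-bit registers can start at v0 .. v(32 - N), giving
// 32 - N + 1 candidates. E.g. 128-bit tuples have N = 4, so 32 - 4 + 1 == 29
// ranges, v[0:3] through v[28:31].
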
#include "AMDGPUGenCallingConv.inc"

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

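// Illustrative instances of the mapping above: an f16 store is 16 bits, so it
// maps to i16; a v2f32 store is 64 bits, so it maps to v2i32 (64 / 32 == 2
// lanes of i32).
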
unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  KnownBits Known;
  EVT VT = Op.getValueType();
  DAG.computeKnownBits(Op, Known);

  return VT.getSizeInBits() - Known.countMinLeadingZeros();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
}

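// A minimal standalone sketch of the same computation on a host integer
// (illustrative only, not part of the original file; assumes two's
// complement):
//
//   unsigned numBitsSignedExample(int32_t X) {
//     uint32_t U = uint32_t(X < 0 ? ~X : X);
//     unsigned SignBits = 1;                    // the sign bit itself
//     for (int Bit = 30; Bit >= 0 && !(U >> Bit); --Bit)
//       ++SignBits;                             // leading sign-bit copies
//     return 32 - SignBits;                     // e.g. X = -128 gives 7
//   }
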
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
  }

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);

    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);

    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
  }

  for (MVT VT : MVT::integer_vector_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
  }

  // This is totally unsupported, just custom lower to produce an error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.

  setOperationAction(ISD::FLOG, MVT::f32, Custom);
  setOperationAction(ISD::FLOG10, MVT::f32, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::FLOG, MVT::f16, Custom);
    setOperationAction(ISD::FLOG10, MVT::f16, Custom);
  }

  setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
  setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Custom);
  setOperationAction(ISD::FREM, MVT::f64, Custom);

  // v_mad_f32 does not support denormals according to some sources.
  if (!Subtarget->hasFP32Denormals())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  }

  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
  setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction(ISD::SDIVREM, VT, Custom);
    setOperationAction(ISD::UDIVREM, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction(ISD::MUL, MVT::i64, Expand);

  if (Subtarget->hasFFBH())
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);

  setOperationAction(ISD::CTTZ, MVT::i64, Custom);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
  setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them
  // for now.
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);

  static const MVT::SimpleValueType VectorIntTypes[] = {
    MVT::v2i32, MVT::v4i32
  };

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
    MVT::v2f32, MVT::v4f32
  };

  for (MVT VT : FloatVectorTypes) {
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  // There are no libcalls of any kind.
  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
    setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For now,
  // we don't have a way of knowing during instruction selection if a condition
  // will be uniform and we always use vector compares. Assume we are using
  // vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  // SI at least has hardware support for floating point exceptions, but no way
  // of using or handling them is implemented. They are also optional in OpenCL
  // (Section 7.3)
  setHasFloatingPointExceptions(Subtarget->hasFPExceptions());

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy  = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset  = 0xffffffff;
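  // Note (illustrative): 0xffffffff is effectively "no limit". Since every
  // libcall name was cleared above, SelectionDAG has no library fallback, so
  // these store-count thresholds must never trigger.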

}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOp(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
    return true;
  default:
    return false;
  }
}
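
// Worked instance (illustrative, not from the original file): because VALU
// source modifiers can negate an operand for free, (fneg (fmul a, b)) can be
// emitted as a single multiply with one operand negated, which is why FMUL
// appears in the list above.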

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return N->getNumOperands() > 2 || VT == MVT::f64;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::SELECT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case AMDGPUISD::INTERP_P1:
  case AMDGPUISD::INTERP_P2:
  case AMDGPUISD::DIV_SCALE:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in those cases. If a use would
  // instead have to grow into a VOP3 encoding just for the modifier, there is
  // a code size increase. Try to avoid increasing code size unless we know it
  // will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load, this is always better.
  if (NewSize == 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
                                                   EVT CastTy) const {
  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  return (LScalarSize < CastScalarSize) ||
         (CastScalarSize >= 32);
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
// FIXME: These should really have the size as a parameter.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
  return true;
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && VT == MVT::f16);
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && VT == MVT::f16) ||
         (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  return isZExtFree(Val.getValueType(), VT2);
}

// v_mad_mix* support a conversion from f16 to f32.
//
// There is only one special case where this would still be OK to use when
// denormals are enabled, but we don't currently handle it.
bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
                                           EVT DestVT, EVT SrcVT) const {
  return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() &&
         DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
         SrcVT.getScalarType() == MVT::f16;
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return CC_AMDGPU_Kernel;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_ES:
    return CC_AMDGPU;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return CC_AMDGPU_Kernel;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_ES:
    return RetCC_SI_Shader;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments()

/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
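// A concrete trace of the v8i8 case just described (illustrative): NumRegs is
// 8 and In.ArgVT.getVectorNumElements() == NumRegs, so the vector-split branch
// below picks MemVT = In.ArgVT.getScalarType() == i8, and allocateKernArg is
// then called with ValVT = In.VT and LocVT = i8.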
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    const ISD::InputArg &In = Ins[i];
    EVT MemVT;

    unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);

    if (!Subtarget->isAmdHsaOS() &&
        (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
      // The ABI says the caller will extend these values to 32-bits.
      MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
    } else if (NumRegs == 1) {
      // This argument is not split, so the IR type is the memory type.
      assert(!In.Flags.isSplit());
      if (In.ArgVT.isExtended()) {
        // We have an extended type, like i24, so we should just use the
        // register type.
        MemVT = In.VT;
      } else {
        MemVT = In.ArgVT;
      }
    } else if (In.ArgVT.isVector() && In.VT.isVector() &&
               In.ArgVT.getScalarType() == In.VT.getScalarType()) {
      // We have a vector value which has been split into a vector with
      // the same scalar type, but fewer elements. This should handle
      // all the floating-point vector types.
      MemVT = In.VT;
    } else if (In.ArgVT.isVector() &&
               In.ArgVT.getVectorNumElements() == NumRegs) {
      // This arg has been split so that each element is stored in a separate
      // register.
      MemVT = In.ArgVT.getScalarType();
    } else if (In.ArgVT.isExtended()) {
      // We have an extended type, like i65.
      MemVT = In.VT;
    } else {
      unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
      assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
      if (In.VT.isInteger()) {
        MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
      } else if (In.VT.isVector()) {
        unsigned NumElements = In.VT.getVectorNumElements();
        assert(MemoryBits % NumElements == 0);
        // This vector type has been split into another vector type with
        // a different elements size.
        EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                         MemoryBits / NumElements);
        MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
      } else {
        llvm_unreachable("cannot deduce memory type.");
      }
    }

    // Convert one element vectors to scalar.
    if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
      MemVT = MemVT.getScalarType();

    if (MemVT.isExtended()) {
      // This should really only happen if we have vec3 arguments
      assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
      MemVT = MemVT.getPow2VectorType(State.getContext());
    }

    assert(MemVT.isSimple());
    allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
                    State);
  }
}

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  //       "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument load that overlaps the
  // clobbered frame index.
  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
                            UE = DAG.getEntryNode().getNode()->use_end();
       U != UE; ++U) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG:
    return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F);
  case ISD::FLOG10:
    return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  default:
    return;
  }
}

static bool hasDefinedInitializer(const GlobalValue *GV) {
  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer())
    return false;

  return !isa<UndefValue>(GVar->getInitializer());
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {
  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    if (!hasDefinedInitializer(GV)) {
      unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
      return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
    }
  }

  const Function &Fn = DAG.getMachineFunction().getFunction();
  DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(BadInit);
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;

  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  EVT VT = Op.getValueType();
  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}

/// \brief Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  return std::make_pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
}

SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorLoad(Load, DAG);

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
  std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);

  unsigned Size = LoMemVT.getStoreSize();
  unsigned BaseAlign = Load->getAlignment();
  unsigned HiAlign = MinAlign(BaseAlign, Size);

  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  SDValue Ops[] = {
    DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
    DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                LoLoad.getValue(1), HiLoad.getValue(1))
  };

  return DAG.getMergeValues(Ops, SL);
}
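
// Illustrative instance (not from the original file): a 16-byte-aligned v8i32
// load splits into two v4i32 halves. Size == 16, so HiPtr = BasePtr + 16, the
// high half's pointer info is offset by 16 bytes, and its alignment is
// MinAlign(16, 16) == 16. Both halves' chains are merged through the
// TokenFactor above.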

SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(Store, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
  std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);

  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());

  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
  unsigned BaseAlign = Store->getAlignment();
  unsigned Size = LoMemVT.getStoreSize();
  unsigned HiAlign = MinAlign(BaseAlign, Size);

  SDValue LoStore =
      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
                        Store->getMemOperand()->getFlags());
  SDValue HiStore =
      DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}

// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit signed integer.
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                            bool Sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
  if (LHSSignBits < 9)
    return SDValue();

  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
  if (RHSSignBits < 9)
    return SDValue();

  unsigned BitSize = VT.getSizeInBits();
  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  if (Sign)
    ++DivBits;
  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  SDValue jq = DAG.getConstant(1, DL, IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
                     DAG.getConstant(BitSize - 2, DL, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib = (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);

  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);

  // float fr = mad(fqneg, fb, fa);
  unsigned OpCode = Subtarget->hasFP32Denormals() ?
                    (unsigned)AMDGPUISD::FMAD_FTZ :
                    (unsigned)ISD::FMAD;
  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);

  // Rem needs compensation, it's easier to recompute it
  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);

  // Truncate to number of bits this divide really is.
  if (Sign) {
    SDValue InRegSize
      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
  } else {
    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
  }

  return DAG.getMergeValues({ Div, Rem }, DL);
}
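
// A scalar sketch of the sequence emitted above for the unsigned case
// (illustrative only, not part of the original file; assumes both operands
// fit in 24 bits so a float holds them exactly, and uses an exact 1/fb in
// place of the hardware RCP approximation that the fr >= fb correction
// compensates for; the final bit-masking step is omitted):
//
//   uint32_t udiv24(uint32_t num, uint32_t den, uint32_t &rem) {
//     float fa = (float)num, fb = (float)den;
//     float fq = truncf(fa * (1.0f / fb));     // RCP, FMUL, FTRUNC
//     float fr = fabsf(-fq * fb + fa);         // remainder estimate (FMAD)
//     uint32_t iq = (uint32_t)fq;
//     uint32_t div = iq + (fr >= fabsf(fb) ? 1 : 0); // fix off-by-one quotient
//     rem = num - div * den;                   // recompute the remainder
//     return div;
//   }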

void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
                                          SelectionDAG &DAG,
                                          SmallVectorImpl<SDValue> &Results) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");

  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

  SDValue One = DAG.getConstant(1, DL, HalfVT);
  SDValue Zero = DAG.getConstant(0, DL, HalfVT);

  // HiLo split
  SDValue LHS = Op.getOperand(0);
  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);

  SDValue RHS = Op.getOperand(1);
  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);

  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {

    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                              LHS_Lo, RHS_Lo);

    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});

    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
    return;
  }

  if (isTypeLegal(MVT::i64)) {
    // Compute denominator reciprocal.
    unsigned FMAD = Subtarget->hasFP32Denormals() ?
                    (unsigned)AMDGPUISD::FMAD_FTZ :
                    (unsigned)ISD::FMAD;

    SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
    SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
    SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
      DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
      Cvt_Lo);
    SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
    SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
      DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
    SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
      DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
    SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
    SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
      DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
      Mul1);
    SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
    SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
    SDValue Rcp64 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));

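    // What the magic constants above encode (illustrative): 0x4f800000 is
    // 2^32 and 0x2f800000 is 2^-32 as f32, while 0x5f7ffffc is just below
    // 2^64 and 0xcf800000 is -2^32. Mad1 reassembles RHS as a float, RCP
    // gives ~1/RHS, and the FMUL/FTRUNC/FMAD steps split the scaled
    // reciprocal ~2^64/RHS into the halves Rcp_Hi:Rcp_Lo of the 64-bit
    // fixed-point estimate Rcp64 that the Newton-style steps below refine.
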
    SDValue Zero64 = DAG.getConstant(0, DL, VT);
    SDValue One64 = DAG.getConstant(1, DL, VT);
    SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
    SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);

    SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
    SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
    SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
    SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
                                    Zero);
    SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
                                    One);

    SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
                                  Mulhi1_Lo, Zero1);
    SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
                                  Mulhi1_Hi, Add1_Lo.getValue(1));
    SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
    SDValue Add1 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));

    SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
    SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
    SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
                                    Zero);
    SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
                                    One);

    SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
                                  Mulhi2_Lo, Zero1);
    SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
                                   Mulhi2_Hi, Add1_Lo.getValue(1));
    SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
                                  Zero, Add2_Lo.getValue(1));
    SDValue Add2 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
    SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);

    SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);

    SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
    SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
    SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
                                  Mul3_Lo, Zero1);
    SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
                                  Mul3_Hi, Sub1_Lo.getValue(1));
    SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
    SDValue Sub1 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));

    SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
    SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);

    // TODO: Here and below portions of the code can be enclosed into if/endif.
    // Currently control flow is unconditional and we have 4 selects after
    // potential endif to substitute PHIs.

    // if C3 != 0 ...
    SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
                                  RHS_Hi, Sub1_Lo.getValue(1));
    SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
                                  Zero, Sub2_Lo.getValue(1));
    SDValue Sub2 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));

    SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);

    SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);

    // if (C6 != 0)
    SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);

    SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
                                  RHS_Hi, Sub2_Lo.getValue(1));
    SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
                                  Zero, Sub3_Lo.getValue(1));
    SDValue Sub3 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));

    // endif C6
    // endif C3

    SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
    SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);

    SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
    SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);

    Results.push_back(Div);
    Results.push_back(Rem);

    return;
  }

  // r600 expansion.
  // Get Speculative values
  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);

  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
  SDValue DIV_Lo = Zero;

  const unsigned halfBitWidth = HalfVT.getSizeInBits();

  for (unsigned i = 0; i < halfBitWidth; ++i) {
    const unsigned bitPos = halfBitWidth - i - 1;
    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
    // Get value of high bit
    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);

    // Shift
    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
    // Add LHS high bit
    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);

    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);

    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

    // Update REM
    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
  }
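
  // The loop above is plain restoring long division over the low 32 bits,
  // one quotient bit per iteration. A scalar sketch (illustrative only):
  //
  //   for (int i = 31; i >= 0; --i) {
  //     rem = (rem << 1) | ((lhs_lo >> i) & 1);
  //     if (rem >= rhs) { rem -= rhs; div_lo |= 1u << i; }
  //   }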

  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
  Results.push_back(DIV);
  Results.push_back(REM);
}

SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::i64) {
    SmallVector<SDValue, 2> Results;
    LowerUDIVREM64(Op, DAG, Results);
    return DAG.getMergeValues(Results, DL);
  }

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, false))
      return Res;
  }

  SDValue Num = Op.getOperand(0);
  SDValue Den = Op.getOperand(1);

  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);

  // RCP_LO = mul(RCP, Den)
  SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);

  // RCP_HI = mulhu(RCP, Den)
  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                                   RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
                                       NEG_RCP_LO, RCP_LO,
                                       ISD::SETEQ);
  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);

  // RCP_S_E = RCP - E
  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
                                 RCP_A_E, RCP_S_E,
                                 ISD::SETEQ);
  // Quotient = mulhu(Tmp0, Num)
  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);

  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
                                             DAG.getConstant(-1, DL, VT),
                                             DAG.getConstant(0, DL, VT),
                                             ISD::SETUGE);
  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
                                              Num_S_Remainder,
                                              DAG.getConstant(-1, DL, VT),
                                              DAG.getConstant(0, DL, VT),
                                              ISD::SETUGE);
  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
                             Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
                                       DAG.getConstant(1, DL, VT));

  // Quotient_S_One = Quotient - 1
  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
                                       DAG.getConstant(1, DL, VT));

  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
                                Quotient, Quotient_A_One, ISD::SETEQ);

  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
                        Quotient_S_One, Div, ISD::SETEQ);

  // Calculate Rem result:

  // Remainder_S_Den = Remainder - Den
  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);

  // Remainder_A_Den = Remainder + Den
  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);

  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
                                Remainder, Remainder_S_Den, ISD::SETEQ);

  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
                        Remainder_A_Den, Rem, ISD::SETEQ);
  SDValue Ops[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Ops, DL);
}

SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue NegOne = DAG.getConstant(-1, DL, VT);

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, true))
      return Res;
  }

  if (VT == MVT::i64 &&
      DAG.ComputeNumSignBits(LHS) > 32 &&
      DAG.ComputeNumSignBits(RHS) > 32) {
    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

    // HiLo split
    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
    SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                                 LHS_Lo, RHS_Lo);
    SDValue Res[2] = {
      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
    };
    return DAG.getMergeValues(Res, DL);
  }

  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
  SDValue RSign = LHSign; // Remainder sign is the same as LHS

  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);

  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);

  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
  SDValue Rem = Div.getValue(1);

  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);

  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);

  SDValue Res[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Res, DL);
}
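
// Worked instance of the sign handling above (illustrative): the selects
// produce an all-ones mask for negative inputs, (x + mask) ^ mask computes
// |x|, and (q ^ dsign) - dsign negates the unsigned result exactly when the
// operand signs differ. For LHS = -7, RHS = 2: |LHS| = 7, UDIVREM gives
// q = 3, r = 1; DSign = -1 yields Div = -3 and RSign = -1 yields Rem = -1,
// matching C semantics for -7 / 2 and -7 % 2.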

// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  // TODO: Should this propagate fast-math-flags?

  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
  SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);

  return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
}
1924 
1926  SDLoc SL(Op);
1927  SDValue Src = Op.getOperand(0);
1928 
1929  // result = trunc(src)
1930  // if (src > 0.0 && src != result)
1931  // result += 1.0
1932 
1933  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1934 
1935  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
1936  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
1937 
1938  EVT SetCCVT =
1939  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1940 
1941  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
1942  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1943  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1944 
1945  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
1946  // TODO: Should this propagate fast-math-flags?
1947  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1948 }
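// Editorial sketch (not part of this file): the same ceil-from-trunc control
// flow as a scalar function; refCeil is a hypothetical reference helper.
#include <cmath>

static double refCeil(double Src) {
  double Result = std::trunc(Src);  // result = trunc(src)
  if (Src > 0.0 && Src != Result)   // positive with a fractional part left
    Result += 1.0;                  // result += 1.0
  return Result;
}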
1949 
1950 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
1951  SelectionDAG &DAG) {
1952  const unsigned FractBits = 52;
1953  const unsigned ExpBits = 11;
1954 
1955  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
1956  Hi,
1957  DAG.getConstant(FractBits - 32, SL, MVT::i32),
1958  DAG.getConstant(ExpBits, SL, MVT::i32));
1959  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
1960  DAG.getConstant(1023, SL, MVT::i32));
1961 
1962  return Exp;
1963 }
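// Editorial sketch (not part of this file): the BFE above reads the 11
// exponent bits starting at bit 20 of the high word (bit 52 of the f64) and
// removes the IEEE-754 bias; refExtractF64Exponent is a hypothetical helper.
#include <cstdint>

static int32_t refExtractF64Exponent(uint32_t Hi) {
  uint32_t ExpPart = (Hi >> 20) & 0x7ff; // BFE_U32(Hi, FractBits - 32, ExpBits)
  return (int32_t)ExpPart - 1023;        // subtract the exponent bias
}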
1964 
1965 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
1966  SDLoc SL(Op);
1967  SDValue Src = Op.getOperand(0);
1968 
1969  assert(Op.getValueType() == MVT::f64);
1970 
1971  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1972  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1973 
1974  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1975 
1976  // Extract the upper half, since this is where we will find the sign and
1977  // exponent.
1978  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
1979 
1980  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1981 
1982  const unsigned FractBits = 52;
1983 
1984  // Extract the sign bit.
1985  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
1986  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
1987 
1988  // Extend back to 64-bits.
1989  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
1990  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
1991 
1992  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
1993  const SDValue FractMask
1994  = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
1995 
1996  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
1997  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
1998  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
1999 
2000  EVT SetCCVT =
2001  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2002 
2003  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2004 
2005  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2006  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2007 
2008  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2009  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2010 
2011  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2012 }
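// Editorial sketch (not part of this file): scalar emulation of the
// mask-based trunc above, assuming IEEE-754 doubles and two's-complement
// integers; refTruncF64 is a hypothetical reference helper.
#include <cstdint>
#include <cstring>

static double refTruncF64(double Src) {
  uint64_t Bits;
  std::memcpy(&Bits, &Src, sizeof(Bits));
  int Exp = (int)((Bits >> 52) & 0x7ff) - 1023;       // unbiased exponent
  uint64_t Out;
  if (Exp < 0)
    Out = Bits & (UINT64_C(1) << 63);                 // |x| < 1 -> +/-0.0
  else if (Exp > 51)
    Out = Bits;                                       // already integral (or inf/nan)
  else
    Out = Bits & ~(((UINT64_C(1) << 52) - 1) >> Exp); // clear fraction bits
  double Res;
  std::memcpy(&Res, &Out, sizeof(Res));
  return Res;
}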
2013 
2014 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2015  SDLoc SL(Op);
2016  SDValue Src = Op.getOperand(0);
2017 
2018  assert(Op.getValueType() == MVT::f64);
2019 
2020  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2021  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2022  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2023 
2024  // TODO: Should this propagate fast-math-flags?
2025 
2026  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2027  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2028 
2029  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2030 
2031  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2032  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2033 
2034  EVT SetCCVT =
2035  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2036  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2037 
2038  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2039 }
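// Editorial sketch (not part of this file): the classic 2^52 add/sub trick in
// scalar form, with C1 and C2 from the lowering above as decimal constants.
// It relies on round-to-nearest-even and must not be compiled with
// reassociating fast-math; refRint is a hypothetical reference helper.
#include <cmath>

static double refRint(double Src) {
  const double C1 = 4503599627370496.0;   // 0x1.0p+52
  double CS = std::copysign(C1, Src);
  double Rounded = (Src + CS) - CS;       // add/sub forces RNE rounding
  // Magnitudes above 0x1.fffffffffffffp+51 are already integral.
  return std::fabs(Src) > 4503599627370495.5 ? Src : Rounded;
}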
2040 
2041 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2042  // FNEARBYINT and FRINT are the same, except in their handling of FP
2043  // exceptions. Those aren't really meaningful for us, and OpenCL only has
2044  // rint, so just treat them as equivalent.
2045  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2046 }
2047 
2048 // XXX - May require not supporting f32 denormals?
2049 
2050 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2051 // compare and vselect end up producing worse code than scalarizing the whole
2052 // operation.
2053 SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
2054  SDLoc SL(Op);
2055  SDValue X = Op.getOperand(0);
2056  EVT VT = Op.getValueType();
2057 
2058  SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2059 
2060  // TODO: Should this propagate fast-math-flags?
2061 
2062  SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2063 
2064  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2065 
2066  const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2067  const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2068  const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2069 
2070  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2071 
2072  EVT SetCCVT =
2073  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2074 
2075  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2076 
2077  SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2078 
2079  return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2080 }
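// Editorial sketch (not part of this file): the compare/select sequence above
// implements round-half-away-from-zero built from trunc; refRound is a
// hypothetical reference helper.
#include <cmath>

static float refRound(float X) {
  float T = std::trunc(X);
  float Sel = std::fabs(X - T) >= 0.5f ? std::copysign(1.0f, X) : 0.0f;
  return T + Sel;
}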
2081 
2082 SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
2083  SDLoc SL(Op);
2084  SDValue X = Op.getOperand(0);
2085 
2086  SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
2087 
2088  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2089  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2090  const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
2091  const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
2092  EVT SetCCVT =
2093  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2094 
2095  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
2096 
2097  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
2098 
2099  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2100 
2101  const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
2102  MVT::i64);
2103 
2104  SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
2105  SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
2106  DAG.getConstant(INT64_C(0x0008000000000000), SL,
2107  MVT::i64),
2108  Exp);
2109 
2110  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
2111  SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
2112  DAG.getConstant(0, SL, MVT::i64), Tmp0,
2113  ISD::SETNE);
2114 
2115  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
2116  D, DAG.getConstant(0, SL, MVT::i64));
2117  SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
2118 
2119  K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
2120  K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
2121 
2122  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2123  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2124  SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
2125 
2126  SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
2127  ExpEqNegOne,
2128  DAG.getConstantFP(1.0, SL, MVT::f64),
2129  DAG.getConstantFP(0.0, SL, MVT::f64));
2130 
2131  SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
2132 
2133  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
2134  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
2135 
2136  return K;
2137 }
2138 
2139 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2140  EVT VT = Op.getValueType();
2141 
2142  if (VT == MVT::f32 || VT == MVT::f16)
2143  return LowerFROUND32_16(Op, DAG);
2144 
2145  if (VT == MVT::f64)
2146  return LowerFROUND64(Op, DAG);
2147 
2148  llvm_unreachable("unhandled type");
2149 }
2150 
2151 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2152  SDLoc SL(Op);
2153  SDValue Src = Op.getOperand(0);
2154 
2155  // result = trunc(src);
2156  // if (src < 0.0 && src != result)
2157  // result += -1.0.
2158 
2159  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2160 
2161  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2162  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2163 
2164  EVT SetCCVT =
2165  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2166 
2167  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2168  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2169  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2170 
2171  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2172  // TODO: Should this propagate fast-math-flags?
2173  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2174 }
2175 
2176 SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2177  double Log2BaseInverted) const {
2178  EVT VT = Op.getValueType();
2179 
2180  SDLoc SL(Op);
2181  SDValue Operand = Op.getOperand(0);
2182  SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2183  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2184 
2185  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2186 }
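// Editorial sketch (not part of this file): the identity used above is
// log_b(x) = log2(x) * (1 / log2(b)); refLog is a hypothetical helper.
#include <cmath>

static float refLog(float X, float Log2BaseInverted) {
  return std::log2(X) * Log2BaseInverted;
}
// refLog(x, ln(2)) approximates ln(x) and refLog(x, log10(2)) approximates
// log10(x), since 1/log2(e) == ln(2) and 1/log2(10) == log10(2).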
2187 
2188 static bool isCtlzOpc(unsigned Opc) {
2189  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2190 }
2191 
2192 static bool isCttzOpc(unsigned Opc) {
2193  return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2194 }
2195 
2196 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2197  SDLoc SL(Op);
2198  SDValue Src = Op.getOperand(0);
2199  bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
2200  Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
2201 
2202  unsigned ISDOpc, NewOpc;
2203  if (isCtlzOpc(Op.getOpcode())) {
2204  ISDOpc = ISD::CTLZ_ZERO_UNDEF;
2205  NewOpc = AMDGPUISD::FFBH_U32;
2206  } else if (isCttzOpc(Op.getOpcode())) {
2207  ISDOpc = ISD::CTTZ_ZERO_UNDEF;
2208  NewOpc = AMDGPUISD::FFBL_B32;
2209  } else
2210  llvm_unreachable("Unexpected opcode");
2211 
2212 
2213  if (ZeroUndef && Src.getValueType() == MVT::i32)
2214  return DAG.getNode(NewOpc, SL, MVT::i32, Src);
2215 
2216  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2217 
2218  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2219  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2220 
2221  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
2222  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
2223 
2224  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2225  *DAG.getContext(), MVT::i32);
2226 
2227  SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
2228  SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
2229 
2230  SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
2231  SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
2232 
2233  const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
2234  SDValue Add, NewOpr;
2235  if (isCtlzOpc(Op.getOpcode())) {
2236  Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
2237  // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
2238  NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
2239  } else {
2240  Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
2241  // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
2242  NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
2243  }
2244 
2245  if (!ZeroUndef) {
2246  // Test if the full 64-bit input is zero.
2247 
2248  // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
2249  // which we probably don't want.
2250  SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
2251  SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
2252  SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
2253 
2254  // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
2255  // with the same cycles, otherwise it is slower.
2256  // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
2257  // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
2258 
2259  const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
2260 
2261  // The instruction returns -1 for 0 input, but the defined intrinsic
2262  // behavior is to return the number of bits.
2263  NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
2264  SrcIsZero, Bits32, NewOpr);
2265  }
2266 
2267  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2268 }
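// Editorial sketch (not part of this file): the 32-bit-half recurrences above
// using GCC/Clang builtins (undefined for zero input, like the *_ZERO_UNDEF
// nodes); the 64-bit all-zero case is patched to return the bit width.
#include <cstdint>

static unsigned refCtlz64(uint64_t X) {
  if (X == 0) return 64;                 // the instruction alone would give -1
  uint32_t Lo = (uint32_t)X, Hi = (uint32_t)(X >> 32);
  // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
  return Hi == 0 ? __builtin_clz(Lo) + 32 : __builtin_clz(Hi);
}

static unsigned refCttz64(uint64_t X) {
  if (X == 0) return 64;
  uint32_t Lo = (uint32_t)X, Hi = (uint32_t)(X >> 32);
  // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
  return Lo == 0 ? __builtin_ctz(Hi) + 32 : __builtin_ctz(Lo);
}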
2269 
2270 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2271  bool Signed) const {
2272  // Unsigned
2273  // cul2f(ulong u)
2274  //{
2275  // uint lz = clz(u);
2276  // uint e = (u != 0) ? 127U + 63U - lz : 0;
2277  // u = (u << lz) & 0x7fffffffffffffffUL;
2278  // ulong t = u & 0xffffffffffUL;
2279  // uint v = (e << 23) | (uint)(u >> 40);
2280  // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
2281  // return as_float(v + r);
2282  //}
2283  // Signed
2284  // cl2f(long l)
2285  //{
2286  // long s = l >> 63;
2287  // float r = cul2f((l + s) ^ s);
2288  // return s ? -r : r;
2289  //}
2290 
2291  SDLoc SL(Op);
2292  SDValue Src = Op.getOperand(0);
2293  SDValue L = Src;
2294 
2295  SDValue S;
2296  if (Signed) {
2297  const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
2298  S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
2299 
2300  SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
2301  L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
2302  }
2303 
2304  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2305  *DAG.getContext(), MVT::f32);
2306 
2307 
2308  SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
2309  SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
2310  SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
2311  LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
2312 
2313  SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
2314  SDValue E = DAG.getSelect(SL, MVT::i32,
2315  DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
2316  DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
2317  ZeroI32);
2318 
2319  SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
2320  DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
2321  DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
2322 
2323  SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
2324  DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
2325 
2326  SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
2327  U, DAG.getConstant(40, SL, MVT::i64));
2328 
2329  SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
2330  DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
2331  DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl));
2332 
2333  SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
2334  SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
2335  SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
2336 
2337  SDValue One = DAG.getConstant(1, SL, MVT::i32);
2338 
2339  SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
2340 
2341  SDValue R = DAG.getSelect(SL, MVT::i32,
2342  RCmp,
2343  One,
2344  DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
2345  R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
2346  R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
2347 
2348  if (!Signed)
2349  return R;
2350 
2351  SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
2352  return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
2353 }
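// Editorial sketch (not part of this file): a direct C++ transcription of the
// cul2f() pseudocode above; refCul2f is a hypothetical reference helper.
#include <cstdint>
#include <cstring>

static float refCul2f(uint64_t U) {
  if (U == 0)
    return 0.0f;                                       // e = 0 case
  unsigned LZ = __builtin_clzll(U);                    // lz = clz(u)
  uint32_t E = 127u + 63u - LZ;                        // biased f32 exponent
  U = (U << LZ) & 0x7fffffffffffffffULL;               // normalize, drop leading 1
  uint64_t T = U & 0xffffffffffULL;                    // round and sticky bits
  uint32_t V = (E << 23) | (uint32_t)(U >> 40);
  uint32_t R = T > 0x8000000000ULL ? 1u
             : (T == 0x8000000000ULL ? (V & 1u) : 0u); // round to nearest even
  V += R;
  float F;
  std::memcpy(&F, &V, sizeof(F));                      // as_float(v + r)
  return F;
}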
2354 
2355 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2356  bool Signed) const {
2357  SDLoc SL(Op);
2358  SDValue Src = Op.getOperand(0);
2359 
2360  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2361 
2362  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2363  DAG.getConstant(0, SL, MVT::i32));
2364  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2365  DAG.getConstant(1, SL, MVT::i32));
2366 
2367  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2368  SL, MVT::f64, Hi);
2369 
2370  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2371 
2372  SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2373  DAG.getConstant(32, SL, MVT::i32));
2374  // TODO: Should this propagate fast-math-flags?
2375  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2376 }
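// Editorial sketch (not part of this file): the hi/lo split above in scalar
// form, where ldexp(hi, 32) plays the role of the LDEXP node; for the signed
// variant the high half converts as int32_t. refUint64ToF64 is hypothetical.
#include <cmath>
#include <cstdint>

static double refUint64ToF64(uint64_t Src) {
  uint32_t Lo = (uint32_t)Src;
  uint32_t Hi = (uint32_t)(Src >> 32);
  return std::ldexp((double)Hi, 32) + (double)Lo; // fadd(ldexp(cvt_hi, 32), cvt_lo)
}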
2377 
2378 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2379  SelectionDAG &DAG) const {
2380  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2381  "operation should be legal");
2382 
2383  // TODO: Factor out code common with LowerSINT_TO_FP.
2384 
2385  EVT DestVT = Op.getValueType();
2386  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2387  SDLoc DL(Op);
2388  SDValue Src = Op.getOperand(0);
2389 
2390  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2391  SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2392  SDValue FPRound =
2393  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2394 
2395  return FPRound;
2396  }
2397 
2398  if (DestVT == MVT::f32)
2399  return LowerINT_TO_FP32(Op, DAG, false);
2400 
2401  assert(DestVT == MVT::f64);
2402  return LowerINT_TO_FP64(Op, DAG, false);
2403 }
2404 
2405 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2406  SelectionDAG &DAG) const {
2407  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2408  "operation should be legal");
2409 
2410  // TODO: Factor out code common with LowerUINT_TO_FP.
2411 
2412  EVT DestVT = Op.getValueType();
2413  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2414  SDLoc DL(Op);
2415  SDValue Src = Op.getOperand(0);
2416 
2417  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2418  SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2419  SDValue FPRound =
2420  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2421 
2422  return FPRound;
2423  }
2424 
2425  if (DestVT == MVT::f32)
2426  return LowerINT_TO_FP32(Op, DAG, true);
2427 
2428  assert(DestVT == MVT::f64);
2429  return LowerINT_TO_FP64(Op, DAG, true);
2430 }
2431 
2432 SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2433  bool Signed) const {
2434  SDLoc SL(Op);
2435 
2436  SDValue Src = Op.getOperand(0);
2437 
2438  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2439 
2440  SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2441  MVT::f64);
2442  SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2443  MVT::f64);
2444  // TODO: Should this propagate fast-math-flags?
2445  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2446 
2447  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2448 
2449 
2450  SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2451 
2452  SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2453  MVT::i32, FloorMul);
2454  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2455 
2456  SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2457 
2458  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2459 }
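// Editorial sketch (not part of this file): the split above in scalar form
// for in-range inputs; K0 = 2^-32 and K1 = -2^32, and the fma recovers the
// low 32 bits exactly. refF64ToU64 is a hypothetical reference helper.
#include <cmath>
#include <cstdint>

static uint64_t refF64ToU64(double Src) {
  double Trunc = std::trunc(Src);
  double HiPart = std::floor(Trunc * (1.0 / 4294967296.0)); // ffloor(fmul(trunc, K0))
  double LoPart = std::fma(HiPart, -4294967296.0, Trunc);   // fma(hi, K1, trunc)
  uint32_t Hi = (uint32_t)HiPart;                           // fp_to_[su]int when signed
  uint32_t Lo = (uint32_t)LoPart;                           // fp_to_uint
  return ((uint64_t)Hi << 32) | Lo;                         // build_vector {Lo, Hi}
}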
2460 
2461 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2462  SDLoc DL(Op);
2463  SDValue N0 = Op.getOperand(0);
2464 
2465  // Convert to target node to get known bits
2466  if (N0.getValueType() == MVT::f32)
2467  return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2468 
2469  if (getTargetMachine().Options.UnsafeFPMath) {
2470  // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2471  return SDValue();
2472  }
2473 
2474  assert(N0.getSimpleValueType() == MVT::f64);
2475 
2476  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2477  const unsigned ExpMask = 0x7ff;
2478  const unsigned ExpBiasf64 = 1023;
2479  const unsigned ExpBiasf16 = 15;
2480  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2481  SDValue One = DAG.getConstant(1, DL, MVT::i32);
2482  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2483  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2484  DAG.getConstant(32, DL, MVT::i64));
2485  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2486  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2487  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2488  DAG.getConstant(20, DL, MVT::i64));
2489  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2490  DAG.getConstant(ExpMask, DL, MVT::i32));
2491  // Subtract the fp64 exponent bias (1023) to get the real exponent and
2492  // add the f16 bias (15) to get the biased exponent for the f16 format.
2493  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2494  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2495 
2496  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2497  DAG.getConstant(8, DL, MVT::i32));
2498  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2499  DAG.getConstant(0xffe, DL, MVT::i32));
2500 
2501  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2502  DAG.getConstant(0x1ff, DL, MVT::i32));
2503  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2504 
2505  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2506  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2507 
2508  // (M != 0 ? 0x0200 : 0) | 0x7c00;
2509  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2510  DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2511  Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2512 
2513  // N = M | (E << 12);
2514  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2515  DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2516  DAG.getConstant(12, DL, MVT::i32)));
2517 
2518  // B = clamp(1-E, 0, 13);
2519  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2520  One, E);
2521  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2522  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2523  DAG.getConstant(13, DL, MVT::i32));
2524 
2525  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2526  DAG.getConstant(0x1000, DL, MVT::i32));
2527 
2528  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2529  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2530  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2531  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2532 
2533  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2534  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2535  DAG.getConstant(0x7, DL, MVT::i32));
2536  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2537  DAG.getConstant(2, DL, MVT::i32));
2538  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2539  One, Zero, ISD::SETEQ);
2540  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2541  One, Zero, ISD::SETGT);
2542  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2543  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2544 
2545  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2546  DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2547  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2548  I, V, ISD::SETEQ);
2549 
2550  // Extract the sign bit.
2551  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2552  DAG.getConstant(16, DL, MVT::i32));
2553  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2554  DAG.getConstant(0x8000, DL, MVT::i32));
2555 
2556  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2557  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2558 }
2559 
2560 SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2561  SelectionDAG &DAG) const {
2562  SDValue Src = Op.getOperand(0);
2563 
2564  // TODO: Factor out code common with LowerFP_TO_UINT.
2565 
2566  EVT SrcVT = Src.getValueType();
2567  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2568  SDLoc DL(Op);
2569 
2570  SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2571  SDValue FpToInt32 =
2572  DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2573 
2574  return FpToInt32;
2575  }
2576 
2577  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2578  return LowerFP64_TO_INT(Op, DAG, true);
2579 
2580  return SDValue();
2581 }
2582 
2583 SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2584  SelectionDAG &DAG) const {
2585  SDValue Src = Op.getOperand(0);
2586 
2587  // TODO: Factor out code common with LowerFP_TO_SINT.
2588 
2589  EVT SrcVT = Src.getValueType();
2590  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2591  SDLoc DL(Op);
2592 
2593  SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2594  SDValue FpToInt32 =
2595  DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2596 
2597  return FpToInt32;
2598  }
2599 
2600  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2601  return LowerFP64_TO_INT(Op, DAG, false);
2602 
2603  return SDValue();
2604 }
2605 
2606 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2607  SelectionDAG &DAG) const {
2608  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2609  MVT VT = Op.getSimpleValueType();
2610  MVT ScalarVT = VT.getScalarType();
2611 
2612  assert(VT.isVector());
2613 
2614  SDValue Src = Op.getOperand(0);
2615  SDLoc DL(Op);
2616 
2617  // TODO: Don't scalarize on Evergreen?
2618  unsigned NElts = VT.getVectorNumElements();
2619  SmallVector<SDValue, 8> Args;
2620  DAG.ExtractVectorElements(Src, Args, 0, NElts);
2621 
2622  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2623  for (unsigned I = 0; I < NElts; ++I)
2624  Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2625 
2626  return DAG.getBuildVector(VT, DL, Args);
2627 }
2628 
2629 //===----------------------------------------------------------------------===//
2630 // Custom DAG optimizations
2631 //===----------------------------------------------------------------------===//
2632 
2633 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2634  return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2635 }
2636 
2637 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2638  EVT VT = Op.getValueType();
2639  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2640  // as unsigned 24-bit values.
2641  AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
2642 }
2643 
2644 static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
2645  TargetLowering::DAGCombinerInfo &DCI) {
2646 
2647  SelectionDAG &DAG = DCI.DAG;
2648  SDValue Op = Node24->getOperand(OpIdx);
2649  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2650  EVT VT = Op.getValueType();
2651 
2652  APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
2653  APInt KnownZero, KnownOne;
2654  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
2655  if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO))
2656  return true;
2657 
2658  return false;
2659 }
2660 
2661 template <typename IntTy>
2662 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2663  uint32_t Width, const SDLoc &DL) {
2664  if (Width + Offset < 32) {
2665  uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2666  IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2667  return DAG.getConstant(Result, DL, MVT::i32);
2668  }
2669 
2670  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2671 }
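// Editorial sketch (not part of this file): the shift pair above on plain
// integers; a signed IntTy reproduces BFE_I32 (arithmetic extend), an
// unsigned one BFE_U32. refBFE is a hypothetical reference helper.
#include <cstdint>

template <typename IntTy>
static IntTy refBFE(uint32_t Src0, uint32_t Offset, uint32_t Width) {
  if (Width + Offset < 32) {
    uint32_t Shl = Src0 << (32 - Offset - Width);   // discard bits above the field
    return static_cast<IntTy>(Shl) >> (32 - Width); // extend from the field's top bit
  }
  return static_cast<IntTy>(Src0) >> Offset;
}
// e.g. refBFE<int32_t>(0xF0, 4, 4) == -1, refBFE<uint32_t>(0xF0, 4, 4) == 15.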
2672 
2673 static bool hasVolatileUser(SDNode *Val) {
2674  for (SDNode *U : Val->uses()) {
2675  if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2676  if (M->isVolatile())
2677  return true;
2678  }
2679  }
2680 
2681  return false;
2682 }
2683 
2684 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2685  // i32 vectors are the canonical memory type.
2686  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2687  return false;
2688 
2689  if (!VT.isByteSized())
2690  return false;
2691 
2692  unsigned Size = VT.getStoreSize();
2693 
2694  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2695  return false;
2696 
2697  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2698  return false;
2699 
2700  return true;
2701 }
2702 
2703 // Replace load of an illegal type with a store of a bitcast to a friendlier
2704 // type.
2705 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2706  DAGCombinerInfo &DCI) const {
2707  if (!DCI.isBeforeLegalize())
2708  return SDValue();
2709 
2710  LoadSDNode *LN = cast<LoadSDNode>(N);
2711  if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2712  return SDValue();
2713 
2714  SDLoc SL(N);
2715  SelectionDAG &DAG = DCI.DAG;
2716  EVT VT = LN->getMemoryVT();
2717 
2718  unsigned Size = VT.getStoreSize();
2719  unsigned Align = LN->getAlignment();
2720  if (Align < Size && isTypeLegal(VT)) {
2721  bool IsFast;
2722  unsigned AS = LN->getAddressSpace();
2723 
2724  // Expand unaligned loads earlier than legalization. Due to visitation order
2725  // problems during legalization, the emitted instructions to pack and unpack
2726  // the bytes again are not eliminated in the case of an unaligned copy.
2727  if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2728  if (VT.isVector())
2729  return scalarizeVectorLoad(LN, DAG);
2730 
2731  SDValue Ops[2];
2732  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2733  return DAG.getMergeValues(Ops, SDLoc(N));
2734  }
2735 
2736  if (!IsFast)
2737  return SDValue();
2738  }
2739 
2740  if (!shouldCombineMemoryType(VT))
2741  return SDValue();
2742 
2743  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2744 
2745  SDValue NewLoad
2746  = DAG.getLoad(NewVT, SL, LN->getChain(),
2747  LN->getBasePtr(), LN->getMemOperand());
2748 
2749  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2750  DCI.CombineTo(N, BC, NewLoad.getValue(1));
2751  return SDValue(N, 0);
2752 }
2753 
2754 // Replace store of an illegal type with a store of a bitcast to a friendlier
2755 // type.
2756 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2757  DAGCombinerInfo &DCI) const {
2758  if (!DCI.isBeforeLegalize())
2759  return SDValue();
2760 
2761  StoreSDNode *SN = cast<StoreSDNode>(N);
2762  if (SN->isVolatile() || !ISD::isNormalStore(SN))
2763  return SDValue();
2764 
2765  EVT VT = SN->getMemoryVT();
2766  unsigned Size = VT.getStoreSize();
2767 
2768  SDLoc SL(N);
2769  SelectionDAG &DAG = DCI.DAG;
2770  unsigned Align = SN->getAlignment();
2771  if (Align < Size && isTypeLegal(VT)) {
2772  bool IsFast;
2773  unsigned AS = SN->getAddressSpace();
2774 
2775  // Expand unaligned stores earlier than legalization. Due to visitation
2776  // order problems during legalization, the emitted instructions to pack and
2777  // unpack the bytes again are not eliminated in the case of an unaligned
2778  // copy.
2779  if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2780  if (VT.isVector())
2781  return scalarizeVectorStore(SN, DAG);
2782 
2783  return expandUnalignedStore(SN, DAG);
2784  }
2785 
2786  if (!IsFast)
2787  return SDValue();
2788  }
2789 
2790  if (!shouldCombineMemoryType(VT))
2791  return SDValue();
2792 
2793  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2794  SDValue Val = SN->getValue();
2795 
2796  //DCI.AddToWorklist(Val.getNode());
2797 
2798  bool OtherUses = !Val.hasOneUse();
2799  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2800  if (OtherUses) {
2801  SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2802  DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2803  }
2804 
2805  return DAG.getStore(SN->getChain(), SL, CastVal,
2806  SN->getBasePtr(), SN->getMemOperand());
2807 }
2808 
2809 SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
2810  DAGCombinerInfo &DCI) const {
2811  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
2812  if (!CSrc)
2813  return SDValue();
2814 
2815  const APFloat &F = CSrc->getValueAPF();
2816  APFloat Zero = APFloat::getZero(F.getSemantics());
2817  APFloat::cmpResult Cmp0 = F.compare(Zero);
2818  if (Cmp0 == APFloat::cmpLessThan ||
2819  (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
2820  return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
2821  }
2822 
2823  APFloat One(F.getSemantics(), "1.0");
2824  APFloat::cmpResult Cmp1 = F.compare(One);
2825  if (Cmp1 == APFloat::cmpGreaterThan)
2826  return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
2827 
2828  return SDValue(CSrc, 0);
2829 }
2830 
2831 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
2832 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
2833 // issues.
2834 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
2835  DAGCombinerInfo &DCI) const {
2836  SelectionDAG &DAG = DCI.DAG;
2837  SDValue N0 = N->getOperand(0);
2838 
2839  // (vt2 (assertzext (truncate vt0:x), vt1)) ->
2840  // (vt2 (truncate (assertzext vt0:x, vt1)))
2841  if (N0.getOpcode() == ISD::TRUNCATE) {
2842  SDValue N1 = N->getOperand(1);
2843  EVT ExtVT = cast<VTSDNode>(N1)->getVT();
2844  SDLoc SL(N);
2845 
2846  SDValue Src = N0.getOperand(0);
2847  EVT SrcVT = Src.getValueType();
2848  if (SrcVT.bitsGE(ExtVT)) {
2849  SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
2850  return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
2851  }
2852  }
2853 
2854  return SDValue();
2855 }
2856 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
2857 /// binary operation \p Opc to it with the corresponding constant operands.
2858 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
2859  DAGCombinerInfo &DCI, const SDLoc &SL,
2860  unsigned Opc, SDValue LHS,
2861  uint32_t ValLo, uint32_t ValHi) const {
2862  SelectionDAG &DAG = DCI.DAG;
2863  SDValue Lo, Hi;
2864  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
2865 
2866  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
2867  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
2868 
2869  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
2870  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
2871 
2872  // Re-visit the ands. It's possible we eliminated one of them and it could
2873  // simplify the vector.
2874  DCI.AddToWorklist(Lo.getNode());
2875  DCI.AddToWorklist(Hi.getNode());
2876 
2877  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
2878  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2879 }
2880 
2881 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
2882  DAGCombinerInfo &DCI) const {
2883  EVT VT = N->getValueType(0);
2884 
2886  if (!RHS)
2887  return SDValue();
2888 
2889  SDValue LHS = N->getOperand(0);
2890  unsigned RHSVal = RHS->getZExtValue();
2891  if (!RHSVal)
2892  return LHS;
2893 
2894  SDLoc SL(N);
2895  SelectionDAG &DAG = DCI.DAG;
2896 
2897  switch (LHS->getOpcode()) {
2898  default:
2899  break;
2900  case ISD::ZERO_EXTEND:
2901  case ISD::SIGN_EXTEND:
2902  case ISD::ANY_EXTEND: {
2903  SDValue X = LHS->getOperand(0);
2904 
2905  if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
2906  isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
2907  // Prefer build_vector as the canonical form if packed types are legal.
2908  // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
2909  SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
2910  { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
2911  return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
2912  }
2913 
2914  // shl (ext x) => zext (shl x), if shift does not overflow int
2915  if (VT != MVT::i64)
2916  break;
2917  KnownBits Known;
2918  DAG.computeKnownBits(X, Known);
2919  unsigned LZ = Known.countMinLeadingZeros();
2920  if (LZ < RHSVal)
2921  break;
2922  EVT XVT = X.getValueType();
2923  SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
2924  return DAG.getZExtOrTrunc(Shl, SL, VT);
2925  }
2926  }
2927 
2928  if (VT != MVT::i64)
2929  return SDValue();
2930 
2931  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
2932 
2933  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
2934  // common case, splitting this into a move and a 32-bit shift is faster and
2935  // the same code size.
2936  if (RHSVal < 32)
2937  return SDValue();
2938 
2939  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
2940 
2941  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
2942  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
2943 
2944  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2945 
2946  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
2947  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2948 }
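// Editorial sketch (not part of this file): why the split is legal for a
// shift amount C in [32, 64): every surviving bit comes from the low half.
// refShl64 is a hypothetical reference helper.
#include <cstdint>

static uint64_t refShl64(uint64_t X, unsigned C) {
  uint32_t NewShift = (uint32_t)X << (C - 32); // shl lo_32(x), C - 32
  return (uint64_t)NewShift << 32;             // build_pair 0, NewShift
}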
2949 
2950 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
2951  DAGCombinerInfo &DCI) const {
2952  if (N->getValueType(0) != MVT::i64)
2953  return SDValue();
2954 
2955  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2956  if (!RHS)
2957  return SDValue();
2958 
2959  SelectionDAG &DAG = DCI.DAG;
2960  SDLoc SL(N);
2961  unsigned RHSVal = RHS->getZExtValue();
2962 
2963  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
2964  if (RHSVal == 32) {
2965  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2966  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2967  DAG.getConstant(31, SL, MVT::i32));
2968 
2969  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
2970  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2971  }
2972 
2973  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
2974  if (RHSVal == 63) {
2975  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2976  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2977  DAG.getConstant(31, SL, MVT::i32));
2978  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
2979  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2980  }
2981 
2982  return SDValue();
2983 }
2984 
2985 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
2986  DAGCombinerInfo &DCI) const {
2987  if (N->getValueType(0) != MVT::i64)
2988  return SDValue();
2989 
2990  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2991  if (!RHS)
2992  return SDValue();
2993 
2994  unsigned ShiftAmt = RHS->getZExtValue();
2995  if (ShiftAmt < 32)
2996  return SDValue();
2997 
2998  // srl i64:x, C for C >= 32
2999  // =>
3000  // build_pair (srl hi_32(x), C - 32), 0
3001 
3002  SelectionDAG &DAG = DCI.DAG;
3003  SDLoc SL(N);
3004 
3005  SDValue One = DAG.getConstant(1, SL, MVT::i32);
3006  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3007 
3008  SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
3009  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
3010  VecOp, One);
3011 
3012  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3013  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3014 
3015  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3016 
3017  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3018 }
3019 
3020 // We need to specifically handle i64 mul here to avoid unnecessary conversion
3021 // instructions. If we only match on the legalized i64 mul expansion,
3022 // SimplifyDemandedBits will be unable to remove them because there will be
3023 // multiple uses due to the separate mul + mulh[su].
3024 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3025  SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3026  if (Size <= 32) {
3027  unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3028  return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3029  }
3030 
3031  // Because we want to eliminate extension instructions before the
3032  // operation, we need to create a single user here (i.e. not the separate
3033  // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
3034 
3035  unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
3036 
3037  SDValue Mul = DAG.getNode(MulOpc, SL,
3038  DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
3039 
3040  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
3041  Mul.getValue(0), Mul.getValue(1));
3042 }
3043 
3044 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3045  DAGCombinerInfo &DCI) const {
3046  EVT VT = N->getValueType(0);
3047 
3048  unsigned Size = VT.getSizeInBits();
3049  if (VT.isVector() || Size > 64)
3050  return SDValue();
3051 
3052  // There are i16 integer mul/mad.
3053  if (Subtarget->has16BitInsts() && VT.getScalarType() == MVT::i16)
3054  return SDValue();
3055 
3056  SelectionDAG &DAG = DCI.DAG;
3057  SDLoc DL(N);
3058 
3059  SDValue N0 = N->getOperand(0);
3060  SDValue N1 = N->getOperand(1);
3061  SDValue Mul;
3062 
3063  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3064  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3065  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3066  Mul = getMul24(DAG, DL, N0, N1, Size, false);
3067  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3068  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3069  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3070  Mul = getMul24(DAG, DL, N0, N1, Size, true);
3071  } else {
3072  return SDValue();
3073  }
3074 
3075  // We need to use sext even for MUL_U24, because MUL_U24 is used
3076  // for signed multiply of 8 and 16-bit types.
3077  return DAG.getSExtOrTrunc(Mul, DL, VT);
3078 }
3079 
3080 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3081  DAGCombinerInfo &DCI) const {
3082  EVT VT = N->getValueType(0);
3083 
3084  if (!Subtarget->hasMulI24() || VT.isVector())
3085  return SDValue();
3086 
3087  SelectionDAG &DAG = DCI.DAG;
3088  SDLoc DL(N);
3089 
3090  SDValue N0 = N->getOperand(0);
3091  SDValue N1 = N->getOperand(1);
3092 
3093  if (!isI24(N0, DAG) || !isI24(N1, DAG))
3094  return SDValue();
3095 
3096  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3097  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3098 
3099  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3100  DCI.AddToWorklist(Mulhi.getNode());
3101  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3102 }
3103 
3104 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3105  DAGCombinerInfo &DCI) const {
3106  EVT VT = N->getValueType(0);
3107 
3108  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3109  return SDValue();
3110 
3111  SelectionDAG &DAG = DCI.DAG;
3112  SDLoc DL(N);
3113 
3114  SDValue N0 = N->getOperand(0);
3115  SDValue N1 = N->getOperand(1);
3116 
3117  if (!isU24(N0, DAG) || !isU24(N1, DAG))
3118  return SDValue();
3119 
3120  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3121  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3122 
3123  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3124  DCI.AddToWorklist(Mulhi.getNode());
3125  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3126 }
3127 
3128 SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
3129  SDNode *N, DAGCombinerInfo &DCI) const {
3130  SelectionDAG &DAG = DCI.DAG;
3131 
3132  // Simplify demanded bits before splitting into multiple users.
3133  if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
3134  return SDValue();
3135 
3136  SDValue N0 = N->getOperand(0);
3137  SDValue N1 = N->getOperand(1);
3138 
3139  bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
3140 
3141  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3142  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3143 
3144  SDLoc SL(N);
3145 
3146  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3147  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3148  return DAG.getMergeValues({ MulLo, MulHi }, SL);
3149 }
3150 
3151 static bool isNegativeOne(SDValue Val) {
3152  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3153  return C->isAllOnesValue();
3154  return false;
3155 }
3156 
3157 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3158  SDValue Op,
3159  const SDLoc &DL,
3160  unsigned Opc) const {
3161  EVT VT = Op.getValueType();
3162  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3163  if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3164  LegalVT != MVT::i16))
3165  return SDValue();
3166 
3167  if (VT != MVT::i32)
3168  Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3169 
3170  SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3171  if (VT != MVT::i32)
3172  FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3173 
3174  return FFBX;
3175 }
3176 
3177 // The native instructions return -1 on 0 input. Optimize out a select that
3178 // produces -1 on 0.
3179 //
3180 // TODO: If zero is not undef, we could also do this if the output is compared
3181 // against the bitwidth.
3182 //
3183 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3184 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3185  SDValue LHS, SDValue RHS,
3186  DAGCombinerInfo &DCI) const {
3187  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3188  if (!CmpRhs || !CmpRhs->isNullValue())
3189  return SDValue();
3190 
3191  SelectionDAG &DAG = DCI.DAG;
3192  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3193  SDValue CmpLHS = Cond.getOperand(0);
3194 
3195  unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
3196  AMDGPUISD::FFBH_U32;
3197 
3198  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3199  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3200  if (CCOpcode == ISD::SETEQ &&
3201  (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3202  RHS.getOperand(0) == CmpLHS &&
3203  isNegativeOne(LHS)) {
3204  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3205  }
3206 
3207  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3208  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3209  if (CCOpcode == ISD::SETNE &&
3210  (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3211  LHS.getOperand(0) == CmpLHS &&
3212  isNegativeOne(RHS)) {
3213  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3214  }
3215 
3216  return SDValue();
3217 }
3218 
3219 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3220  unsigned Op,
3221  const SDLoc &SL,
3222  SDValue Cond,
3223  SDValue N1,
3224  SDValue N2) {
3225  SelectionDAG &DAG = DCI.DAG;
3226  EVT VT = N1.getValueType();
3227 
3228  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3229  N1.getOperand(0), N2.getOperand(0));
3230  DCI.AddToWorklist(NewSelect.getNode());
3231  return DAG.getNode(Op, SL, VT, NewSelect);
3232 }
3233 
3234 // Pull a free FP operation out of a select so it may fold into uses.
3235 //
3236 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3237 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3238 //
3239 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3240 // select c, (fabs x), +k -> fabs (select c, x, k)
3241 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3242  SDValue N) {
3243  SelectionDAG &DAG = DCI.DAG;
3244  SDValue Cond = N.getOperand(0);
3245  SDValue LHS = N.getOperand(1);
3246  SDValue RHS = N.getOperand(2);
3247 
3248  EVT VT = N.getValueType();
3249  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3250  (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3251  return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3252  SDLoc(N), Cond, LHS, RHS);
3253  }
3254 
3255  bool Inv = false;
3256  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3257  std::swap(LHS, RHS);
3258  Inv = true;
3259  }
3260 
3261  // TODO: Support vector constants.
3262  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3263  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3264  SDLoc SL(N);
3265  // If one side is an fneg/fabs and the other is a constant, we can push the
3266  // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3267  SDValue NewLHS = LHS.getOperand(0);
3268  SDValue NewRHS = RHS;
3269 
3270  // Careful: if the neg can be folded up, don't try to pull it back down.
3271  bool ShouldFoldNeg = true;
3272 
3273  if (NewLHS.hasOneUse()) {
3274  unsigned Opc = NewLHS.getOpcode();
3275  if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3276  ShouldFoldNeg = false;
3277  if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3278  ShouldFoldNeg = false;
3279  }
3280 
3281  if (ShouldFoldNeg) {
3282  if (LHS.getOpcode() == ISD::FNEG)
3283  NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3284  else if (CRHS->isNegative())
3285  return SDValue();
3286 
3287  if (Inv)
3288  std::swap(NewLHS, NewRHS);
3289 
3290  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3291  Cond, NewLHS, NewRHS);
3292  DCI.AddToWorklist(NewSelect.getNode());
3293  return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3294  }
3295  }
3296 
3297  return SDValue();
3298 }
3299 
3300 
3301 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3302  DAGCombinerInfo &DCI) const {
3303  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3304  return Folded;
3305 
3306  SDValue Cond = N->getOperand(0);
3307  if (Cond.getOpcode() != ISD::SETCC)
3308  return SDValue();
3309 
3310  EVT VT = N->getValueType(0);
3311  SDValue LHS = Cond.getOperand(0);
3312  SDValue RHS = Cond.getOperand(1);
3313  SDValue CC = Cond.getOperand(2);
3314 
3315  SDValue True = N->getOperand(1);
3316  SDValue False = N->getOperand(2);
3317 
3318  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3319  SelectionDAG &DAG = DCI.DAG;
3320  if (DAG.isConstantValueOfAnyType(True) &&
3321  !DAG.isConstantValueOfAnyType(False)) {
3324  // Swap cmp + select pair to move constant to false input.
3325  // This will allow using VOPC cndmasks more often.
3326  // select (setcc x, y, cc), k, v -> select (setcc x, y, !cc), v, k
3327 
3328  SDLoc SL(N);
3329  ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
3330  LHS.getValueType().isInteger());
3331 
3332  SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3333  return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3334  }
3335 
3336  if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3337  SDValue MinMax
3338  = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3339  // Revisit this node so we can catch min3/max3/med3 patterns.
3340  //DCI.AddToWorklist(MinMax.getNode());
3341  return MinMax;
3342  }
3343  }
3344 
3345  // There's no reason to not do this if the condition has other uses.
3346  return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3347 }
3348 
3349 static bool isConstantFPZero(SDValue N) {
3350  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
3351  return C->isZero() && !C->isNegative();
3352  return false;
3353 }
3354 
3355 static unsigned inverseMinMax(unsigned Opc) {
3356  switch (Opc) {
3357  case ISD::FMAXNUM:
3358  return ISD::FMINNUM;
3359  case ISD::FMINNUM:
3360  return ISD::FMAXNUM;
3361  case AMDGPUISD::FMAX_LEGACY:
3362  return AMDGPUISD::FMIN_LEGACY;
3363  case AMDGPUISD::FMIN_LEGACY:
3364  return AMDGPUISD::FMAX_LEGACY;
3365  default:
3366  llvm_unreachable("invalid min/max opcode");
3367  }
3368 }
3369 
3370 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3371  DAGCombinerInfo &DCI) const {
3372  SelectionDAG &DAG = DCI.DAG;
3373  SDValue N0 = N->getOperand(0);
3374  EVT VT = N->getValueType(0);
3375 
3376  unsigned Opc = N0.getOpcode();
3377 
3378  // If the input has multiple uses and we can either fold the negate down, or
3379  // the other uses cannot, give up. This both prevents unprofitable
3380  // transformations and infinite loops: we won't repeatedly try to fold around
3381  // a negate that has no 'good' form.
3382  if (N0.hasOneUse()) {
3383  // This may be able to fold into the source, but at a code size cost. Don't
3384  // fold if the fold into the user is free.
3385  if (allUsesHaveSourceMods(N, 0))
3386  return SDValue();
3387  } else {
3388  if (fnegFoldsIntoOp(Opc) &&
3389  (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3390  return SDValue();
3391  }
3392 
3393  SDLoc SL(N);
3394  switch (Opc) {
3395  case ISD::FADD: {
3396  if (!mayIgnoreSignedZero(N0))
3397  return SDValue();
3398 
3399  // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3400  SDValue LHS = N0.getOperand(0);
3401  SDValue RHS = N0.getOperand(1);
3402 
3403  if (LHS.getOpcode() != ISD::FNEG)
3404  LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3405  else
3406  LHS = LHS.getOperand(0);
3407 
3408  if (RHS.getOpcode() != ISD::FNEG)
3409  RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3410  else
3411  RHS = RHS.getOperand(0);
3412 
3413  SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3414  if (!N0.hasOneUse())
3415  DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3416  return Res;
3417  }
3418  case ISD::FMUL:
3419  case AMDGPUISD::FMUL_LEGACY: {
3420  // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3421  // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3422  SDValue LHS = N0.getOperand(0);
3423  SDValue RHS = N0.getOperand(1);
3424 
3425  if (LHS.getOpcode() == ISD::FNEG)
3426  LHS = LHS.getOperand(0);
3427  else if (RHS.getOpcode() == ISD::FNEG)
3428  RHS = RHS.getOperand(0);
3429  else
3430  RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3431 
3432  SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3433  if (!N0.hasOneUse())
3434  DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3435  return Res;
3436  }
3437  case ISD::FMA:
3438  case ISD::FMAD: {
3439  if (!mayIgnoreSignedZero(N0))
3440  return SDValue();
3441 
3442  // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3443  SDValue LHS = N0.getOperand(0);
3444  SDValue MHS = N0.getOperand(1);
3445  SDValue RHS = N0.getOperand(2);
3446 
3447  if (LHS.getOpcode() == ISD::FNEG)
3448  LHS = LHS.getOperand(0);
3449  else if (MHS.getOpcode() == ISD::FNEG)
3450  MHS = MHS.getOperand(0);
3451  else
3452  MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3453 
3454  if (RHS.getOpcode() != ISD::FNEG)
3455  RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3456  else
3457  RHS = RHS.getOperand(0);
3458 
3459  SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3460  if (!N0.hasOneUse())
3461  DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3462  return Res;
3463  }
3464  case ISD::FMAXNUM:
3465  case ISD::FMINNUM:
3466  case AMDGPUISD::FMAX_LEGACY:
3467  case AMDGPUISD::FMIN_LEGACY: {
3468  // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3469  // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3470  // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3471  // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3472 
3473  SDValue LHS = N0.getOperand(0);
3474  SDValue RHS = N0.getOperand(1);
3475 
3476  // 0 doesn't have a negated inline immediate.
3477  // TODO: Shouldn't fold 1/2pi either, and should be generalized to other
3478  // operations.
3479  if (isConstantFPZero(RHS))
3480  return SDValue();
3481 
3482  SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3483  SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3484  unsigned Opposite = inverseMinMax(Opc);
3485 
3486  SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3487  if (!N0.hasOneUse())
3488  DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3489  return Res;
3490  }
3491  case ISD::FP_EXTEND:
3492  case ISD::FTRUNC:
3493  case ISD::FRINT:
3494  case ISD::FNEARBYINT: // XXX - Should fround be handled?
3495  case ISD::FSIN:
3496  case AMDGPUISD::RCP:
3497  case AMDGPUISD::RCP_LEGACY:
3498  case AMDGPUISD::SIN_HW: {
3499  SDValue CvtSrc = N0.getOperand(0);
3500  if (CvtSrc.getOpcode() == ISD::FNEG) {
3501  // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3502  // (fneg (rcp (fneg x))) -> (rcp x)
3503  return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3504  }
3505 
3506  if (!N0.hasOneUse())
3507  return SDValue();
3508 
3509  // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3510  // (fneg (rcp x)) -> (rcp (fneg x))
3511  SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3512  return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3513  }
3514  case ISD::FP_ROUND: {
3515  SDValue CvtSrc = N0.getOperand(0);
3516 
3517  if (CvtSrc.getOpcode() == ISD::FNEG) {
3518  // (fneg (fp_round (fneg x))) -> (fp_round x)
3519  return DAG.getNode(ISD::FP_ROUND, SL, VT,
3520  CvtSrc.getOperand(0), N0.getOperand(1));
3521  }
3522 
3523  if (!N0.hasOneUse())
3524  return SDValue();
3525 
3526  // (fneg (fp_round x)) -> (fp_round (fneg x))
3527  SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3528  return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3529  }
3530  case ISD::FP16_TO_FP: {
3531  // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3532  // f16, but legalization of f16 fneg ends up pulling it out of the source.
3533  // Put the fneg back as a legal source operation that can be matched later.
3534  SDLoc SL(N);
3535 
3536  SDValue Src = N0.getOperand(0);
3537  EVT SrcVT = Src.getValueType();
3538 
3539  // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3540  SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3541  DAG.getConstant(0x8000, SL, SrcVT));
3542  return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3543  }
3544  default:
3545  return SDValue();
3546  }
3547 }
3548 
3549 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3550  DAGCombinerInfo &DCI) const {
3551  SelectionDAG &DAG = DCI.DAG;
3552  SDValue N0 = N->getOperand(0);
3553 
3554  if (!N0.hasOneUse())
3555  return SDValue();
3556 
3557  switch (N0.getOpcode()) {
3558  case ISD::FP16_TO_FP: {
3559  assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3560  SDLoc SL(N);
3561  SDValue Src = N0.getOperand(0);
3562  EVT SrcVT = Src.getValueType();
3563 
3564  // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3565  SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3566  DAG.getConstant(0x7fff, SL, SrcVT));
3567  return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3568  }
3569  default:
3570  return SDValue();
3571  }
3572 }
3573 
3574 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
3575  DAGCombinerInfo &DCI) const {
3576  SelectionDAG &DAG = DCI.DAG;
3577  SDLoc DL(N);
3578 
3579  switch(N->getOpcode()) {
3580  default:
3581  break;
3582  case ISD::BITCAST: {
3583  EVT DestVT = N->getValueType(0);
3584 
3585  // Push casts through vector builds. This helps avoid emitting a large
3586  // number of copies when materializing floating point vector constants.
3587  //
3588  // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
3589  // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
3590  if (DestVT.isVector()) {
3591  SDValue Src = N->getOperand(0);
3592  if (Src.getOpcode() == ISD::BUILD_VECTOR) {
3593  EVT SrcVT = Src.getValueType();
3594  unsigned NElts = DestVT.getVectorNumElements();
3595 
3596  if (SrcVT.getVectorNumElements() == NElts) {
3597  EVT DestEltVT = DestVT.getVectorElementType();
3598 
3599  SmallVector<SDValue, 8> CastedElts;
3600  SDLoc SL(N);
3601  for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
3602  SDValue Elt = Src.getOperand(I);
3603  CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
3604  }
3605 
3606  return DAG.getBuildVector(DestVT, SL, CastedElts);
3607  }
3608  }
3609  }
3610 
3611  if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
3612  break;
3613 
3614  // Fold bitcasts of constants.
3615  //
3616  // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3617  // TODO: Generalize and move to DAGCombiner
3618  SDValue Src = N->getOperand(0);
3619  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3620  assert(Src.getValueType() == MVT::i64);
3621  SDLoc SL(N);
3622  uint64_t CVal = C->getZExtValue();
3623  return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
3624  DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3625  DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3626  }
3627 
3628  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
3629  const APInt &Val = C->getValueAPF().bitcastToAPInt();
3630  SDLoc SL(N);
3631  uint64_t CVal = Val.getZExtValue();
3632  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3633  DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3634  DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3635 
3636  return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
3637  }
3638 
3639  break;
3640  }
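// Example of the constant fold above: for i64 k = 0x1122334455667788,
// v2i32 (bitcast k) becomes build_vector 0x55667788 /*Lo_32(k)*/,
// 0x11223344 /*Hi_32(k)*/, avoiding a 64-bit constant materialization.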
3641  case ISD::SHL: {
3642  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3643  break;
3644 
3645  return performShlCombine(N, DCI);
3646  }
3647  case ISD::SRL: {
3648  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3649  break;
3650 
3651  return performSrlCombine(N, DCI);
3652  }
3653  case ISD::SRA: {
3654  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3655  break;
3656 
3657  return performSraCombine(N, DCI);
3658  }
3659  case ISD::MUL:
3660  return performMulCombine(N, DCI);
3661  case ISD::MULHS:
3662  return performMulhsCombine(N, DCI);
3663  case ISD::MULHU:
3664  return performMulhuCombine(N, DCI);
3665  case AMDGPUISD::MUL_I24:
3666  case AMDGPUISD::MUL_U24:
3667  case AMDGPUISD::MULHI_I24:
3668  case AMDGPUISD::MULHI_U24: {
3669  // If the first call to simplify is successful, then N may end up being
3670  // deleted, so we shouldn't call simplifyI24 again.
3671  simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
3672  return SDValue();
3673  }
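// Note: the short-circuiting || above is what enforces that rule; operand 1
// is only simplified when simplifying operand 0 returned false, so a
// possibly deleted N is never touched a second time.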
3674  case AMDGPUISD::MUL_LOHI_I24:
3675  case AMDGPUISD::MUL_LOHI_U24:
3676  return performMulLoHi24Combine(N, DCI);
3677  case ISD::SELECT:
3678  return performSelectCombine(N, DCI);
3679  case ISD::FNEG:
3680  return performFNegCombine(N, DCI);
3681  case ISD::FABS:
3682  return performFAbsCombine(N, DCI);
3683  case AMDGPUISD::BFE_I32:
3684  case AMDGPUISD::BFE_U32: {
3685  assert(!N->getValueType(0).isVector() &&
3686  "Vector handling of BFE not implemented");
3687  ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
3688  if (!Width)
3689  break;
3690 
3691  uint32_t WidthVal = Width->getZExtValue() & 0x1f;
3692  if (WidthVal == 0)
3693  return DAG.getConstant(0, DL, MVT::i32);
3694 
3695  ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
3696  if (!Offset)
3697  break;
3698 
3699  SDValue BitsFrom = N->getOperand(0);
3700  uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
3701 
3702  bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
3703 
3704  if (OffsetVal == 0) {
3705  // This is already sign / zero extended, so try to fold away extra BFEs.
3706  unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
3707 
3708  unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
3709  if (OpSignBits >= SignBits)
3710  return BitsFrom;
3711 
3712  EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
3713  if (Signed) {
3714  // This is a sign_extend_inreg. Replace it to take advantage of existing
3715  // DAG Combines. If not eliminated, we will match back to BFE during
3716  // selection.
3717 
3718  // TODO: The sext_inreg of extended types ends up as multiple operations,
3719  // although we could handle them in a single BFE.
3720  return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
3721  DAG.getValueType(SmallVT));
3722  }
3723 
3724  return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
3725  }
3726 
3727  if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
3728  if (Signed) {
3729  return constantFoldBFE<int32_t>(DAG,
3730  CVal->getSExtValue(),
3731  OffsetVal,
3732  WidthVal,
3733  DL);
3734  }
3735 
3736  return constantFoldBFE<uint32_t>(DAG,
3737  CVal->getZExtValue(),
3738  OffsetVal,
3739  WidthVal,
3740  DL);
3741  }
3742 
3743  if ((OffsetVal + WidthVal) >= 32 &&
3744  !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
3745  SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
3746  return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
3747  BitsFrom, ShiftVal);
3748  }
3749 
3750  if (BitsFrom.hasOneUse()) {
3751  APInt Demanded = APInt::getBitsSet(32,
3752  OffsetVal,
3753  OffsetVal + WidthVal);
3754 
3755  KnownBits Known;
3756  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
3757  !DCI.isBeforeLegalizeOps());
3758  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3759  if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
3760  TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
3761  DCI.CommitTargetLoweringOpt(TLO);
3762  }
3763  }
3764 
3765  break;
3766  }
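// Worked BFE examples for the combines above (offset and width are masked
// to 5 bits, matching the & 0x1f):
//   bfe_u32(0x12345678, 8, 8) -> 0x56 (zero-extended field)
//   bfe_i32(0x000000ff, 0, 8) -> -1 (field 0xff is sign-extended)
// A nominal width of 32 wraps to 0 under the mask, which folds to the
// constant 0 above.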
3767  case ISD::LOAD:
3768  return performLoadCombine(N, DCI);
3769  case ISD::STORE:
3770  return performStoreCombine(N, DCI);
3771  case AMDGPUISD::CLAMP:
3772  return performClampCombine(N, DCI);
3773  case AMDGPUISD::RCP: {
3774  if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
3775  // XXX - Should this flush denormals?
3776  const APFloat &Val = CFP->getValueAPF();
3777  APFloat One(Val.getSemantics(), "1.0");
3778  return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
3779  }
3780 
3781  break;
3782  }
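// E.g. rcp(2.0) constant folds to 0.5 here via exact APFloat division; as
// the XXX above notes, whether that result should be denormal flushed is
// left open.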
3783  case ISD::AssertZext:
3784  case ISD::AssertSext:
3785  return performAssertSZExtCombine(N, DCI);
3786  }
3787  return SDValue();
3788 }
3789 
3790 //===----------------------------------------------------------------------===//
3791 // Helper functions
3792 //===----------------------------------------------------------------------===//
3793 
3794 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
3795  const TargetRegisterClass *RC,
3796  unsigned Reg, EVT VT,
3797  const SDLoc &SL,
3798  bool RawReg) const {
3799  MachineFunction &MF = DAG.getMachineFunction();
3800  MachineRegisterInfo &MRI = MF.getRegInfo();
3801  unsigned VReg;
3802 
3803  if (!MRI.isLiveIn(Reg)) {
3804  VReg = MRI.createVirtualRegister(RC);
3805  MRI.addLiveIn(Reg, VReg);
3806  } else {
3807  VReg = MRI.getLiveInVirtReg(Reg);
3808  }
3809 
3810  if (RawReg)
3811  return DAG.getRegister(VReg, VT);
3812 
3813  return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
3814 }
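// Hypothetical call-site sketch: materializing an incoming 64-bit SGPR pair
// as a DAG value might look like
//
//   SDValue Ptr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
//                                      AMDGPU::SGPR4_SGPR5, MVT::i64, SL);
//
// With RawReg == true the raw register node is returned instead of a
// CopyFromReg from the entry node, for callers emitting their own copy.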
3815 
3816 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
3817  EVT VT,
3818  const SDLoc &SL,
3819  int64_t Offset) const {
3820  MachineFunction &MF = DAG.getMachineFunction();
3821  MachineFrameInfo &MFI = MF.getFrameInfo();
3822 
3823  int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
3824  auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
3825  SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
3826 
3827  return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
3828  MachineMemOperand::MODereferenceable |
3829  MachineMemOperand::MOInvariant);
3830 }
3831 
3832 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
3833  const SDLoc &SL,
3834  SDValue Chain,
3835  SDValue StackPtr,
3836  SDValue ArgVal,
3837  int64_t Offset) const {
3838  MachineFunction &MF = DAG.getMachineFunction();
3839  MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
3840 
3841  SDValue Ptr = DAG.getObjectPtrOffset(SL, StackPtr, Offset);
3842  SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
3843  MachineMemOperand::MODereferenceable);
3844  return Store;
3845 }
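// loadStackInputValue and storeStackInputValue are duals: the former reads
// an incoming stack-passed input through a fixed frame object in the callee,
// the latter writes an outgoing value at StackPtr + Offset in the caller;
// both use 4-byte alignment.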
3846 
3847 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
3848  const TargetRegisterClass *RC,
3849  EVT VT, const SDLoc &SL,
3850  const ArgDescriptor &Arg) const {
3851  assert(Arg && "Attempting to load missing argument");
3852 
3853  if (Arg.isRegister())
3854  return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
3855  return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
3856 }
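// loadInputValue thus hides where an argument was assigned: a register
// ArgDescriptor becomes a live-in copy and a stack one becomes a fixed
// object load, so callers can lower both cases uniformly.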
3857 
3858 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
3859  const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
3860  unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
3861  uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
3862  switch (Param) {
3863  case GRID_DIM:
3864  return ArgOffset;
3865  case GRID_OFFSET:
3866  return ArgOffset + 4;
3867  }
3868  llvm_unreachable("unexpected implicit parameter type");
3869 }
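// Worked example with hypothetical numbers: if getABIArgOffset() == 36 and
// the implicit-arg alignment is 8, then ArgOffset = alignTo(36, 8) = 40, so
// GRID_DIM is at byte offset 40 and GRID_OFFSET at byte offset 44.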
3870 
3871 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
3872 
3873 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
3874  switch ((AMDGPUISD::NodeType)Opcode) {
3875  case AMDGPUISD::FIRST_NUMBER: break;
3876  // AMDIL DAG nodes
3877  NODE_NAME_CASE(UMUL);
3878  NODE_NAME_CASE(BRANCH_COND);
3879 
3880  // AMDGPU DAG nodes
4013  // Basic sample.
4024  // Sample with comparison.
4035  // Sample with offsets.
4045