LLVM  9.0.0svn
AMDGPUISelLowering.cpp
Go to the documentation of this file.
1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This is the parent TargetLowering class for hardware code gen
11 /// targets.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #define AMDGPU_LOG2E_F 1.44269504088896340735992468100189214f
16 #define AMDGPU_LN2_F 0.693147180559945309417232121458176568f
17 #define AMDGPU_LN10_F 2.30258509299404568401799145468436421f
18 
19 #include "AMDGPUISelLowering.h"
20 #include "AMDGPU.h"
21 #include "AMDGPUCallLowering.h"
22 #include "AMDGPUFrameLowering.h"
23 #include "AMDGPURegisterInfo.h"
24 #include "AMDGPUSubtarget.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "Utils/AMDGPUBaseInfo.h"
28 #include "SIInstrInfo.h"
29 #include "SIMachineFunctionInfo.h"
31 #include "llvm/CodeGen/Analysis.h"
37 #include "llvm/IR/DataLayout.h"
38 #include "llvm/IR/DiagnosticInfo.h"
39 #include "llvm/Support/KnownBits.h"
40 using namespace llvm;
41 
42 static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
43  CCValAssign::LocInfo LocInfo,
44  ISD::ArgFlagsTy ArgFlags, CCState &State,
45  const TargetRegisterClass *RC,
46  unsigned NumRegs) {
47  ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
48  unsigned RegResult = State.AllocateReg(RegList);
49  if (RegResult == AMDGPU::NoRegister)
50  return false;
51 
52  State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
53  return true;
54 }
55 
56 static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
57  CCValAssign::LocInfo LocInfo,
58  ISD::ArgFlagsTy ArgFlags, CCState &State) {
59  switch (LocVT.SimpleTy) {
60  case MVT::i64:
61  case MVT::f64:
62  case MVT::v2i32:
63  case MVT::v2f32:
64  case MVT::v4i16:
65  case MVT::v4f16: {
66  // Up to SGPR0-SGPR105
67  return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
68  &AMDGPU::SGPR_64RegClass, 53);
69  }
70  default:
71  return false;
72  }
73 }
74 
75 // Allocate up to VGPR31.
76 //
77 // TODO: Since there are no VGPR alignent requirements would it be better to
78 // split into individual scalar registers?
79 static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
80  CCValAssign::LocInfo LocInfo,
81  ISD::ArgFlagsTy ArgFlags, CCState &State) {
82  switch (LocVT.SimpleTy) {
83  case MVT::i64:
84  case MVT::f64:
85  case MVT::v2i32:
86  case MVT::v2f32:
87  case MVT::v4i16:
88  case MVT::v4f16: {
89  return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
90  &AMDGPU::VReg_64RegClass, 31);
91  }
92  case MVT::v4i32:
93  case MVT::v4f32:
94  case MVT::v2i64:
95  case MVT::v2f64: {
96  return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
97  &AMDGPU::VReg_128RegClass, 29);
98  }
99  case MVT::v8i32:
100  case MVT::v8f32: {
101  return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
102  &AMDGPU::VReg_256RegClass, 25);
103 
104  }
105  case MVT::v16i32:
106  case MVT::v16f32: {
107  return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
108  &AMDGPU::VReg_512RegClass, 17);
109 
110  }
111  default:
112  return false;
113  }
114 }
115 
116 #include "AMDGPUGenCallingConv.inc"
117 
118 // Find a larger type to do a load / store of a vector with.
120  unsigned StoreSize = VT.getStoreSizeInBits();
121  if (StoreSize <= 32)
122  return EVT::getIntegerVT(Ctx, StoreSize);
123 
124  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
125  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
126 }
127 
129  EVT VT = Op.getValueType();
130  KnownBits Known = DAG.computeKnownBits(Op);
131  return VT.getSizeInBits() - Known.countMinLeadingZeros();
132 }
133 
135  EVT VT = Op.getValueType();
136 
137  // In order for this to be a signed 24-bit value, bit 23, must
138  // be a sign bit.
139  return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
140 }
141 
143  const AMDGPUSubtarget &STI)
144  : TargetLowering(TM), Subtarget(&STI) {
145  // Lower floating point store/load to integer store/load to reduce the number
146  // of patterns in tablegen.
149 
152 
155 
158 
161 
164 
167 
170 
173 
176 
179 
180  // There are no 64-bit extloads. These should be done as a 32-bit extload and
181  // an extension to 64-bit.
182  for (MVT VT : MVT::integer_valuetypes()) {
186  }
187 
188  for (MVT VT : MVT::integer_valuetypes()) {
189  if (VT == MVT::i64)
190  continue;
191 
196 
201 
206  }
207 
208  for (MVT VT : MVT::integer_vector_valuetypes()) {
221  }
222 
227 
232 
237 
240 
243 
246 
249 
252 
255 
258 
261 
264 
267 
270 
275 
280 
285 
288 
291 
294 
297 
298 
303 
306 
307  // This is totally unsupported, just custom lower to produce an error.
309 
310  // Library functions. These default to Expand, but we have instructions
311  // for them.
322 
325 
329 
330 
333 
336 
337  // Expand to fneg + fadd.
339 
358 
362 
363  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
364  for (MVT VT : ScalarIntVTs) {
365  // These should use [SU]DIVREM, so set them to expand
370 
371  // GPU does not have divrem function for signed or unsigned.
374 
375  // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
378 
382 
383  // AMDGPU uses ADDC/SUBC/ADDE/SUBE
388  }
389 
390  // The hardware supports 32-bit ROTR, but not ROTL.
392  setOperationAction(ISD::ROTL, MVT::i64, Expand);
393  setOperationAction(ISD::ROTR, MVT::i64, Expand);
394 
395  setOperationAction(ISD::MUL, MVT::i64, Expand);
403 
408 
409  setOperationAction(ISD::CTTZ, MVT::i64, Custom);
411  setOperationAction(ISD::CTLZ, MVT::i64, Custom);
413 
414  static const MVT::SimpleValueType VectorIntTypes[] = {
416  };
417 
418  for (MVT VT : VectorIntTypes) {
419  // Expand the following operations for the current type by default.
454  }
455 
456  static const MVT::SimpleValueType FloatVectorTypes[] = {
458  };
459 
460  for (MVT VT : FloatVectorTypes) {
491  }
492 
493  // This causes using an unrolled select operation rather than expansion with
494  // bit operations. This is in general better, but the alternative using BFI
495  // instructions may be better if the select sources are SGPRs.
498 
501 
504 
505  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
506  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
507 
508  // There are no libcalls of any kind.
509  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
510  setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
511 
514 
516  setJumpIsExpensive(true);
517 
518  // FIXME: This is only partially true. If we have to do vector compares, any
519  // SGPR pair can be a condition register. If we have a uniform condition, we
520  // are better off doing SALU operations, where there is only one SCC. For now,
521  // we don't have a way of knowing during instruction selection if a condition
522  // will be uniform and we always use vector compares. Assume we are using
523  // vector compares until that is fixed.
525 
528 
530 
531  // We want to find all load dependencies for long chains of stores to enable
532  // merging into very wide vectors. The problem is with vectors with > 4
533  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
534  // vectors are a legal type, even though we have to split the loads
535  // usually. When we can more precisely specify load legality per address
536  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
537  // smarter so that they can figure out what to do in 2 iterations without all
538  // N > 4 stores on the same chain.
540 
541  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
542  // about these during lowering.
543  MaxStoresPerMemcpy = 0xffffffff;
544  MaxStoresPerMemmove = 0xffffffff;
545  MaxStoresPerMemset = 0xffffffff;
546 
564 }
565 
566 //===----------------------------------------------------------------------===//
567 // Target Information
568 //===----------------------------------------------------------------------===//
569 
/// \returns true when an fneg of a value produced by \p Opc can be folded
/// away by instead negating the operation's sources (i.e. the opcode supports
/// free source negation modifiers, or an algebraically equivalent rewrite).
// NOTE(review): this view comes from an extraction that dropped some lines
// (orig. 588-589 and 591-593) between the AMDGPUISD cases below — there are
// likely additional AMDGPUISD case labels here; verify against upstream.
static bool fnegFoldsIntoOp(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMED3:
    return true;
  default:
    return false;
  }
}
600 
601 /// \p returns true if the operation will definitely need to use a 64-bit
602 /// encoding, and thus will use a VOP3 encoding regardless of the source
603 /// modifiers.
605 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
606  return N->getNumOperands() > 2 || VT == MVT::f64;
607 }
608 
// Most FP instructions support source modifiers, but this could be refined
// slightly.
/// \returns true if node \p N can consume a negated/abs'd source for free,
/// i.e. it is worth pushing an fneg/fabs into this use.
// NOTE(review): the extraction dropped lines between INLINEASM_BR and the
// comment below (orig. 623-625) — additional case labels likely belong in the
// returns-false group; verify against upstream before relying on this list.
static bool hasSourceMods(const SDNode *N) {
  // Memory operations never apply source modifiers.
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::SELECT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  default:
    // Default to assuming the operation can fold a source modifier.
    return true;
  }
}
636 
638  unsigned CostThreshold) {
639  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
640  // it is truly free to use a source modifier in all cases. If there are
641  // multiple users but for each one will necessitate using VOP3, there will be
642  // a code size increase. Try to avoid increasing code size unless we know it
643  // will save on the instruction count.
644  unsigned NumMayIncreaseSize = 0;
645  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
646 
647  // XXX - Should this limit number of uses to check?
648  for (const SDNode *U : N->uses()) {
649  if (!hasSourceMods(U))
650  return false;
651 
652  if (!opMustUseVOP3Encoding(U, VT)) {
653  if (++NumMayIncreaseSize > CostThreshold)
654  return false;
655  }
656  }
657 
658  return true;
659 }
660 
662  return MVT::i32;
663 }
664 
666  return true;
667 }
668 
669 // The backend supports 32 and 64 bit floating point immediates.
670 // FIXME: Why are we reporting vectors of FP immediates as legal?
672  bool ForCodeSize) const {
673  EVT ScalarVT = VT.getScalarType();
674  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
675  (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
676 }
677 
678 // We don't want to shrink f64 / f32 constants.
680  EVT ScalarVT = VT.getScalarType();
681  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
682 }
683 
685  ISD::LoadExtType ExtTy,
686  EVT NewVT) const {
687  // TODO: This may be worth removing. Check regression tests for diffs.
688  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
689  return false;
690 
691  unsigned NewSize = NewVT.getStoreSizeInBits();
692 
693  // If we are reducing to a 32-bit load, this is always better.
694  if (NewSize == 32)
695  return true;
696 
697  EVT OldVT = N->getValueType(0);
698  unsigned OldSize = OldVT.getStoreSizeInBits();
699 
700  MemSDNode *MN = cast<MemSDNode>(N);
701  unsigned AS = MN->getAddressSpace();
702  // Do not shrink an aligned scalar load to sub-dword.
703  // Scalar engine cannot do sub-dword loads.
704  if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
707  (isa<LoadSDNode>(N) &&
708  AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
710  return false;
711 
712  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
713  // extloads, so doing one requires using a buffer_load. In cases where we
714  // still couldn't use a scalar load, using the wider load shouldn't really
715  // hurt anything.
716 
717  // If the old size already had to be an extload, there's no harm in continuing
718  // to reduce the width.
719  return (OldSize < 32);
720 }
721 
723  EVT CastTy) const {
724 
725  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
726 
727  if (LoadTy.getScalarType() == MVT::i32)
728  return false;
729 
730  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
731  unsigned CastScalarSize = CastTy.getScalarSizeInBits();
732 
733  return (LScalarSize < CastScalarSize) ||
734  (CastScalarSize >= 32);
735 }
736 
737 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
738 // profitable with the expansion for 64-bit since it's generally good to
739 // speculate things.
740 // FIXME: These should really have the size as a parameter.
742  return true;
743 }
744 
746  return true;
747 }
748 
750  switch (N->getOpcode()) {
751  default:
752  return false;
753  case ISD::EntryToken:
754  case ISD::TokenFactor:
755  return true;
757  {
758  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
759  switch (IntrID) {
760  default:
761  return false;
762  case Intrinsic::amdgcn_readfirstlane:
763  case Intrinsic::amdgcn_readlane:
764  return true;
765  }
766  }
767  break;
768  case ISD::LOAD:
769  {
770  const LoadSDNode * L = dyn_cast<LoadSDNode>(N);
771  if (L->getMemOperand()->getAddrSpace()
773  return true;
774  return false;
775  }
776  break;
777  }
778 }
779 
780 //===---------------------------------------------------------------------===//
781 // Target Properties
782 //===---------------------------------------------------------------------===//
783 
785  assert(VT.isFloatingPoint());
786 
787  // Packed operations do not have a fabs modifier.
788  return VT == MVT::f32 || VT == MVT::f64 ||
789  (Subtarget->has16BitInsts() && VT == MVT::f16);
790 }
791 
793  assert(VT.isFloatingPoint());
794  return VT == MVT::f32 || VT == MVT::f64 ||
795  (Subtarget->has16BitInsts() && VT == MVT::f16) ||
796  (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
797 }
798 
800  unsigned NumElem,
801  unsigned AS) const {
802  return true;
803 }
804 
806  // There are few operations which truly have vector input operands. Any vector
807  // operation is going to involve operations on each component, and a
808  // build_vector will be a copy per element, so it always makes sense to use a
809  // build_vector input in place of the extracted element to avoid a copy into a
810  // super register.
811  //
812  // We should probably only do this if all users are extracts only, but this
813  // should be the common case.
814  return true;
815 }
816 
818  // Truncate is just accessing a subregister.
819 
820  unsigned SrcSize = Source.getSizeInBits();
821  unsigned DestSize = Dest.getSizeInBits();
822 
823  return DestSize < SrcSize && DestSize % 32 == 0 ;
824 }
825 
827  // Truncate is just accessing a subregister.
828 
829  unsigned SrcSize = Source->getScalarSizeInBits();
830  unsigned DestSize = Dest->getScalarSizeInBits();
831 
832  if (DestSize== 16 && Subtarget->has16BitInsts())
833  return SrcSize >= 32;
834 
835  return DestSize < SrcSize && DestSize % 32 == 0;
836 }
837 
838 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
839  unsigned SrcSize = Src->getScalarSizeInBits();
840  unsigned DestSize = Dest->getScalarSizeInBits();
841 
842  if (SrcSize == 16 && Subtarget->has16BitInsts())
843  return DestSize >= 32;
844 
845  return SrcSize == 32 && DestSize == 64;
846 }
847 
848 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
849  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
850  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
851  // this will enable reducing 64-bit operations the 32-bit, which is always
852  // good.
853 
854  if (Src == MVT::i16)
855  return Dest == MVT::i32 ||Dest == MVT::i64 ;
856 
857  return Src == MVT::i32 && Dest == MVT::i64;
858 }
859 
861  return isZExtFree(Val.getValueType(), VT2);
862 }
863 
865  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
866  // limited number of native 64-bit operations. Shrinking an operation to fit
867  // in a single 32-bit register should always be helpful. As currently used,
868  // this is much less general than the name suggests, and is only used in
869  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
870  // not profitable, and may actually be harmful.
871  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
872 }
873 
874 //===---------------------------------------------------------------------===//
875 // TargetLowering Callbacks
876 //===---------------------------------------------------------------------===//
877 
879  bool IsVarArg) {
880  switch (CC) {
888  return CC_AMDGPU;
889  case CallingConv::C:
890  case CallingConv::Fast:
891  case CallingConv::Cold:
892  return CC_AMDGPU_Func;
895  default:
896  report_fatal_error("Unsupported calling convention for call");
897  }
898 }
899 
901  bool IsVarArg) {
902  switch (CC) {
905  llvm_unreachable("kernels should not be handled here");
913  return RetCC_SI_Shader;
914  case CallingConv::C:
915  case CallingConv::Fast:
916  case CallingConv::Cold:
917  return RetCC_AMDGPU_Func;
918  default:
919  report_fatal_error("Unsupported calling convention.");
920  }
921 }
922 
923 /// The SelectionDAGBuilder will automatically promote function arguments
924 /// with illegal types. However, this does not work for the AMDGPU targets
925 /// since the function arguments are stored in memory as these illegal types.
926 /// In order to handle this properly we need to get the original types sizes
927 /// from the LLVM IR Function and fixup the ISD:InputArg values before
928 /// passing them to AnalyzeFormalArguments()
929 
930 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
931 /// input values across multiple registers. Each item in the Ins array
932 /// represents a single value that will be stored in registers. Ins[x].VT is
933 /// the value type of the value that will be stored in the register, so
934 /// whatever SDNode we lower the argument to needs to be this type.
935 ///
936 /// In order to correctly lower the arguments we need to know the size of each
937 /// argument. Since Ins[x].VT gives us the size of the register that will
938 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
939 /// for the orignal function argument so that we can deduce the correct memory
940 /// type to use for Ins[x]. In most cases the correct memory type will be
941 /// Ins[x].ArgVT. However, this will not always be the case. If, for example,
942 /// we have a kernel argument of type v8i8, this argument will be split into
943 /// 8 parts and each part will be represented by its own item in the Ins array.
944 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
945 /// the argument before it was split. From this, we deduce that the memory type
946 /// for each individual part is i8. We pass the memory type as LocVT to the
947 /// calling convention analysis function and the register type (Ins[x].VT) as
948 /// the ValVT.
950  CCState &State,
951  const SmallVectorImpl<ISD::InputArg> &Ins) const {
952  const MachineFunction &MF = State.getMachineFunction();
953  const Function &Fn = MF.getFunction();
954  LLVMContext &Ctx = Fn.getParent()->getContext();
956  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
958 
959  unsigned MaxAlign = 1;
960  uint64_t ExplicitArgOffset = 0;
961  const DataLayout &DL = Fn.getParent()->getDataLayout();
962 
963  unsigned InIndex = 0;
964 
965  for (const Argument &Arg : Fn.args()) {
966  Type *BaseArgTy = Arg.getType();
967  unsigned Align = DL.getABITypeAlignment(BaseArgTy);
968  MaxAlign = std::max(Align, MaxAlign);
969  unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
970 
971  uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
972  ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
973 
974  // We're basically throwing away everything passed into us and starting over
975  // to get accurate in-memory offsets. The "PartOffset" is completely useless
976  // to us as computed in Ins.
977  //
978  // We also need to figure out what type legalization is trying to do to get
979  // the correct memory offsets.
980 
981  SmallVector<EVT, 16> ValueVTs;
983  ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
984 
985  for (unsigned Value = 0, NumValues = ValueVTs.size();
986  Value != NumValues; ++Value) {
987  uint64_t BasePartOffset = Offsets[Value];
988 
989  EVT ArgVT = ValueVTs[Value];
990  EVT MemVT = ArgVT;
991  MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
992  unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
993 
994  if (NumRegs == 1) {
995  // This argument is not split, so the IR type is the memory type.
996  if (ArgVT.isExtended()) {
997  // We have an extended type, like i24, so we should just use the
998  // register type.
999  MemVT = RegisterVT;
1000  } else {
1001  MemVT = ArgVT;
1002  }
1003  } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1004  ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1005  assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1006  // We have a vector value which has been split into a vector with
1007  // the same scalar type, but fewer elements. This should handle
1008  // all the floating-point vector types.
1009  MemVT = RegisterVT;
1010  } else if (ArgVT.isVector() &&
1011  ArgVT.getVectorNumElements() == NumRegs) {
1012  // This arg has been split so that each element is stored in a separate
1013  // register.
1014  MemVT = ArgVT.getScalarType();
1015  } else if (ArgVT.isExtended()) {
1016  // We have an extended type, like i65.
1017  MemVT = RegisterVT;
1018  } else {
1019  unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1020  assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1021  if (RegisterVT.isInteger()) {
1022  MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1023  } else if (RegisterVT.isVector()) {
1024  assert(!RegisterVT.getScalarType().isFloatingPoint());
1025  unsigned NumElements = RegisterVT.getVectorNumElements();
1026  assert(MemoryBits % NumElements == 0);
1027  // This vector type has been split into another vector type with
1028  // a different elements size.
1029  EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1030  MemoryBits / NumElements);
1031  MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1032  } else {
1033  llvm_unreachable("cannot deduce memory type.");
1034  }
1035  }
1036 
1037  // Convert one element vectors to scalar.
1038  if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1039  MemVT = MemVT.getScalarType();
1040 
1041  // Round up vec3/vec5 argument.
1042  if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1043  assert(MemVT.getVectorNumElements() == 3 ||
1044  MemVT.getVectorNumElements() == 5);
1045  MemVT = MemVT.getPow2VectorType(State.getContext());
1046  }
1047 
1048  unsigned PartOffset = 0;
1049  for (unsigned i = 0; i != NumRegs; ++i) {
1050  State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1051  BasePartOffset + PartOffset,
1052  MemVT.getSimpleVT(),
1054  PartOffset += MemVT.getStoreSize();
1055  }
1056  }
1057  }
1058 }
1059 
1061  SDValue Chain, CallingConv::ID CallConv,
1062  bool isVarArg,
1063  const SmallVectorImpl<ISD::OutputArg> &Outs,
1064  const SmallVectorImpl<SDValue> &OutVals,
1065  const SDLoc &DL, SelectionDAG &DAG) const {
1066  // FIXME: Fails for r600 tests
1067  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1068  // "wave terminate should not have return values");
1069  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1070 }
1071 
1072 //===---------------------------------------------------------------------===//
1073 // Target specific lowering
1074 //===---------------------------------------------------------------------===//
1075 
1076 /// Selects the correct CCAssignFn for a given CallingConvention value.
1078  bool IsVarArg) {
1079  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1080 }
1081 
1083  bool IsVarArg) {
1084  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1085 }
1086 
1088  SelectionDAG &DAG,
1089  MachineFrameInfo &MFI,
1090  int ClobberedFI) const {
1091  SmallVector<SDValue, 8> ArgChains;
1092  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1093  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1094 
1095  // Include the original chain at the beginning of the list. When this is
1096  // used by target LowerCall hooks, this helps legalize find the
1097  // CALLSEQ_BEGIN node.
1098  ArgChains.push_back(Chain);
1099 
1100  // Add a chain value for each stack argument corresponding
1101  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1102  UE = DAG.getEntryNode().getNode()->use_end();
1103  U != UE; ++U) {
1104  if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
1105  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1106  if (FI->getIndex() < 0) {
1107  int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1108  int64_t InLastByte = InFirstByte;
1109  InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1110 
1111  if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1112  (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1113  ArgChains.push_back(SDValue(L, 1));
1114  }
1115  }
1116  }
1117  }
1118 
1119  // Build a tokenfactor for all the chains.
1120  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1121 }
1122 
1124  SmallVectorImpl<SDValue> &InVals,
1125  StringRef Reason) const {
1126  SDValue Callee = CLI.Callee;
1127  SelectionDAG &DAG = CLI.DAG;
1128 
1129  const Function &Fn = DAG.getMachineFunction().getFunction();
1130 
1131  StringRef FuncName("<unknown>");
1132 
1133  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1134  FuncName = G->getSymbol();
1135  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1136  FuncName = G->getGlobal()->getName();
1137 
1138  DiagnosticInfoUnsupported NoCalls(
1139  Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1140  DAG.getContext()->diagnose(NoCalls);
1141 
1142  if (!CLI.IsTailCall) {
1143  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1144  InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1145  }
1146 
1147  return DAG.getEntryNode();
1148 }
1149 
1151  SmallVectorImpl<SDValue> &InVals) const {
1152  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1153 }
1154 
1156  SelectionDAG &DAG) const {
1157  const Function &Fn = DAG.getMachineFunction().getFunction();
1158 
1159  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1160  SDLoc(Op).getDebugLoc());
1161  DAG.getContext()->diagnose(NoDynamicAlloca);
1162  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1163  return DAG.getMergeValues(Ops, SDLoc());
1164 }
1165 
1167  SelectionDAG &DAG) const {
1168  switch (Op.getOpcode()) {
1169  default:
1170  Op->print(errs(), &DAG);
1171  llvm_unreachable("Custom lowering code for this"
1172  "instruction is not implemented yet!");
1173  break;
1174  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1175  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1176  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1177  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1178  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1179  case ISD::FREM: return LowerFREM(Op, DAG);
1180  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1181  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1182  case ISD::FRINT: return LowerFRINT(Op, DAG);
1183  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1184  case ISD::FROUND: return LowerFROUND(Op, DAG);
1185  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1186  case ISD::FLOG:
1187  return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F);
1188  case ISD::FLOG10:
1189  return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);
1190  case ISD::FEXP:
1191  return lowerFEXP(Op, DAG);
1192  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1193  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1194  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1195  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
1196  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
1197  case ISD::CTTZ:
1198  case ISD::CTTZ_ZERO_UNDEF:
1199  case ISD::CTLZ:
1200  case ISD::CTLZ_ZERO_UNDEF:
1201  return LowerCTLZ_CTTZ(Op, DAG);
1202  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1203  }
1204  return Op;
1205 }
1206 
1209  SelectionDAG &DAG) const {
1210  switch (N->getOpcode()) {
1212  // Different parts of legalization seem to interpret which type of
1213  // sign_extend_inreg is the one to check for custom lowering. The extended
1214  // from type is what really matters, but some places check for custom
1215  // lowering of the result type. This results in trying to use
1216  // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1217  // nothing here and let the illegal result integer be handled normally.
1218  return;
1219  default:
1220  return;
1221  }
1222 }
1223 
1224 static bool hasDefinedInitializer(const GlobalValue *GV) {
1225  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
1226  if (!GVar || !GVar->hasInitializer())
1227  return false;
1228 
1229  return !isa<UndefValue>(GVar->getInitializer());
1230 }
1231 
1233  SDValue Op,
1234  SelectionDAG &DAG) const {
1235 
1236  const DataLayout &DL = DAG.getDataLayout();
1237  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1238  const GlobalValue *GV = G->getGlobal();
1239 
1242  if (!MFI->isEntryFunction()) {
1243  const Function &Fn = DAG.getMachineFunction().getFunction();
1244  DiagnosticInfoUnsupported BadLDSDecl(
1245  Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
1246  DAG.getContext()->diagnose(BadLDSDecl);
1247  }
1248 
1249  // XXX: What does the value of G->getOffset() mean?
1250  assert(G->getOffset() == 0 &&
1251  "Do not know what to do with an non-zero offset");
1252 
1253  // TODO: We could emit code to handle the initialization somewhere.
1254  if (!hasDefinedInitializer(GV)) {
1255  unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
1256  return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1257  }
1258  }
1259 
1260  const Function &Fn = DAG.getMachineFunction().getFunction();
1261  DiagnosticInfoUnsupported BadInit(
1262  Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
1263  DAG.getContext()->diagnose(BadInit);
1264  return SDValue();
1265 }
1266 
1268  SelectionDAG &DAG) const {
1270 
1271  EVT VT = Op.getValueType();
1272  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1273  SDLoc SL(Op);
1274  SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1275  SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1276 
1277  SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1278  return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1279  }
1280 
1281  for (const SDUse &U : Op->ops())
1282  DAG.ExtractVectorElements(U.get(), Args);
1283 
1284  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1285 }
1286 
1288  SelectionDAG &DAG) const {
1289 
1291  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1292  EVT VT = Op.getValueType();
1293  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1294  VT.getVectorNumElements());
1295 
1296  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1297 }
1298 
1299 /// Generate Min/Max node
1301  SDValue LHS, SDValue RHS,
1302  SDValue True, SDValue False,
1303  SDValue CC,
1304  DAGCombinerInfo &DCI) const {
1305  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1306  return SDValue();
1307 
1308  SelectionDAG &DAG = DCI.DAG;
1309  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1310  switch (CCOpcode) {
1311  case ISD::SETOEQ:
1312  case ISD::SETONE:
1313  case ISD::SETUNE:
1314  case ISD::SETNE:
1315  case ISD::SETUEQ:
1316  case ISD::SETEQ:
1317  case ISD::SETFALSE:
1318  case ISD::SETFALSE2:
1319  case ISD::SETTRUE:
1320  case ISD::SETTRUE2:
1321  case ISD::SETUO:
1322  case ISD::SETO:
1323  break;
1324  case ISD::SETULE:
1325  case ISD::SETULT: {
1326  if (LHS == True)
1327  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1328  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1329  }
1330  case ISD::SETOLE:
1331  case ISD::SETOLT:
1332  case ISD::SETLE:
1333  case ISD::SETLT: {
1334  // Ordered. Assume ordered for undefined.
1335 
1336  // Only do this after legalization to avoid interfering with other combines
1337  // which might occur.
1338  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1339  !DCI.isCalledByLegalizer())
1340  return SDValue();
1341 
1342  // We need to permute the operands to get the correct NaN behavior. The
1343  // selected operand is the second one based on the failing compare with NaN,
1344  // so permute it based on the compare type the hardware uses.
1345  if (LHS == True)
1346  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1347  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1348  }
1349  case ISD::SETUGE:
1350  case ISD::SETUGT: {
1351  if (LHS == True)
1352  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1353  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1354  }
1355  case ISD::SETGT:
1356  case ISD::SETGE:
1357  case ISD::SETOGE:
1358  case ISD::SETOGT: {
1359  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1360  !DCI.isCalledByLegalizer())
1361  return SDValue();
1362 
1363  if (LHS == True)
1364  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1365  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1366  }
1367  case ISD::SETCC_INVALID:
1368  llvm_unreachable("Invalid setcc condcode!");
1369  }
1370  return SDValue();
1371 }
1372 
1373 std::pair<SDValue, SDValue>
1375  SDLoc SL(Op);
1376 
1377  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1378 
1379  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1380  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1381 
1382  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1383  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1384 
1385  return std::make_pair(Lo, Hi);
1386 }
1387 
1389  SDLoc SL(Op);
1390 
1391  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1392  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1393  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1394 }
1395 
1397  SDLoc SL(Op);
1398 
1399  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1400  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1401  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1402 }
1403 
1404 // Split a vector type into two parts. The first part is a power of two vector.
1405 // The second part is whatever is left over, and is a scalar if it would
1406 // otherwise be a 1-vector.
1407 std::pair<EVT, EVT>
1409  EVT LoVT, HiVT;
1410  EVT EltVT = VT.getVectorElementType();
1411  unsigned NumElts = VT.getVectorNumElements();
1412  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1413  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1414  HiVT = NumElts - LoNumElts == 1
1415  ? EltVT
1416  : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1417  return std::make_pair(LoVT, HiVT);
1418 }
1419 
1420 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1421 // scalar.
1422 std::pair<SDValue, SDValue>
1424  const EVT &LoVT, const EVT &HiVT,
1425  SelectionDAG &DAG) const {
1426  assert(LoVT.getVectorNumElements() +
1427  (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1429  "More vector elements requested than available!");
1430  auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
1431  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1432  DAG.getConstant(0, DL, IdxTy));
1433  SDValue Hi = DAG.getNode(
1435  HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy));
1436  return std::make_pair(Lo, Hi);
1437 }
1438 
1440  SelectionDAG &DAG) const {
1441  LoadSDNode *Load = cast<LoadSDNode>(Op);
1442  EVT VT = Op.getValueType();
1443 
1444 
1445  // If this is a 2 element vector, we really want to scalarize and not create
1446  // weird 1 element vectors.
1447  if (VT.getVectorNumElements() == 2)
1448  return scalarizeVectorLoad(Load, DAG);
1449 
1450  SDValue BasePtr = Load->getBasePtr();
1451  EVT MemVT = Load->getMemoryVT();
1452  SDLoc SL(Op);
1453 
1454  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1455 
1456  EVT LoVT, HiVT;
1457  EVT LoMemVT, HiMemVT;
1458  SDValue Lo, Hi;
1459 
1460  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1461  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1462  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1463 
1464  unsigned Size = LoMemVT.getStoreSize();
1465  unsigned BaseAlign = Load->getAlignment();
1466  unsigned HiAlign = MinAlign(BaseAlign, Size);
1467 
1468  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1469  Load->getChain(), BasePtr, SrcValue, LoMemVT,
1470  BaseAlign, Load->getMemOperand()->getFlags());
1471  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
1472  SDValue HiLoad =
1473  DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1474  HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1475  HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1476 
1477  auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
1478  SDValue Join;
1479  if (LoVT == HiVT) {
1480  // This is the case that the vector is power of two so was evenly split.
1481  Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1482  } else {
1483  Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1484  DAG.getConstant(0, SL, IdxTy));
1485  Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR
1487  SL, VT, Join, HiLoad,
1488  DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy));
1489  }
1490 
1491  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1492  LoLoad.getValue(1), HiLoad.getValue(1))};
1493 
1494  return DAG.getMergeValues(Ops, SL);
1495 }
1496 
1497 // Widen a vector load from vec3 to vec4.
1499  SelectionDAG &DAG) const {
1500  LoadSDNode *Load = cast<LoadSDNode>(Op);
1501  EVT VT = Op.getValueType();
1502  assert(VT.getVectorNumElements() == 3);
1503  SDValue BasePtr = Load->getBasePtr();
1504  EVT MemVT = Load->getMemoryVT();
1505  SDLoc SL(Op);
1506  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1507  unsigned BaseAlign = Load->getAlignment();
1508 
1509  EVT WideVT =
1510  EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1511  EVT WideMemVT =
1512  EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1513  SDValue WideLoad = DAG.getExtLoad(
1514  Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1515  WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1516  return DAG.getMergeValues(
1517  {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1518  DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))),
1519  WideLoad.getValue(1)},
1520  SL);
1521 }
1522 
1524  SelectionDAG &DAG) const {
1525  StoreSDNode *Store = cast<StoreSDNode>(Op);
1526  SDValue Val = Store->getValue();
1527  EVT VT = Val.getValueType();
1528 
1529  // If this is a 2 element vector, we really want to scalarize and not create
1530  // weird 1 element vectors.
1531  if (VT.getVectorNumElements() == 2)
1532  return scalarizeVectorStore(Store, DAG);
1533 
1534  EVT MemVT = Store->getMemoryVT();
1535  SDValue Chain = Store->getChain();
1536  SDValue BasePtr = Store->getBasePtr();
1537  SDLoc SL(Op);
1538 
1539  EVT LoVT, HiVT;
1540  EVT LoMemVT, HiMemVT;
1541  SDValue Lo, Hi;
1542 
1543  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1544  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1545  std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1546 
1547  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1548 
1549  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1550  unsigned BaseAlign = Store->getAlignment();
1551  unsigned Size = LoMemVT.getStoreSize();
1552  unsigned HiAlign = MinAlign(BaseAlign, Size);
1553 
1554  SDValue LoStore =
1555  DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1556  Store->getMemOperand()->getFlags());
1557  SDValue HiStore =
1558  DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1559  HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1560 
1561  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1562 }
1563 
1564 // This is a shortcut for integer division because we have fast i32<->f32
1565 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1566 // float is enough to accurately represent up to a 24-bit signed integer.
1568  bool Sign) const {
1569  SDLoc DL(Op);
1570  EVT VT = Op.getValueType();
1571  SDValue LHS = Op.getOperand(0);
1572  SDValue RHS = Op.getOperand(1);
1573  MVT IntVT = MVT::i32;
1574  MVT FltVT = MVT::f32;
1575 
1576  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1577  if (LHSSignBits < 9)
1578  return SDValue();
1579 
1580  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1581  if (RHSSignBits < 9)
1582  return SDValue();
1583 
1584  unsigned BitSize = VT.getSizeInBits();
1585  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1586  unsigned DivBits = BitSize - SignBits;
1587  if (Sign)
1588  ++DivBits;
1589 
1592 
1593  SDValue jq = DAG.getConstant(1, DL, IntVT);
1594 
1595  if (Sign) {
1596  // char|short jq = ia ^ ib;
1597  jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1598 
1599  // jq = jq >> (bitsize - 2)
1600  jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1601  DAG.getConstant(BitSize - 2, DL, VT));
1602 
1603  // jq = jq | 0x1
1604  jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1605  }
1606 
1607  // int ia = (int)LHS;
1608  SDValue ia = LHS;
1609 
1610  // int ib, (int)RHS;
1611  SDValue ib = RHS;
1612 
1613  // float fa = (float)ia;
1614  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1615 
1616  // float fb = (float)ib;
1617  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1618 
1619  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1620  fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1621 
1622  // fq = trunc(fq);
1623  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1624 
1625  // float fqneg = -fq;
1626  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1627 
1628  // float fr = mad(fqneg, fb, fa);
1629  unsigned OpCode = Subtarget->hasFP32Denormals() ?
1631  (unsigned)ISD::FMAD;
1632  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1633 
1634  // int iq = (int)fq;
1635  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1636 
1637  // fr = fabs(fr);
1638  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1639 
1640  // fb = fabs(fb);
1641  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1642 
1643  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1644 
1645  // int cv = fr >= fb;
1646  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1647 
1648  // jq = (cv ? jq : 0);
1649  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1650 
1651  // dst = iq + jq;
1652  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1653 
1654  // Rem needs compensation, it's easier to recompute it
1655  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1656  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1657 
1658  // Truncate to number of bits this divide really is.
1659  if (Sign) {
1660  SDValue InRegSize
1661  = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1662  Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1663  Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1664  } else {
1665  SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1666  Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1667  Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1668  }
1669 
1670  return DAG.getMergeValues({ Div, Rem }, DL);
1671 }
1672 
1674  SelectionDAG &DAG,
1676  SDLoc DL(Op);
1677  EVT VT = Op.getValueType();
1678 
1679  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1680 
1681  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1682 
1683  SDValue One = DAG.getConstant(1, DL, HalfVT);
1684  SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1685 
1686  //HiLo split
1687  SDValue LHS = Op.getOperand(0);
1688  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1689  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1690 
1691  SDValue RHS = Op.getOperand(1);
1692  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1693  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1694 
1695  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1696  DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1697 
1698  SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1699  LHS_Lo, RHS_Lo);
1700 
1701  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1702  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1703 
1704  Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1705  Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1706  return;
1707  }
1708 
1709  if (isTypeLegal(MVT::i64)) {
1710  // Compute denominator reciprocal.
1711  unsigned FMAD = Subtarget->hasFP32Denormals() ?
1713  (unsigned)ISD::FMAD;
1714 
1715  SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1716  SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1717  SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1718  DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1719  Cvt_Lo);
1720  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1721  SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1722  DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1723  SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1724  DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1725  SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1726  SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1727  DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1728  Mul1);
1729  SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1730  SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1731  SDValue Rcp64 = DAG.getBitcast(VT,
1732  DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1733 
1734  SDValue Zero64 = DAG.getConstant(0, DL, VT);
1735  SDValue One64 = DAG.getConstant(1, DL, VT);
1736  SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1737  SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1738 
1739  SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1740  SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1741  SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1742  SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1743  Zero);
1744  SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1745  One);
1746 
1747  SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1748  Mulhi1_Lo, Zero1);
1749  SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1750  Mulhi1_Hi, Add1_Lo.getValue(1));
1751  SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
1752  SDValue Add1 = DAG.getBitcast(VT,
1753  DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1754 
1755  SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1756  SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1757  SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1758  Zero);
1759  SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1760  One);
1761 
1762  SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1763  Mulhi2_Lo, Zero1);
1764  SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
1765  Mulhi2_Hi, Add1_Lo.getValue(1));
1766  SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
1767  Zero, Add2_Lo.getValue(1));
1768  SDValue Add2 = DAG.getBitcast(VT,
1769  DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1770  SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1771 
1772  SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1773 
1774  SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1775  SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1776  SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1777  Mul3_Lo, Zero1);
1778  SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1779  Mul3_Hi, Sub1_Lo.getValue(1));
1780  SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1781  SDValue Sub1 = DAG.getBitcast(VT,
1782  DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1783 
1784  SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1785  SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1786  ISD::SETUGE);
1787  SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1788  ISD::SETUGE);
1789  SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1790 
1791  // TODO: Here and below portions of the code can be enclosed into if/endif.
1792  // Currently control flow is unconditional and we have 4 selects after
1793  // potential endif to substitute PHIs.
1794 
1795  // if C3 != 0 ...
1796  SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1797  RHS_Lo, Zero1);
1798  SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1799  RHS_Hi, Sub1_Lo.getValue(1));
1800  SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1801  Zero, Sub2_Lo.getValue(1));
1802  SDValue Sub2 = DAG.getBitcast(VT,
1803  DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1804 
1805  SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1806 
1807  SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1808  ISD::SETUGE);
1809  SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1810  ISD::SETUGE);
1811  SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1812 
1813  // if (C6 != 0)
1814  SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1815 
1816  SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1817  RHS_Lo, Zero1);
1818  SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1819  RHS_Hi, Sub2_Lo.getValue(1));
1820  SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1821  Zero, Sub3_Lo.getValue(1));
1822  SDValue Sub3 = DAG.getBitcast(VT,
1823  DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1824 
1825  // endif C6
1826  // endif C3
1827 
1828  SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1829  SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1830 
1831  SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1832  SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1833 
1834  Results.push_back(Div);
1835  Results.push_back(Rem);
1836 
1837  return;
1838  }
1839 
1840  // r600 expandion.
1841  // Get Speculative values
1842  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1843  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1844 
1845  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1846  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1847  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1848 
1849  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1850  SDValue DIV_Lo = Zero;
1851 
1852  const unsigned halfBitWidth = HalfVT.getSizeInBits();
1853 
1854  for (unsigned i = 0; i < halfBitWidth; ++i) {
1855  const unsigned bitPos = halfBitWidth - i - 1;
1856  SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1857  // Get value of high bit
1858  SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1859  HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
1860  HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1861 
1862  // Shift
1863  REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1864  // Add LHS high bit
1865  REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1866 
1867  SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1868  SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
1869 
1870  DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1871 
1872  // Update REM
1873  SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1874  REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1875  }
1876 
1877  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1878  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1879  Results.push_back(DIV);
1880  Results.push_back(REM);
1881 }
1882 
1884  SelectionDAG &DAG) const {
1885  SDLoc DL(Op);
1886  EVT VT = Op.getValueType();
1887 
1888  if (VT == MVT::i64) {
1890  LowerUDIVREM64(Op, DAG, Results);
1891  return DAG.getMergeValues(Results, DL);
1892  }
1893 
1894  if (VT == MVT::i32) {
1895  if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1896  return Res;
1897  }
1898 
1899  SDValue Num = Op.getOperand(0);
1900  SDValue Den = Op.getOperand(1);
1901 
1902  // RCP = URECIP(Den) = 2^32 / Den + e
1903  // e is rounding error.
1904  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1905 
1906  // RCP_LO = mul(RCP, Den) */
1907  SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
1908 
1909  // RCP_HI = mulhu (RCP, Den) */
1910  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1911 
1912  // NEG_RCP_LO = -RCP_LO
1913  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
1914  RCP_LO);
1915 
1916  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1917  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1918  NEG_RCP_LO, RCP_LO,
1919  ISD::SETEQ);
1920  // Calculate the rounding error from the URECIP instruction
1921  // E = mulhu(ABS_RCP_LO, RCP)
1922  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1923 
1924  // RCP_A_E = RCP + E
1925  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1926 
1927  // RCP_S_E = RCP - E
1928  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1929 
1930  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
1931  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1932  RCP_A_E, RCP_S_E,
1933  ISD::SETEQ);
1934  // Quotient = mulhu(Tmp0, Num)
1935  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1936 
1937  // Num_S_Remainder = Quotient * Den
1938  SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
1939 
1940  // Remainder = Num - Num_S_Remainder
1941  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1942 
1943  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1944  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1945  DAG.getConstant(-1, DL, VT),
1946  DAG.getConstant(0, DL, VT),
1947  ISD::SETUGE);
1948  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
1949  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1950  Num_S_Remainder,
1951  DAG.getConstant(-1, DL, VT),
1952  DAG.getConstant(0, DL, VT),
1953  ISD::SETUGE);
1954  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1955  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1956  Remainder_GE_Zero);
1957 
1958  // Calculate Division result:
1959 
1960  // Quotient_A_One = Quotient + 1
1961  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1962  DAG.getConstant(1, DL, VT));
1963 
1964  // Quotient_S_One = Quotient - 1
1965  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1966  DAG.getConstant(1, DL, VT));
1967 
1968  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1969  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1970  Quotient, Quotient_A_One, ISD::SETEQ);
1971 
1972  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1973  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1974  Quotient_S_One, Div, ISD::SETEQ);
1975 
1976  // Calculate Rem result:
1977 
1978  // Remainder_S_Den = Remainder - Den
1979  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1980 
1981  // Remainder_A_Den = Remainder + Den
1982  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1983 
1984  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1985  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1986  Remainder, Remainder_S_Den, ISD::SETEQ);
1987 
1988  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1989  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1990  Remainder_A_Den, Rem, ISD::SETEQ);
1991  SDValue Ops[2] = {
1992  Div,
1993  Rem
1994  };
1995  return DAG.getMergeValues(Ops, DL);
1996 }
1997 
1999  SelectionDAG &DAG) const {
2000  SDLoc DL(Op);
2001  EVT VT = Op.getValueType();
2002 
2003  SDValue LHS = Op.getOperand(0);
2004  SDValue RHS = Op.getOperand(1);
2005 
2006  SDValue Zero = DAG.getConstant(0, DL, VT);
2007  SDValue NegOne = DAG.getConstant(-1, DL, VT);
2008 
2009  if (VT == MVT::i32) {
2010  if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2011  return Res;
2012  }
2013 
2014  if (VT == MVT::i64 &&
2015  DAG.ComputeNumSignBits(LHS) > 32 &&
2016  DAG.ComputeNumSignBits(RHS) > 32) {
2017  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2018 
2019  //HiLo split
2020  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2021  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2022  SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2023  LHS_Lo, RHS_Lo);
2024  SDValue Res[2] = {
2025  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2026  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2027  };
2028  return DAG.getMergeValues(Res, DL);
2029  }
2030 
2031  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2032  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2033  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2034  SDValue RSign = LHSign; // Remainder sign is the same as LHS
2035 
2036  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2037  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2038 
2039  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2040  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2041 
2042  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2043  SDValue Rem = Div.getValue(1);
2044 
2045  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2046  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2047 
2048  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2049  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2050 
2051  SDValue Res[2] = {
2052  Div,
2053  Rem
2054  };
2055  return DAG.getMergeValues(Res, DL);
2056 }
2057 
2058 // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
2060  SDLoc SL(Op);
2061  EVT VT = Op.getValueType();
2062  SDValue X = Op.getOperand(0);
2063  SDValue Y = Op.getOperand(1);
2064 
2065  // TODO: Should this propagate fast-math-flags?
2066 
2067  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
2068  SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
2069  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
2070 
2071  return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
2072 }
2073 
2075  SDLoc SL(Op);
2076  SDValue Src = Op.getOperand(0);
2077 
2078  // result = trunc(src)
2079  // if (src > 0.0 && src != result)
2080  // result += 1.0
2081 
2082  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2083 
2084  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2085  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2086 
2087  EVT SetCCVT =
2089 
2090  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2091  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2092  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2093 
2094  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2095  // TODO: Should this propagate fast-math-flags?
2096  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2097 }
2098 
2100  SelectionDAG &DAG) {
2101  const unsigned FractBits = 52;
2102  const unsigned ExpBits = 11;
2103 
2104  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2105  Hi,
2106  DAG.getConstant(FractBits - 32, SL, MVT::i32),
2107  DAG.getConstant(ExpBits, SL, MVT::i32));
2108  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2109  DAG.getConstant(1023, SL, MVT::i32));
2110 
2111  return Exp;
2112 }
2113 
// AMDGPUTargetLowering::LowerFTRUNC (f64): implement trunc by clearing the
// fraction bits that lie below the value's exponent in the raw i64 payload.
// NOTE(review): the defining/signature line (extracted line 2114) was lost in
// HTML extraction; body kept as-is — confirm against upstream.
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);

  SDValue Exp = extractF64Exponent(Hi, SL, DAG);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);

  // Extend back to 64-bits.
  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);

  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
  const SDValue FractMask
    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);

  // Shift the fraction mask right by the exponent so only sub-integer bits
  // remain set, then clear those bits from the payload.
  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);

  EVT SetCCVT =
  // NOTE(review): continuation line (extracted line 2150, presumably a
  // getSetCCResultType(...) call) was lost in extraction.

  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);

  // Negative exponent -> magnitude below 1.0, keep only the sign; exponent
  // above 51 -> value is already integral.
  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);

  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);

  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
}
2162 
// AMDGPUTargetLowering::LowerFRINT (f64): round-to-nearest-even via the
// classic add-then-subtract of copysign(2^52, x).
// NOTE(review): the signature line (extracted line 2163) was lost in
// extraction; body kept as-is — confirm against upstream.
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  // 2^52: adding this magnitude forces the fractional bits to round away.
  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);

  // TODO: Should this propagate fast-math-flags?

  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);

  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);

  // Largest f64 magnitude that can still carry fractional bits; anything
  // bigger is already integral and must be passed through unchanged.
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);

  EVT SetCCVT =
  // NOTE(review): continuation line (extracted line 2184, presumably a
  // getSetCCResultType(...) call) was lost in extraction.
  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);

  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
}
2189 
// AMDGPUTargetLowering::LowerFNEARBYINT: forward to FRINT.
// NOTE(review): the signature line (extracted line 2190) was lost in
// extraction; body kept as-is — confirm against upstream.
  // FNEARBYINT and FRINT are the same, except in their handling of FP
  // exceptions. Those aren't really meaningful for us, and OpenCL only has
  // rint, so just treat them as equivalent.
  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
}
2196 
// XXX - May require not supporting f32 denormals?

// Don't handle v2f16. The extra instructions to scalarize and repack around the
// compare and vselect end up producing worse code than scalarizing the whole
// operation.
// AMDGPUTargetLowering::LowerFROUND32_16 (f32/f16): round half away from zero
// as trunc(x) + copysign(1.0, x) when |x - trunc(x)| >= 0.5.
// NOTE(review): the signature line (extracted line 2202) was lost in
// extraction; body kept as-is — confirm against upstream.
  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);

  // TODO: Should this propagate fast-math-flags?

  SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);

  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
  const SDValue One = DAG.getConstantFP(1.0, SL, VT);
  const SDValue Half = DAG.getConstantFP(0.5, SL, VT);

  // +/-1.0 with the sign of the input, added only when the fractional part
  // reaches the rounding threshold.
  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);

  SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);

  return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
}
2230 
// AMDGPUTargetLowering::LowerFROUND64 (f64): round half away from zero by
// integer manipulation of the raw f64 bit pattern.
// NOTE(review): the signature line (extracted line 2231) was lost in
// extraction; body kept as-is — confirm against upstream.
  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);

  SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
  const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
  EVT SetCCVT =
  // NOTE(review): continuation line (extracted line 2242, presumably a
  // getSetCCResultType(...) call) was lost in extraction.

  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);

  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);

  SDValue Exp = extractF64Exponent(Hi, SL, DAG);

  // 52-bit fraction mask; shifted right by the exponent it covers only the
  // sub-integer bits of this particular value.
  const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
                                       MVT::i64);

  SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
  // The increment that, when added, rounds the magnitude away from zero.
  SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
                          DAG.getConstant(INT64_C(0x0008000000000000), SL,
                                          MVT::i64),
                          Exp);

  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
  // Only bump the value when a fractional part actually exists.
  SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
                              DAG.getConstant(0, SL, MVT::i64), Tmp0,
                              ISD::SETNE);

  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
                             D, DAG.getConstant(0, SL, MVT::i64));
  SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);

  K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
  K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);

  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
  SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);

  // Exponent of exactly -1 means 0.5 <= |x| < 1.0, which rounds to +/-1.0;
  // any smaller exponent rounds to +/-0.0.
  SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
                            ExpEqNegOne,
                            DAG.getConstantFP(1.0, SL, MVT::f64),
                            DAG.getConstantFP(0.0, SL, MVT::f64));

  SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);

  // Exponent > 51 -> already integral, pass the input through.
  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);

  return K;
}
2287 
// AMDGPUTargetLowering::LowerFROUND: dispatch on the result type to the
// 32/16-bit or 64-bit implementation.
// NOTE(review): the signature line (extracted line 2288) was lost in
// extraction; body kept as-is — confirm against upstream.
  EVT VT = Op.getValueType();

  if (VT == MVT::f32 || VT == MVT::f16)
    return LowerFROUND32_16(Op, DAG);

  if (VT == MVT::f64)
    return LowerFROUND64(Op, DAG);

  llvm_unreachable("unhandled type");
}
2299 
// AMDGPUTargetLowering::LowerFFLOOR (f64): trunc, then subtract 1.0 for
// negative non-integral inputs.
// NOTE(review): the signature line (extracted line 2300) was lost in
// extraction; body kept as-is — confirm against upstream.
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  // result = trunc(src);
  // if (src < 0.0 && src != result)
  //   result += -1.0.

  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);

  EVT SetCCVT =
  // NOTE(review): continuation line (extracted line 2314, presumably a
  // getSetCCResultType(...) call) was lost in extraction.

  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);

  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
2324 
// AMDGPUTargetLowering::LowerFLOG: lower log-of-any-base as
// log2(x) * (1 / log2(base)), with the caller supplying the inverted base.
// NOTE(review): the first signature line (extracted line 2325) was lost in
// extraction; only the trailing parameter line remains.
                                          double Log2BaseInverted) const {
  EVT VT = Op.getValueType();

  SDLoc SL(Op);
  SDValue Operand = Op.getOperand(0);
  SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);

  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
}
2336 
2337 // Return M_LOG2E of appropriate type
2338 static SDValue getLog2EVal(SelectionDAG &DAG, const SDLoc &SL, EVT VT) {
2339  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
2340  case MVT::f32:
2341  return DAG.getConstantFP(1.44269504088896340735992468100189214f, SL, VT);
2342  case MVT::f16:
2343  return DAG.getConstantFP(
2344  APFloat(APFloat::IEEEhalf(), "1.44269504088896340735992468100189214"),
2345  SL, VT);
2346  case MVT::f64:
2347  return DAG.getConstantFP(
2348  APFloat(APFloat::IEEEdouble(), "0x1.71547652b82fep+0"), SL, VT);
2349  default:
2350  llvm_unreachable("unsupported fp type");
2351  }
2352 }
2353 
// exp2(M_LOG2E_F * f);
// AMDGPUTargetLowering::LowerFEXP: lower exp(x) as exp2(log2(e) * x),
// preserving the node's fast-math flags on both new nodes.
// NOTE(review): the signature line (extracted line 2355) was lost in
// extraction; body kept as-is — confirm against upstream.
  EVT VT = Op.getValueType();
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  const SDValue K = getLog2EVal(DAG, SL, VT);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
  return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
}
2364 
2365 static bool isCtlzOpc(unsigned Opc) {
2366  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2367 }
2368 
2369 static bool isCttzOpc(unsigned Opc) {
2370  return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2371 }
2372 
// AMDGPUTargetLowering::LowerCTLZ_CTTZ: lower ctlz/cttz to the target
// FFBH_U32/FFBL_B32 nodes, splitting 64-bit inputs into two 32-bit halves.
// NOTE(review): the signature line (extracted line 2373) was lost in
// extraction; body kept as-is — confirm against upstream.
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);
  bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
  // NOTE(review): the second half of this condition (extracted line 2377,
  // presumably Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF) was lost in
  // extraction.

  unsigned ISDOpc, NewOpc;
  if (isCtlzOpc(Op.getOpcode())) {
    ISDOpc = ISD::CTLZ_ZERO_UNDEF;
    NewOpc = AMDGPUISD::FFBH_U32;
  } else if (isCttzOpc(Op.getOpcode())) {
    ISDOpc = ISD::CTTZ_ZERO_UNDEF;
    NewOpc = AMDGPUISD::FFBL_B32;
  } else
    llvm_unreachable("Unexpected OPCode!!!");


  // The 32-bit zero-undef case maps directly onto the target node.
  if (ZeroUndef && Src.getValueType() == MVT::i32)
    return DAG.getNode(NewOpc, SL, MVT::i32, Src);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
                                   *DAG.getContext(), MVT::i32);

  // ctlz scans from the high half first; cttz from the low half.
  SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
  SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);

  SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
  SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);

  const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
  SDValue Add, NewOpr;
  if (isCtlzOpc(Op.getOpcode())) {
    Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
    // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
    NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
  } else {
    Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
    // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
    NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
  }

  if (!ZeroUndef) {
    // Test if the full 64-bit input is zero.

    // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
    // which we probably don't want.
    SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
    SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
    SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);

    // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
    // with the same cycles, otherwise it is slower.
    // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
    //                                  DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);

    // NOTE(review): despite the name, this holds 64 (the defined result for a
    // zero 64-bit input) and shadows the outer Bits32 — consider renaming.
    const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);

    // The instruction returns -1 for 0 input, but the defined intrinsic
    // behavior is to return the number of bits.
    NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
                         SrcIsZero, Bits32, NewOpr);
  }

  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
}
2446 
// AMDGPUTargetLowering::LowerINT_TO_FP32: software i64 -> f32 conversion with
// round-to-nearest-even, following the OpenCL reference code below.
// NOTE(review): the first signature line (extracted line 2447) was lost in
// extraction; only the trailing 'bool Signed' parameter line remains.
                                               bool Signed) const {
  // Unsigned
  // cul2f(ulong u)
  //{
  //  uint lz = clz(u);
  //  uint e = (u != 0) ? 127U + 63U - lz : 0;
  //  u = (u << lz) & 0x7fffffffffffffffUL;
  //  ulong t = u & 0xffffffffffUL;
  //  uint v = (e << 23) | (uint)(u >> 40);
  //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
  //  return as_float(v + r);
  //}
  // Signed
  // cl2f(long l)
  //{
  //  long s = l >> 63;
  //  float r = cul2f((l + s) ^ s);
  //  return s ? -r : r;
  //}

  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);
  SDValue L = Src;

  SDValue S;
  if (Signed) {
    // Fold the sign out: convert |l| and re-apply the sign at the end.
    const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
    S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);

    SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
    L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
  }

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
                                   *DAG.getContext(), MVT::f32);


  SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
  SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
  SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
  LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);

  // Biased f32 exponent: 127 (bias) + 63 - lz, or 0 when the input is zero.
  SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
  SDValue E = DAG.getSelect(SL, MVT::i32,
    DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
    DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
    ZeroI32);

  // Normalize: shift the leading 1 to the top bit, then drop it.
  SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
    DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
    DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));

  // t: the 40 bits below the f32 mantissa, used to pick the rounding
  // direction.
  SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
                          DAG.getConstant(0xffffffffffULL, SL, MVT::i64));

  SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
                             U, DAG.getConstant(40, SL, MVT::i64));

  // v: biased exponent in bits 23.. plus the 23-bit mantissa.
  SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
    DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
    DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl));

  SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
  SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
  SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);

  SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);

  // Rounding increment: 1 above the halfway point, the low bit of v exactly
  // at the halfway point (ties to even), 0 below it.
  SDValue R = DAG.getSelect(SL, MVT::i32,
    RCmp,
    One,
    DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
  R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
  R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);

  if (!Signed)
    return R;

  // Re-apply the saved sign for the signed variant.
  SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
  return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
}
2531 
// AMDGPUTargetLowering::LowerINT_TO_FP64: i64 -> f64 via two 32-bit
// conversions combined as hi * 2^32 + lo.
// NOTE(review): the first signature line (extracted line 2532) was lost in
// extraction; only the trailing 'bool Signed' parameter line remains.
                                               bool Signed) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);

  // NOTE(review): extracted lines 2539 and 2541 (presumably the
  // EXTRACT_VECTOR_ELT definitions of Lo and Hi) were lost in extraction;
  // only their constant-index argument lines remain.
                   DAG.getConstant(0, SL, MVT::i32));
                   DAG.getConstant(1, SL, MVT::i32));

  // Only the high half carries the sign in the signed case.
  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
                              SL, MVT::f64, Hi);

  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);

  // Scale the converted high half by 2^32 before adding the low half.
  SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
                              DAG.getConstant(32, SL, MVT::i32));
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
}
2554 
// AMDGPUTargetLowering::LowerUINT_TO_FP: dispatch i64 -> fp conversions by
// destination type (f16 via f32, f32 software path, f64 split path).
// NOTE(review): the first signature line (extracted line 2555) was lost in
// extraction; only the trailing parameter line remains.
                                               SelectionDAG &DAG) const {
  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
         "operation should be legal");

  // TODO: Factor out code common with LowerSINT_TO_FP.

  EVT DestVT = Op.getValueType();
  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
    SDLoc DL(Op);
    SDValue Src = Op.getOperand(0);

    // Convert to f32 first, then round the result down to f16.
    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
    SDValue FPRound =
        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);

    return FPRound;
  }

  if (DestVT == MVT::f32)
    return LowerINT_TO_FP32(Op, DAG, false);

  assert(DestVT == MVT::f64);
  return LowerINT_TO_FP64(Op, DAG, false);
}
2581 
// AMDGPUTargetLowering::LowerSINT_TO_FP: signed counterpart of
// LowerUINT_TO_FP; same dispatch with Signed = true.
// NOTE(review): the first signature line (extracted line 2582) was lost in
// extraction; only the trailing parameter line remains.
                                              SelectionDAG &DAG) const {
  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
         "operation should be legal");

  // TODO: Factor out code common with LowerUINT_TO_FP.

  EVT DestVT = Op.getValueType();
  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
    SDLoc DL(Op);
    SDValue Src = Op.getOperand(0);

    // Convert to f32 first, then round the result down to f16.
    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
    SDValue FPRound =
        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);

    return FPRound;
  }

  if (DestVT == MVT::f32)
    return LowerINT_TO_FP32(Op, DAG, true);

  assert(DestVT == MVT::f64);
  return LowerINT_TO_FP64(Op, DAG, true);
}
2608 
// AMDGPUTargetLowering::LowerFP64_TO_INT: f64 -> i64 by converting the high
// and low 32-bit portions separately and packing them.
// NOTE(review): the first signature line (extracted line 2609) was lost in
// extraction; only the trailing 'bool Signed' parameter line remains.
                                               bool Signed) const {
  SDLoc SL(Op);

  SDValue Src = Op.getOperand(0);

  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  // K0 = 2^-32, K1 = -2^32 (from the bit patterns below): scale down to get
  // the high word, then fma to recover the low-word remainder.
  SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
                                 MVT::f64);
  SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
                                 MVT::f64);
  // TODO: Should this propagate fast-math-flags?
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);

  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);


  SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);

  // Only the high word is sign-sensitive; the low word is always unsigned.
  SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
                           MVT::i32, FloorMul);
  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);

  SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});

  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
}
2637 
// AMDGPUTargetLowering::LowerFP_TO_FP16: f32 goes to the target node; f64 is
// converted by an explicit software round-to-nearest-even sequence.
// NOTE(review): the signature line (extracted line 2638) was lost in
// extraction; body kept as-is — confirm against upstream.
  SDLoc DL(Op);
  SDValue N0 = Op.getOperand(0);

  // Convert to target node to get known bits
  if (N0.getValueType() == MVT::f32)
    return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);

  // NOTE(review): extracted lines 2645-2646 were lost in extraction —
  // presumably a fast-math guard opening this braced block (e.g. checking
  // unsafe-FP-math options); confirm upstream.
    // There is a generic expand for FP_TO_FP16 with unsafe fast math.
    return SDValue();
  }

  // NOTE(review): extracted line 2651 was lost in extraction.

  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;
  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
  SDValue One = DAG.getConstant(1, DL, MVT::i32);
  // Split the f64 payload: U keeps the low 32 bits, UH the high 32 bits
  // (sign, exponent, top of the mantissa).
  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
                           DAG.getConstant(32, DL, MVT::i64));
  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
                          DAG.getConstant(20, DL, MVT::i64));
  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
                  DAG.getConstant(ExpMask, DL, MVT::i32));
  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
                  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));

  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
                          DAG.getConstant(8, DL, MVT::i32));
  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
                  DAG.getConstant(0xffe, DL, MVT::i32));

  // Collect the mantissa bits discarded by the narrowing into a sticky bit.
  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
                                  DAG.getConstant(0x1ff, DL, MVT::i32));
  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);

  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);

  // (M != 0 ? 0x0200 : 0) | 0x7c00;
  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
      DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
                      Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));

  // N = M | (E << 12);
  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
      DAG.getNode(ISD::SHL, DL, MVT::i32, E,
                  DAG.getConstant(12, DL, MVT::i32)));

  // B = clamp(1-E, 0, 13);
  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
                                  One, E);
  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
                  DAG.getConstant(13, DL, MVT::i32));

  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
                                   DAG.getConstant(0x1000, DL, MVT::i32));

  // Denormal path: shift out B bits, keeping a sticky bit for rounding.
  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);

  // Pick the denormal encoding when E < 1, else the normal encoding.
  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
                              DAG.getConstant(0x7, DL, MVT::i32));
  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
                  DAG.getConstant(2, DL, MVT::i32));
  // Round to nearest, ties to even, using the bottom three bits.
  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
                               One, Zero, ISD::SETEQ);
  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
                               One, Zero, ISD::SETGT);
  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);

  // Overflowed exponent -> infinity (0x7c00); exponent 1039 -> NaN pattern I.
  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
                      DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
                      I, V, ISD::SETEQ);

  // Extract the sign bit.
  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
                             DAG.getConstant(16, DL, MVT::i32));
  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
                     DAG.getConstant(0x8000, DL, MVT::i32));

  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
}
2736 
// AMDGPUTargetLowering::LowerFP_TO_SINT: f16 is promoted through f32;
// f64 -> i64 uses the split conversion; anything else falls through.
// NOTE(review): the first signature line (extracted line 2737) was lost in
// extraction; only the trailing parameter line remains.
                                              SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);

  // TODO: Factor out code common with LowerFP_TO_UINT.

  EVT SrcVT = Src.getValueType();
  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
    SDLoc DL(Op);

    // Widen f16 to f32 and convert from there.
    SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
    SDValue FpToInt32 =
        DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);

    return FpToInt32;
  }

  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
    return LowerFP64_TO_INT(Op, DAG, true);

  return SDValue();
}
2759 
// AMDGPUTargetLowering::LowerFP_TO_UINT: unsigned counterpart of
// LowerFP_TO_SINT; same dispatch with Signed = false.
// NOTE(review): the first signature line (extracted line 2760) was lost in
// extraction; only the trailing parameter line remains.
                                              SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);

  // TODO: Factor out code common with LowerFP_TO_SINT.

  EVT SrcVT = Src.getValueType();
  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
    SDLoc DL(Op);

    // Widen f16 to f32 and convert from there.
    SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
    SDValue FpToInt32 =
        DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);

    return FpToInt32;
  }

  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
    return LowerFP64_TO_INT(Op, DAG, false);

  return SDValue();
}
2782 
// AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG: scalarize a vector
// sign_extend_inreg element by element and rebuild the vector.
// NOTE(review): the first signature line (extracted line 2783) was lost in
// extraction; only the trailing parameter line remains.
                                                     SelectionDAG &DAG) const {
  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();

  assert(VT.isVector());

  SDValue Src = Op.getOperand(0);
  SDLoc DL(Op);

  // TODO: Don't scalarize on Evergreen?
  unsigned NElts = VT.getVectorNumElements();
  // NOTE(review): extracted line 2796 (presumably the declaration of the
  // Args SmallVector filled below) was lost in extraction.
  DAG.ExtractVectorElements(Src, Args, 0, NElts);

  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
  for (unsigned I = 0; I < NElts; ++I)
    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);

  return DAG.getBuildVector(VT, DL, Args);
}
2805 
2806 //===----------------------------------------------------------------------===//
2807 // Custom DAG optimizations
2808 //===----------------------------------------------------------------------===//
2809 
2810 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2811  return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2812 }
2813 
// True if the value is known to fit in a signed 24-bit integer.
// NOTE(review): the second half of the return expression (extracted line
// 2818, presumably a numBitsSigned(...) comparison) was lost in extraction.
static bool isI24(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
                                     // as unsigned 24-bit values.
}
2820 
// Simplify a node that only consumes the low 24 bits of each operand by
// narrowing those operands to their demanded bits.
// NOTE(review): the second parameter line (extracted line 2822, presumably
// 'TargetLowering::DAGCombinerInfo &DCI) {') was lost in extraction.
static SDValue simplifyI24(SDNode *Node24,
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = Node24->getOperand(0);
  SDValue RHS = Node24->getOperand(1);

  // Only the low 24 bits of each operand matter to this node.
  APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);

  // First try to simplify using GetDemandedBits which allows the operands to
  // have other uses, but will only perform simplifications that involve
  // bypassing some nodes for this user.
  SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
  SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
  if (DemandedLHS || DemandedRHS)
    return DAG.getNode(Node24->getOpcode(), SDLoc(Node24), Node24->getVTList(),
                       DemandedLHS ? DemandedLHS : LHS,
                       DemandedRHS ? DemandedRHS : RHS);

  // Now try SimplifyDemandedBits which can simplify the nodes used by our
  // operands if this node is the only user.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
    return SDValue(Node24, 0);
  if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
    return SDValue(Node24, 0);

  return SDValue();
}
2849 
template <typename IntTy>
// Constant-fold a 32-bit bitfield extract; the signedness of IntTy selects
// sign- vs zero-extension of the extracted field.
// NOTE(review): the function-name/first-parameter line (extracted line 2851,
// presumably 'static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
// uint32_t Offset,') was lost in extraction.
                               uint32_t Width, const SDLoc &DL) {
  if (Width + Offset < 32) {
    // Shift the field to the top, then arithmetic/logical shift back down so
    // the extension kind follows IntTy.
    uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
    IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
    return DAG.getConstant(Result, DL, MVT::i32);
  }

  // Field reaches the top bit: a plain shift suffices.
  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
}
2861 
2862 static bool hasVolatileUser(SDNode *Val) {
2863  for (SDNode *U : Val->uses()) {
2864  if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2865  if (M->isVolatile())
2866  return true;
2867  }
2868  }
2869 
2870  return false;
2871 }
2872 
// Decide whether a memory access of type VT should be rewritten to an
// equivalent i32-based type by the load/store combines below.
// NOTE(review): the signature line (extracted line 2873, presumably
// 'bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {') was
// lost in extraction.
  // i32 vectors are the canonical memory type.
  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
    return false;

  if (!VT.isByteSized())
    return false;

  unsigned Size = VT.getStoreSize();

  // Scalar 1/2/4-byte accesses are already fine as-is.
  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
    return false;

  // Reject 3-byte types and any size above 4 that isn't dword-divisible.
  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
    return false;

  return true;
}
2891 
// Find a load or store from corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
// NOTE(review): the signature line(s) (extracted lines 2894-2895, presumably
// 'static MemSDNode *findMemSDNode(SDNode *N) {') were lost in extraction.
  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
    return MN;
  assert(isa<BuildVectorSDNode>(N));
  // Otherwise look through one level of (possibly bitcast) operands.
  for (SDValue V : N->op_values())
    if (MemSDNode *MN =
          dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
      return MN;
  llvm_unreachable("cannot find MemSDNode in the pattern!");
}
2905 
// Flat-address selector: try to fold a constant offset from Addr into the
// instruction's immediate offset field, within subtarget-specific ranges.
// NOTE(review): the first signature line (extracted line 2906) was lost in
// extraction; only the trailing parameter lines remain. 'IsSigned' used
// below is presumably a template or preceding parameter — confirm upstream.
                                         SelectionDAG &DAG,
                                         SDNode *N,
                                         SDValue Addr,
                                         SDValue &VAddr,
                                         SDValue &Offset,
                                         SDValue &SLC) const {
  const GCNSubtarget &ST =
  // NOTE(review): continuation line (extracted line 2914) was lost in
  // extraction.
  int64_t OffsetVal = 0;

  if (ST.hasFlatInstOffsets() &&
      (!ST.hasFlatSegmentOffsetBug() ||
  // NOTE(review): continuation line (extracted line 2919, the rest of the
  // segment-offset-bug condition) was lost in extraction.
      DAG.isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      // GFX10: signed 12-bit / unsigned 11-bit immediate offsets.
      if ((IsSigned && isInt<12>(COffsetVal)) ||
          (!IsSigned && isUInt<11>(COffsetVal))) {
        Addr = N0;
        OffsetVal = COffsetVal;
      }
    } else {
      // Pre-GFX10: signed 13-bit / unsigned 12-bit immediate offsets.
      if ((IsSigned && isInt<13>(COffsetVal)) ||
          (!IsSigned && isUInt<12>(COffsetVal))) {
        Addr = N0;
        OffsetVal = COffsetVal;
      }
    }
  }

  VAddr = Addr;
  Offset = DAG.getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
  SLC = DAG.getTargetConstant(0, SDLoc(), MVT::i1);

  return true;
}
2946 
// Replace load of an illegal type with a load of a bitcast to a friendlier
// type.
// NOTE(review): the signature line (extracted line 2949) was lost in
// extraction; body kept as-is — confirm against upstream.
  if (!DCI.isBeforeLegalize())
    return SDValue();

  LoadSDNode *LN = cast<LoadSDNode>(N);
  if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
    return SDValue();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = LN->getMemoryVT();

  unsigned Size = VT.getStoreSize();
  unsigned Align = LN->getAlignment();
  if (Align < Size && isTypeLegal(VT)) {
    bool IsFast;
    unsigned AS = LN->getAddressSpace();

    // Expand unaligned loads earlier than legalization. Due to visitation order
    // problems during legalization, the emitted instructions to pack and unpack
    // the bytes again are not eliminated in the case of an unaligned copy.
    // NOTE(review): the condition line (extracted line 2971, presumably a
    // negated allowsMisalignedMemoryAccesses(...) check) was lost in
    // extraction; only its argument continuation remains.
        VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return scalarizeVectorLoad(LN, DAG);

      SDValue Ops[2];
      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
      return DAG.getMergeValues(Ops, SDLoc(N));
    }

    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);

  // Load with the canonical type, then bitcast back so users see the old VT.
  SDValue NewLoad
    = DAG.getLoad(NewVT, SL, LN->getChain(),
                  LN->getBasePtr(), LN->getMemOperand());

  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
  DCI.CombineTo(N, BC, NewLoad.getValue(1));
  return SDValue(N, 0);
}
2998 
// Replace store of an illegal type with a store of a bitcast to a friendlier
// type.
// NOTE(review): the signature line (extracted line 3001) was lost in
// extraction; body kept as-is — confirm against upstream.
  if (!DCI.isBeforeLegalize())
    return SDValue();

  StoreSDNode *SN = cast<StoreSDNode>(N);
  if (SN->isVolatile() || !ISD::isNormalStore(SN))
    return SDValue();

  EVT VT = SN->getMemoryVT();
  unsigned Size = VT.getStoreSize();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  unsigned Align = SN->getAlignment();
  if (Align < Size && isTypeLegal(VT)) {
    bool IsFast;
    unsigned AS = SN->getAddressSpace();

    // Expand unaligned stores earlier than legalization. Due to visitation
    // order problems during legalization, the emitted instructions to pack and
    // unpack the bytes again are not eliminated in the case of an unaligned
    // copy.
    // NOTE(review): the condition line (extracted line 3024, presumably a
    // negated allowsMisalignedMemoryAccesses(...) check) was lost in
    // extraction; only its argument continuation remains.
        VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return scalarizeVectorStore(SN, DAG);

      return expandUnalignedStore(SN, DAG);
    }

    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
  SDValue Val = SN->getValue();

  //DCI.AddToWorklist(Val.getNode());

  // If the value has other users, give them a bitcast back to the original
  // type so they are unaffected by the store's retyping.
  bool OtherUses = !Val.hasOneUse();
  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
  if (OtherUses) {
    SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
    DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
  }

  return DAG.getStore(SN->getChain(), SL, CastVal,
                      SN->getBasePtr(), SN->getMemOperand());
}
3054 
// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
// issues.
// Hoist an assert[sz]ext above a truncate when the source is wide enough.
// NOTE(review): the signature line (extracted line 3058) was lost in
// extraction; body kept as-is — confirm against upstream.
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);

  // (vt2 (assertzext (truncate vt0:x), vt1)) ->
  //     (vt2 (truncate (assertzext vt0:x, vt1)))
  if (N0.getOpcode() == ISD::TRUNCATE) {
    SDValue N1 = N->getOperand(1);
    EVT ExtVT = cast<VTSDNode>(N1)->getVT();
    SDLoc SL(N);

    SDValue Src = N0.getOperand(0);
    EVT SrcVT = Src.getValueType();
    // Only valid when the pre-truncate type can hold the asserted bits.
    if (SrcVT.bitsGE(ExtVT)) {
      SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
      return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
    }
  }

  return SDValue();
}
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
// NOTE(review): the first signature line (extracted line 3082, presumably
// 'SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(') was lost in
// extraction; only the trailing parameter lines remain.
                                                 DAGCombinerInfo &DCI, const SDLoc &SL,
                                                 unsigned Opc, SDValue LHS,
                                                 uint32_t ValLo, uint32_t ValHi) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);

  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);

  // Apply the operation independently to each half.
  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);

  // Re-visit the ands. It's possible we eliminated one of them and it could
  // simplify the vector.
  DCI.AddToWorklist(Lo.getNode());
  DCI.AddToWorklist(Hi.getNode());

  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}
3104 
3106  DAGCombinerInfo &DCI) const {
3107  EVT VT = N->getValueType(0);
3108 
3110  if (!RHS)
3111  return SDValue();
3112 
3113  SDValue LHS = N->getOperand(0);
3114  unsigned RHSVal = RHS->getZExtValue();
3115  if (!RHSVal)
3116  return LHS;
3117 
3118  SDLoc SL(N);
3119  SelectionDAG &DAG = DCI.DAG;
3120 
3121  switch (LHS->getOpcode()) {
3122  default:
3123  break;
3124  case ISD::ZERO_EXTEND:
3125  case ISD::SIGN_EXTEND:
3126  case ISD::ANY_EXTEND: {
3127  SDValue X = LHS->getOperand(0);
3128 
3129  if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3131  // Prefer build_vector as the canonical form if packed types are legal.
3132  // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
3133  SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3134  { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3135  return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3136  }
3137 
3138  // shl (ext x) => zext (shl x), if shift does not overflow int
3139  if (VT != MVT::i64)
3140  break;
3141  KnownBits Known = DAG.computeKnownBits(X);
3142  unsigned LZ = Known.countMinLeadingZeros();
3143  if (LZ < RHSVal)
3144  break;
3145  EVT XVT = X.getValueType();
3146  SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3147  return DAG.getZExtOrTrunc(Shl, SL, VT);
3148  }
3149  }
3150 
3151  if (VT != MVT::i64)
3152  return SDValue();
3153 
3154  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3155 
3156  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3157  // common case, splitting this into a move and a 32-bit shift is faster and
3158  // the same code size.
3159  if (RHSVal < 32)
3160  return SDValue();
3161 
3162  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3163 
3164  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3165  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3166 
3167  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3168 
3169  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3170  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3171 }
3172 
3174  DAGCombinerInfo &DCI) const {
3175  if (N->getValueType(0) != MVT::i64)
3176  return SDValue();
3177 
3178  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3179  if (!RHS)
3180  return SDValue();
3181 
3182  SelectionDAG &DAG = DCI.DAG;
3183  SDLoc SL(N);
3184  unsigned RHSVal = RHS->getZExtValue();
3185 
3186  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3187  if (RHSVal == 32) {
3188  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3189  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3190  DAG.getConstant(31, SL, MVT::i32));
3191 
3192  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3193  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3194  }
3195 
3196  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3197  if (RHSVal == 63) {
3198  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3199  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3200  DAG.getConstant(31, SL, MVT::i32));
3201  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3202  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3203  }
3204 
3205  return SDValue();
3206 }
3207 
3209  DAGCombinerInfo &DCI) const {
3210  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3211  if (!RHS)
3212  return SDValue();
3213 
3214  EVT VT = N->getValueType(0);
3215  SDValue LHS = N->getOperand(0);
3216  unsigned ShiftAmt = RHS->getZExtValue();
3217  SelectionDAG &DAG = DCI.DAG;
3218  SDLoc SL(N);
3219 
3220  // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
3221  // this improves the ability to match BFE patterns in isel.
3222  if (LHS.getOpcode() == ISD::AND) {
3223  if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3224  if (Mask->getAPIntValue().isShiftedMask() &&
3225  Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
3226  return DAG.getNode(
3227  ISD::AND, SL, VT,
3228  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3229  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3230  }
3231  }
3232  }
3233 
3234  if (VT != MVT::i64)
3235  return SDValue();
3236 
3237  if (ShiftAmt < 32)
3238  return SDValue();
3239 
3240  // srl i64:x, C for C >= 32
3241  // =>
3242  // build_pair (srl hi_32(x), C - 32), 0
3243  SDValue One = DAG.getConstant(1, SL, MVT::i32);
3244  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3245 
3246  SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
3247  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
3248 
3249  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3250  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3251 
3252  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3253 
3254  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3255 }
3256 
3258  SDNode *N, DAGCombinerInfo &DCI) const {
3259  SDLoc SL(N);
3260  SelectionDAG &DAG = DCI.DAG;
3261  EVT VT = N->getValueType(0);
3262  SDValue Src = N->getOperand(0);
3263 
3264  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3265  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3266  SDValue Vec = Src.getOperand(0);
3267  if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3268  SDValue Elt0 = Vec.getOperand(0);
3269  EVT EltVT = Elt0.getValueType();
3270  if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
3271  if (EltVT.isFloatingPoint()) {
3272  Elt0 = DAG.getNode(ISD::BITCAST, SL,
3273  EltVT.changeTypeToInteger(), Elt0);
3274  }
3275 
3276  return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3277  }
3278  }
3279  }
3280 
3281  // Equivalent of above for accessing the high element of a vector as an
3282  // integer operation.
3283  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3284  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3285  if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3286  if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3287  SDValue BV = stripBitcast(Src.getOperand(0));
3288  if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3289  BV.getValueType().getVectorNumElements() == 2) {
3290  SDValue SrcElt = BV.getOperand(1);
3291  EVT SrcEltVT = SrcElt.getValueType();
3292  if (SrcEltVT.isFloatingPoint()) {
3293  SrcElt = DAG.getNode(ISD::BITCAST, SL,
3294  SrcEltVT.changeTypeToInteger(), SrcElt);
3295  }
3296 
3297  return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3298  }
3299  }
3300  }
3301  }
3302 
3303  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3304  //
3305  // i16 (trunc (srl i64:x, K)), K <= 16 ->
3306  // i16 (trunc (srl (i32 (trunc x), K)))
3307  if (VT.getScalarSizeInBits() < 32) {
3308  EVT SrcVT = Src.getValueType();
3309  if (SrcVT.getScalarSizeInBits() > 32 &&
3310  (Src.getOpcode() == ISD::SRL ||
3311  Src.getOpcode() == ISD::SRA ||
3312  Src.getOpcode() == ISD::SHL)) {
3313  SDValue Amt = Src.getOperand(1);
3314  KnownBits Known = DAG.computeKnownBits(Amt);
3315  unsigned Size = VT.getScalarSizeInBits();
3316  if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3317  (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
3318  EVT MidVT = VT.isVector() ?
3321 
3322  EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3323  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3324  Src.getOperand(0));
3325  DCI.AddToWorklist(Trunc.getNode());
3326 
3327  if (Amt.getValueType() != NewShiftVT) {
3328  Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3329  DCI.AddToWorklist(Amt.getNode());
3330  }
3331 
3332  SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3333  Trunc, Amt);
3334  return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3335  }
3336  }
3337  }
3338 
3339  return SDValue();
3340 }
3341 
3342 // We need to specifically handle i64 mul here to avoid unnecessary conversion
3343 // instructions. If we only match on the legalized i64 mul expansion,
3344 // SimplifyDemandedBits will be unable to remove them because there will be
3345 // multiple uses due to the separate mul + mulh[su].
3346 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3347  SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3348  if (Size <= 32) {
3349  unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3350  return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3351  }
3352 
3353  // Because we want to eliminate extension instructions before the
3354  // operation, we need to create a single user here (i.e. not the separate
3355  // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
3356 
3357  unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
3358 
3359  SDValue Mul = DAG.getNode(MulOpc, SL,
3360  DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
3361 
3362  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
3363  Mul.getValue(0), Mul.getValue(1));
3364 }
3365 
3367  DAGCombinerInfo &DCI) const {
3368  EVT VT = N->getValueType(0);
3369 
3370  unsigned Size = VT.getSizeInBits();
3371  if (VT.isVector() || Size > 64)
3372  return SDValue();
3373 
3374  // There are i16 integer mul/mad.
3375  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3376  return SDValue();
3377 
3378  SelectionDAG &DAG = DCI.DAG;
3379  SDLoc DL(N);
3380 
3381  SDValue N0 = N->getOperand(0);
3382  SDValue N1 = N->getOperand(1);
3383 
3384  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3385  // in the source into any_extends if the result of the mul is truncated. Since
3386  // we can assume the high bits are whatever we want, use the underlying value
3387  // to avoid the unknown high bits from interfering.
3388  if (N0.getOpcode() == ISD::ANY_EXTEND)
3389  N0 = N0.getOperand(0);
3390 
3391  if (N1.getOpcode() == ISD::ANY_EXTEND)
3392  N1 = N1.getOperand(0);
3393 
3394  SDValue Mul;
3395 
3396  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3397  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3398  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3399  Mul = getMul24(DAG, DL, N0, N1, Size, false);
3400  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3401  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3402  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3403  Mul = getMul24(DAG, DL, N0, N1, Size, true);
3404  } else {
3405  return SDValue();
3406  }
3407 
3408  // We need to use sext even for MUL_U24, because MUL_U24 is used
3409  // for signed multiply of 8 and 16-bit types.
3410  return DAG.getSExtOrTrunc(Mul, DL, VT);
3411 }
3412 
3414  DAGCombinerInfo &DCI) const {
3415  EVT VT = N->getValueType(0);
3416 
3417  if (!Subtarget->hasMulI24() || VT.isVector())
3418  return SDValue();
3419 
3420  SelectionDAG &DAG = DCI.DAG;
3421  SDLoc DL(N);
3422 
3423  SDValue N0 = N->getOperand(0);
3424  SDValue N1 = N->getOperand(1);
3425 
3426  if (!isI24(N0, DAG) || !isI24(N1, DAG))
3427  return SDValue();
3428 
3429  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3430  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3431 
3432  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3433  DCI.AddToWorklist(Mulhi.getNode());
3434  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3435 }
3436 
3438  DAGCombinerInfo &DCI) const {
3439  EVT VT = N->getValueType(0);
3440 
3441  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3442  return SDValue();
3443 
3444  SelectionDAG &DAG = DCI.DAG;
3445  SDLoc DL(N);
3446 
3447  SDValue N0 = N->getOperand(0);
3448  SDValue N1 = N->getOperand(1);
3449 
3450  if (!isU24(N0, DAG) || !isU24(N1, DAG))
3451  return SDValue();
3452 
3453  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3454  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3455 
3456  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3457  DCI.AddToWorklist(Mulhi.getNode());
3458  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3459 }
3460 
3462  SDNode *N, DAGCombinerInfo &DCI) const {
3463  SelectionDAG &DAG = DCI.DAG;
3464 
3465  // Simplify demanded bits before splitting into multiple users.
3466  if (SDValue V = simplifyI24(N, DCI))
3467  return V;
3468 
3469  SDValue N0 = N->getOperand(0);
3470  SDValue N1 = N->getOperand(1);
3471 
3472  bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
3473 
3474  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3475  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3476 
3477  SDLoc SL(N);
3478 
3479  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3480  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3481  return DAG.getMergeValues({ MulLo, MulHi }, SL);
3482 }
3483 
3484 static bool isNegativeOne(SDValue Val) {
3485  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3486  return C->isAllOnesValue();
3487  return false;
3488 }
3489 
3490 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3491  SDValue Op,
3492  const SDLoc &DL,
3493  unsigned Opc) const {
3494  EVT VT = Op.getValueType();
3495  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3496  if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3497  LegalVT != MVT::i16))
3498  return SDValue();
3499 
3500  if (VT != MVT::i32)
3501  Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3502 
3503  SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3504  if (VT != MVT::i32)
3505  FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3506 
3507  return FFBX;
3508 }
3509 
3510 // The native instructions return -1 on 0 input. Optimize out a select that
3511 // produces -1 on 0.
3512 //
3513 // TODO: If zero is not undef, we could also do this if the output is compared
3514 // against the bitwidth.
3515 //
3516 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3518  SDValue LHS, SDValue RHS,
3519  DAGCombinerInfo &DCI) const {
3520  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3521  if (!CmpRhs || !CmpRhs->isNullValue())
3522  return SDValue();
3523 
3524  SelectionDAG &DAG = DCI.DAG;
3525  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3526  SDValue CmpLHS = Cond.getOperand(0);
3527 
3528  unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
3530 
3531  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3532  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3533  if (CCOpcode == ISD::SETEQ &&
3534  (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3535  RHS.getOperand(0) == CmpLHS &&
3536  isNegativeOne(LHS)) {
3537  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3538  }
3539 
3540  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3541  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3542  if (CCOpcode == ISD::SETNE &&
3543  (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3544  LHS.getOperand(0) == CmpLHS &&
3545  isNegativeOne(RHS)) {
3546  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3547  }
3548 
3549  return SDValue();
3550 }
3551 
3553  unsigned Op,
3554  const SDLoc &SL,
3555  SDValue Cond,
3556  SDValue N1,
3557  SDValue N2) {
3558  SelectionDAG &DAG = DCI.DAG;
3559  EVT VT = N1.getValueType();
3560 
3561  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3562  N1.getOperand(0), N2.getOperand(0));
3563  DCI.AddToWorklist(NewSelect.getNode());
3564  return DAG.getNode(Op, SL, VT, NewSelect);
3565 }
3566 
3567 // Pull a free FP operation out of a select so it may fold into uses.
3568 //
3569 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3570 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3571 //
3572 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3573 // select c, (fabs x), +k -> fabs (select c, x, k)
3575  SDValue N) {
3576  SelectionDAG &DAG = DCI.DAG;
3577  SDValue Cond = N.getOperand(0);
3578  SDValue LHS = N.getOperand(1);
3579  SDValue RHS = N.getOperand(2);
3580 
3581  EVT VT = N.getValueType();
3582  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3583  (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3584  return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3585  SDLoc(N), Cond, LHS, RHS);
3586  }
3587 
3588  bool Inv = false;
3589  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3590  std::swap(LHS, RHS);
3591  Inv = true;
3592  }
3593 
3594  // TODO: Support vector constants.
3596  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3597  SDLoc SL(N);
3598  // If one side is an fneg/fabs and the other is a constant, we can push the
3599  // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3600  SDValue NewLHS = LHS.getOperand(0);
3601  SDValue NewRHS = RHS;
3602 
3603  // Careful: if the neg can be folded up, don't try to pull it back down.
3604  bool ShouldFoldNeg = true;
3605 
3606  if (NewLHS.hasOneUse()) {
3607  unsigned Opc = NewLHS.getOpcode();
3608  if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3609  ShouldFoldNeg = false;
3610  if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3611  ShouldFoldNeg = false;
3612  }
3613 
3614  if (ShouldFoldNeg) {
3615  if (LHS.getOpcode() == ISD::FNEG)
3616  NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3617  else if (CRHS->isNegative())
3618  return SDValue();
3619 
3620  if (Inv)
3621  std::swap(NewLHS, NewRHS);
3622 
3623  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3624  Cond, NewLHS, NewRHS);
3625  DCI.AddToWorklist(NewSelect.getNode());
3626  return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3627  }
3628  }
3629 
3630  return SDValue();
3631 }
3632 
3633 
3635  DAGCombinerInfo &DCI) const {
3636  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3637  return Folded;
3638 
3639  SDValue Cond = N->getOperand(0);
3640  if (Cond.getOpcode() != ISD::SETCC)
3641  return SDValue();
3642 
3643  EVT VT = N->getValueType(0);
3644  SDValue LHS = Cond.getOperand(0);
3645  SDValue RHS = Cond.getOperand(1);
3646  SDValue CC = Cond.getOperand(2);
3647 
3648  SDValue True = N->getOperand(1);
3649  SDValue False = N->getOperand(2);
3650 
3651  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3652  SelectionDAG &DAG = DCI.DAG;
3653  if ((DAG.isConstantValueOfAnyType(True) ||
3654  DAG.isConstantValueOfAnyType(True)) &&
3655  (!DAG.isConstantValueOfAnyType(False) &&
3656  !DAG.isConstantValueOfAnyType(False))) {
3657  // Swap cmp + select pair to move constant to false input.
3658  // This will allow using VOPC cndmasks more often.
3659  // select (setcc x, y), k, x -> select (setcc y, x) x, x
3660 
3661  SDLoc SL(N);
3662  ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
3663  LHS.getValueType().isInteger());
3664 
3665  SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3666  return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3667  }
3668 
3669  if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3670  SDValue MinMax
3671  = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3672  // Revisit this node so we can catch min3/max3/med3 patterns.
3673  //DCI.AddToWorklist(MinMax.getNode());
3674  return MinMax;
3675  }
3676  }
3677 
3678  // There's no reason to not do this if the condition has other uses.
3679  return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3680 }
3681 
3682 static bool isInv2Pi(const APFloat &APF) {
3683  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3684  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3685  static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3686 
3687  return APF.bitwiseIsEqual(KF16) ||
3688  APF.bitwiseIsEqual(KF32) ||
3689  APF.bitwiseIsEqual(KF64);
3690 }
3691 
 3692 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
3693 // additional cost to negate them.
3695  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3696  if (C->isZero() && !C->isNegative())
3697  return true;
3698 
3699  if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3700  return true;
3701  }
3702 
3703  return false;
3704 }
3705 
3706 static unsigned inverseMinMax(unsigned Opc) {
3707  switch (Opc) {
3708  case ISD::FMAXNUM:
3709  return ISD::FMINNUM;
3710  case ISD::FMINNUM:
3711  return ISD::FMAXNUM;
3712  case ISD::FMAXNUM_IEEE:
3713  return ISD::FMINNUM_IEEE;
3714  case ISD::FMINNUM_IEEE:
3715  return ISD::FMAXNUM_IEEE;
3717  return AMDGPUISD::FMIN_LEGACY;
3719  return AMDGPUISD::FMAX_LEGACY;
3720  default:
3721  llvm_unreachable("invalid min/max opcode");
3722  }
3723 }
3724 
3726  DAGCombinerInfo &DCI) const {
3727  SelectionDAG &DAG = DCI.DAG;
3728  SDValue N0 = N->getOperand(0);
3729  EVT VT = N->getValueType(0);
3730 
3731  unsigned Opc = N0.getOpcode();
3732 
3733  // If the input has multiple uses and we can either fold the negate down, or
3734  // the other uses cannot, give up. This both prevents unprofitable
3735  // transformations and infinite loops: we won't repeatedly try to fold around
3736  // a negate that has no 'good' form.
3737  if (N0.hasOneUse()) {
3738  // This may be able to fold into the source, but at a code size cost. Don't
3739  // fold if the fold into the user is free.
3740  if (allUsesHaveSourceMods(N, 0))
3741  return SDValue();
3742  } else {
3743  if (fnegFoldsIntoOp(Opc) &&
3745  return SDValue();
3746  }
3747 
3748  SDLoc SL(N);
3749  switch (Opc) {
3750  case ISD::FADD: {
3751  if (!mayIgnoreSignedZero(N0))
3752  return SDValue();
3753 
3754  // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3755  SDValue LHS = N0.getOperand(0);
3756  SDValue RHS = N0.getOperand(1);
3757 
3758  if (LHS.getOpcode() != ISD::FNEG)
3759  LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3760  else
3761  LHS = LHS.getOperand(0);
3762 
3763  if (RHS.getOpcode() != ISD::FNEG)
3764  RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3765  else
3766  RHS = RHS.getOperand(0);
3767 
3768  SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3769  if (Res.getOpcode() != ISD::FADD)
3770  return SDValue(); // Op got folded away.
3771  if (!N0.hasOneUse())
3772  DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3773  return Res;
3774  }
3775  case ISD::FMUL:
3776  case AMDGPUISD::FMUL_LEGACY: {
3777  // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3778  // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3779  SDValue LHS = N0.getOperand(0);
3780  SDValue RHS = N0.getOperand(1);
3781 
3782  if (LHS.getOpcode() == ISD::FNEG)
3783  LHS = LHS.getOperand(0);
3784  else if (RHS.getOpcode() == ISD::FNEG)
3785  RHS = RHS.getOperand(0);
3786  else
3787  RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3788 
3789  SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3790  if (Res.getOpcode() != Opc)
3791  return SDValue(); // Op got folded away.
3792  if (!N0.hasOneUse())
3793  DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3794  return Res;
3795  }
3796  case ISD::FMA:
3797  case ISD::FMAD: {
3798  if (!mayIgnoreSignedZero(N0))
3799  return SDValue();
3800 
3801  // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3802  SDValue LHS = N0.getOperand(0);
3803  SDValue MHS = N0.getOperand(1);
3804  SDValue RHS = N0.getOperand(2);
3805 
3806  if (LHS.getOpcode() == ISD::FNEG)
3807  LHS = LHS.getOperand(0);
3808  else if (MHS.getOpcode() == ISD::FNEG)
3809  MHS = MHS.getOperand(0);
3810  else
3811  MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3812 
3813  if (RHS.getOpcode() != ISD::FNEG)
3814  RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3815  else
3816  RHS = RHS.getOperand(0);
3817 
3818  SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3819  if (Res.getOpcode() != Opc)
3820  return SDValue(); // Op got folded away.
3821  if (!N0.hasOneUse())
3822  DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3823  return Res;
3824  }
3825  case ISD::FMAXNUM:
3826  case ISD::FMINNUM:
3827  case ISD::FMAXNUM_IEEE:
3828  case ISD::FMINNUM_IEEE:
3830  case AMDGPUISD::FMIN_LEGACY: {
3831  // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3832  // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3833  // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3834  // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3835 
3836  SDValue LHS = N0.getOperand(0);
3837  SDValue RHS = N0.getOperand(1);
3838 
3839  // 0 doesn't have a negated inline immediate.
3840  // TODO: This constant check should be generalized to other operations.
3841  if (isConstantCostlierToNegate(RHS))
3842  return SDValue();
3843 
3844  SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3845  SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3846  unsigned Opposite = inverseMinMax(Opc);
3847 
3848  SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3849  if (Res.getOpcode() != Opposite)
3850  return SDValue(); // Op got folded away.
3851  if (!N0.hasOneUse())
3852  DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3853  return Res;
3854  }
3855  case AMDGPUISD::FMED3: {
3856  SDValue Ops[3];
3857  for (unsigned I = 0; I < 3; ++I)
3858  Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3859 
3860  SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3861  if (Res.getOpcode() != AMDGPUISD::FMED3)
3862  return SDValue(); // Op got folded away.
3863  if (!N0.hasOneUse())
3864  DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3865  return Res;
3866  }
3867  case ISD::FP_EXTEND:
3868  case ISD::FTRUNC:
3869  case ISD::FRINT:
3870  case ISD::FNEARBYINT: // XXX - Should fround be handled?
3871  case ISD::FSIN:
3872  case ISD::FCANONICALIZE:
3873  case AMDGPUISD::RCP:
3874  case AMDGPUISD::RCP_LEGACY:
3875  case AMDGPUISD::RCP_IFLAG:
3876  case AMDGPUISD::SIN_HW: {
3877  SDValue CvtSrc = N0.getOperand(0);
3878  if (CvtSrc.getOpcode() == ISD::FNEG) {
3879  // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3880  // (fneg (rcp (fneg x))) -> (rcp x)
3881  return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3882  }
3883 
3884  if (!N0.hasOneUse())
3885  return SDValue();
3886 
3887  // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3888  // (fneg (rcp x)) -> (rcp (fneg x))
3889  SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3890  return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3891  }
3892  case ISD::FP_ROUND: {
3893  SDValue CvtSrc = N0.getOperand(0);
3894 
3895  if (CvtSrc.getOpcode() == ISD::FNEG) {
3896  // (fneg (fp_round (fneg x))) -> (fp_round x)
3897  return DAG.getNode(ISD::FP_ROUND, SL, VT,
3898  CvtSrc.getOperand(0), N0.getOperand(1));
3899  }
3900 
3901  if (!N0.hasOneUse())
3902  return SDValue();
3903 
3904  // (fneg (fp_round x)) -> (fp_round (fneg x))
3905  SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3906  return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3907  }
3908  case ISD::FP16_TO_FP: {
3909  // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3910  // f16, but legalization of f16 fneg ends up pulling it out of the source.
3911  // Put the fneg back as a legal source operation that can be matched later.
3912  SDLoc SL(N);
3913 
3914  SDValue Src = N0.getOperand(0);
3915  EVT SrcVT = Src.getValueType();
3916 
3917  // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3918  SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3919  DAG.getConstant(0x8000, SL, SrcVT));
3920  return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3921  }
3922  default:
3923  return SDValue();
3924  }
3925 }
3926 
3928  DAGCombinerInfo &DCI) const {
3929  SelectionDAG &DAG = DCI.DAG;
3930  SDValue N0 = N->getOperand(0);
3931 
3932  if (!N0.hasOneUse())
3933  return SDValue();
3934 
3935  switch (N0.getOpcode()) {
3936  case ISD::FP16_TO_FP: {
3937  assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3938  SDLoc SL(N);
3939  SDValue Src = N0.getOperand(0);
3940  EVT SrcVT = Src.getValueType();
3941 
3942  // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3943  SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3944  DAG.getConstant(0x7fff, SL, SrcVT));
3945  return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3946  }
3947  default:
3948  return SDValue();
3949  }
3950 }
3951 
3953  DAGCombinerInfo &DCI) const {
3954  const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
3955  if (!CFP)
3956  return SDValue();
3957 
3958  // XXX - Should this flush denormals?
3959  const APFloat &Val = CFP->getValueAPF();
3960  APFloat One(Val.getSemantics(), "1.0");
3961  return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
3962 }
3963 
3965  DAGCombinerInfo &DCI) const {
3966  SelectionDAG &DAG = DCI.DAG;
3967  SDLoc DL(N);
3968 
3969  switch(N->getOpcode()) {
3970  default:
3971  break;
3972  case ISD::BITCAST: {
3973  EVT DestVT = N->getValueType(0);
3974 
3975  // Push casts through vector builds. This helps avoid emitting a large
3976  // number of copies when materializing floating point vector constants.
3977  //
3978  // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
3979  // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
3980  if (DestVT.isVector()) {
3981  SDValue Src = N->getOperand(0);
3982  if (Src.getOpcode() == ISD::BUILD_VECTOR) {
3983  EVT SrcVT = Src.getValueType();
3984  unsigned NElts = DestVT.getVectorNumElements();
3985 
3986  if (SrcVT.getVectorNumElements() == NElts) {
3987  EVT DestEltVT = DestVT.getVectorElementType();
3988 
3989  SmallVector<SDValue, 8> CastedElts;
3990  SDLoc SL(N);
3991  for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
3992  SDValue Elt = Src.getOperand(I);
3993  CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
3994  }
3995 
3996  return DAG.getBuildVector(DestVT, SL, CastedElts);
3997  }
3998  }
3999  }
4000 
4001  if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
4002  break;
4003 
4004  // Fold bitcasts of constants.
4005  //
4006  // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
4007  // TODO: Generalize and move to DAGCombiner
4008  SDValue Src = N->getOperand(0);
4009  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
4010  if (Src.getValueType() == MVT::i64) {
4011  SDLoc SL(N);
4012  uint64_t CVal = C->getZExtValue();
4014  DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4015  DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4016  return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
4017  }
4018  }
4019 
4020  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
4021  const APInt &Val = C->getValueAPF().bitcastToAPInt();
4022  SDLoc SL(N);
4023  uint64_t CVal = Val.getZExtValue();
4025  DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4026  DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4027 
4028  return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
4029  }
4030 
4031  break;
4032  }
4033  case ISD::SHL: {
4035  break;
4036 
4037  return performShlCombine(N, DCI);
4038  }
4039  case ISD::SRL: {
4041  break;
4042 
4043  return performSrlCombine(N, DCI);
4044  }
4045  case ISD::SRA: {
4047  break;
4048 
4049  return performSraCombine(N, DCI);
4050  }
4051  case ISD::TRUNCATE:
4052  return performTruncateCombine(N, DCI);
4053  case ISD::MUL:
4054  return performMulCombine(N, DCI);
4055  case ISD::MULHS:
4056  return performMulhsCombine(N, DCI);
4057  case ISD::MULHU:
4058  return performMulhuCombine(N, DCI);
4059  case AMDGPUISD::MUL_I24:
4060  case AMDGPUISD::MUL_U24:
4061  case AMDGPUISD::MULHI_I24:
4062  case AMDGPUISD::MULHI_U24: {
4063  if (SDValue V = simplifyI24(N, DCI))
4064  return V;
4065  return SDValue();
4066  }
4069  return performMulLoHi24Combine(N, DCI);
4070  case ISD::SELECT:
4071  return performSelectCombine(N, DCI);
4072  case ISD::FNEG:
4073  return performFNegCombine(N, DCI);
4074  case ISD::FABS:
4075  return performFAbsCombine(N, DCI);
4076  case AMDGPUISD::BFE_I32:
4077  case AMDGPUISD::BFE_U32: {
4078  assert(!N->getValueType(0).isVector() &&
4079  "Vector handling of BFE not implemented");
4081  if (!Width)
4082  break;
4083 
4084  uint32_t WidthVal = Width->getZExtValue() & 0x1f;
4085  if (WidthVal == 0)
4086  return DAG.getConstant(0, DL, MVT::i32);
4087 
4089  if (!Offset)
4090  break;
4091 
4092  SDValue BitsFrom = N->getOperand(0);
4093  uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4094 
4095  bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4096 
4097  if (OffsetVal == 0) {
4098  // This is already sign / zero extended, so try to fold away extra BFEs.
4099  unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4100 
4101  unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4102  if (OpSignBits >= SignBits)
4103  return BitsFrom;
4104 
4105  EVT SmallVT =