//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
  "amdgpu-bypass-slow-div",
  cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
  cl::init(true));

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
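  // For example, a value sign-extended from i24 has at most 24 significant
  // bits, so ComputeMaxSignificantBits reports <= 24 for it.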
  return DAG.ComputeMaxSignificantBits(Op);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  // (Elided in this listing: the long run of load/store setOperationAction,
  // setLoadExtAction, and AddPromotedToType calls implementing this.)

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes()) {
    // (Elided: EXTLOAD/SEXTLOAD/ZEXTLOAD-from-i64 Expand settings.)
  }

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    // (Elided: extload legality settings for i1/i8/i16/i32 source types.)
  }

  // (Elided: extload/truncstore settings for vector and FP types, and further
  // load/store actions.)

  // This is totally unsupported, just custom lower to produce an error.
  // (Elided: the Custom setting for ISD::DYNAMIC_STACKALLOC.)

  // Library functions. These default to Expand, but we have instructions
  // for them.
  // (Elided: the Legal settings for the f32 math library operations.)

  // Expand to fneg + fadd.
  // (Elided: the Expand setting for ISD::FSUB and nearby FP actions.)

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    // (Elided: SDIV/UDIV/SREM/UREM Expand settings.)

    // GPU does not have divrem function for signed or unsigned.
    // (Elided: SDIVREM/UDIVREM Custom settings.)

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    // (Elided: SMUL_LOHI/UMUL_LOHI Expand settings.)

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    // (Elided: the corresponding Legal settings.)
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  // (Elided: the FSHR Legal setting.)

  // The hardware supports 32-bit ROTR, but not ROTL.
  // (Elided: ROTL/ROTR settings and further scalar integer actions.)

  static const MVT::SimpleValueType VectorIntTypes[] = {
      /* (elided: the 32-bit integer vector types) */};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    // (Elided: the long list of Expand settings.)
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      /* (elided: the 32-bit FP vector types) */};

  for (MVT VT : FloatVectorTypes) {
    // (Elided: the corresponding Expand settings.)
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
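  // For instance, a select of v2f32 unrolls into two 32-bit selects (each a
  // v_cndmask_b32) rather than and/or/xor bit arithmetic on the packed value.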

  // (Elided: the SELECT Promote settings for the vector types above.)

  // There are no libcalls of any kind.
  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
    setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);

  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For now,
  // we don't have a way of knowing during instruction selection if a condition
  // will be uniform and we always use vector compares. Assume we are using
  // vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  // (Elided: a few scheduling and legality settings.)

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  // (Elided: the GatherAllAliasesMaxDepth setting.)

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

  // (Elided: the list of setTargetDAGCombine calls.)
}

bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOp(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  default:
    return false;
  }
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READNONE
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return N->getNumOperands() > 2 || VT == MVT::f64;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READNONE
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::SELECT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, and using a modifier for each one would necessitate a
  // VOP3 encoding, there will be a code size increase. Try to avoid increasing
  // code size unless we know it will save on the instruction count.
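  // E.g. folding an fneg into a lone v_add_f32 moves it from the 32-bit VOP2
  // encoding to the 64-bit VOP3 encoding; with several such users the code
  // size growth can outweigh saving the fneg instruction itself.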
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32-bits.
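  // E.g. an i8 or i24 return widens to i32, and an i33 return widens to i64.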
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) &&
        AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  bool Fast = false;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
// FIXME: These should really have the size as a parameter.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    switch (IntrID) {
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
      return true;
    }
    return false;
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  }
  return false;
}

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && VT == MVT::f16);
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.
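  // E.g. a zext of i32 to i64 becomes the original 32-bit value plus a move
  // of 0 into the high 32-bit half of the register pair.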

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  return isZExtFree(Val.getValueType(), VT2);
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments()

/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State, const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(getTargetMachine(), Fn);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getParent()->getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(MemArgTy);
    MaxAlign = max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting over
    // to get accurate in-memory offsets. The "PartOffset" is completely useless
    // to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different element size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
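      // E.g. a v3i32 part is rounded up to v4i32 and a v5i32 part to v8i32.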
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        assert(MemVT.getVectorNumElements() == 3 ||
               MemVT.getVectorNumElements() == 5);
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  // "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument.
  for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG:
    return LowerFLOG(Op, DAG, numbers::ln2f);
  case ISD::FLOG10:
    return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
  case ISD::FEXP:
    return lowerFEXP(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  default:
    return;
  }
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning);
      DAG.getContext()->diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                        Trap, DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getUNDEF(Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
  }
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;

  EVT VT = Op.getValueType();
  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
    SDLoc SL(Op);
    SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
    SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));

    SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
    return DAG.getNode(ISD::BITCAST, SL, VT, BV);
  }

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {

  SmallVector<SDValue, 8> Args;
  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  EVT VT = Op.getValueType();
  EVT SrcVT = Op.getOperand(0).getValueType();

  // For these types, we have some TableGen patterns except if the index is 1
  if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
       (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
      Start != 1)
    return Op;

  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}

/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  return std::make_pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
}

SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}

// Split a vector type into two parts. The first part is a power of two vector.
// The second part is whatever is left over, and is a scalar if it would
// otherwise be a 1-vector.
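// E.g. v3i32 splits into v2i32 + i32, and v5f32 splits into v4f32 + f32.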
std::pair<EVT, EVT>
AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
  EVT LoVT, HiVT;
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
  HiVT = NumElts - LoNumElts == 1
             ? EltVT
             : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
  return std::make_pair(LoVT, HiVT);
}

// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                  const EVT &LoVT, const EVT &HiVT,
                                  SelectionDAG &DAG) const {
  assert(LoVT.getVectorNumElements() +
                 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
             N.getValueType().getVectorNumElements() &&
         "More vector elements requested than available!");
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
                           DAG.getVectorIdxConstant(0, DL));
  SDValue Hi = DAG.getNode(
      HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
      HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
  return std::make_pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
    return DAG.getMergeValues(Ops, SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);

  unsigned Size = LoMemVT.getStoreSize();
  unsigned BaseAlign = Load->getAlignment();
  unsigned HiAlign = MinAlign(BaseAlign, Size);

  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
                       DAG.getVectorIdxConstant(0, SL));
    Join = DAG.getNode(
        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
        VT, Join, HiLoad,
        DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
  }

  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                     LoLoad.getValue(1), HiLoad.getValue(1))};

  return DAG.getMergeValues(Ops, SL);
}

SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  unsigned BaseAlign = Load->getAlignment();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
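  // E.g. an 8-byte-aligned v3i32 load is performed as v4i32 and the extra
  // lane is dropped with the EXTRACT_SUBVECTOR below.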
  if (NumElements != 3 ||
      (BaseAlign < 8 &&
       !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

  EVT WideVT =
      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
  EVT WideMemVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
  SDValue WideLoad = DAG.getExtLoad(
      Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
      WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
  return DAG.getMergeValues(
      {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
                   DAG.getVectorIdxConstant(0, SL)),
       WideLoad.getValue(1)},
      SL);
}

SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(Store, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);

  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());

  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
  unsigned BaseAlign = Store->getAlignment();
  unsigned Size = LoMemVT.getStoreSize();
  unsigned HiAlign = MinAlign(BaseAlign, Size);

  SDValue LoStore =
      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
                        Store->getMemOperand()->getFlags());
  SDValue HiStore =
      DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}

// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit signed integer.
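// E.g. 100/7: fq = trunc(100.0f * rcp(7.0f)) = 14.0f, fr = 100 - 14*7 = 2;
// since |fr| < |fb| the quotient stays 14, and the remainder 2 is then
// recomputed exactly in integers below.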
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                            bool Sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
  if (LHSSignBits < 9)
    return SDValue();

  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
  if (RHSSignBits < 9)
    return SDValue();

  unsigned BitSize = VT.getSizeInBits();
  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  if (Sign)
    ++DivBits;

  unsigned ToFp = Sign ? (unsigned)ISD::SINT_TO_FP : (unsigned)ISD::UINT_TO_FP;
  unsigned ToInt = Sign ? (unsigned)ISD::FP_TO_SINT : (unsigned)ISD::FP_TO_UINT;

  SDValue jq = DAG.getConstant(1, DL, IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
                     DAG.getConstant(BitSize - 2, DL, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib, (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);

  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);

  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  // float fr = mad(fqneg, fb, fa);
  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
                    (unsigned)ISD::FMA :
                    !MFI->getMode().allFP32Denormals() ?
                    (unsigned)ISD::FMAD :
                    (unsigned)AMDGPUISD::FMAD_FTZ;
  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);

  // Rem needs compensation, it's easier to recompute it
  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);

  // Truncate to number of bits this divide really is.
  if (Sign) {
    SDValue InRegSize
      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
  } else {
    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
  }

  return DAG.getMergeValues({ Div, Rem }, DL);
}

void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
                                          SelectionDAG &DAG,
                                          SmallVectorImpl<SDValue> &Results) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");

  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

  SDValue One = DAG.getConstant(1, DL, HalfVT);
  SDValue Zero = DAG.getConstant(0, DL, HalfVT);

  // HiLo split
  SDValue LHS = Op.getOperand(0);
  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);

  SDValue RHS = Op.getOperand(1);
  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);

  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {

    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                              LHS_Lo, RHS_Lo);

    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});

    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
    return;
  }

  if (isTypeLegal(MVT::i64)) {
    // The algorithm here is based on ideas from "Software Integer Division",
    // Tom Rodeheffer, August 2008.
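    // Each Newton-Raphson round below refines an estimate x of 2^64/RHS via
    // x' = x + x*(1 - RHS*x/2^64), roughly doubling the number of correct
    // bits per round, so two rounds suffice from the f32 reciprocal seed.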

    MachineFunction &MF = DAG.getMachineFunction();
    const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

    // Compute denominator reciprocal.
    unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
                    (unsigned)ISD::FMA :
                    !MFI->getMode().allFP32Denormals() ?
                    (unsigned)ISD::FMAD :
                    (unsigned)AMDGPUISD::FMAD_FTZ;

    SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
    SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
    SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
      DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
      Cvt_Lo);
    SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
    SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
      DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
    SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
      DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
    SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
    SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
      DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
      Mul1);
    SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
    SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
    SDValue Rcp64 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));

    SDValue Zero64 = DAG.getConstant(0, DL, VT);
    SDValue One64 = DAG.getConstant(1, DL, VT);
    SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
    SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);

    // First round of UNR (Unsigned integer Newton-Raphson).
    SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
    SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
    SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
    SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
                                    Zero);
    SDValue Mulhi1_Hi =
        DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One);
    SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
                                  Mulhi1_Lo, Zero1);
    SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
                                  Mulhi1_Hi, Add1_Lo.getValue(1));
    SDValue Add1 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));

    // Second round of UNR.
    SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
    SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
    SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
                                    Zero);
    SDValue Mulhi2_Hi =
        DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One);
    SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
                                  Mulhi2_Lo, Zero1);
    SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi,
                                  Mulhi2_Hi, Add2_Lo.getValue(1));
    SDValue Add2 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));

    SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);

    SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);

    SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
    SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
    SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
                                  Mul3_Lo, Zero1);
    SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
                                  Mul3_Hi, Sub1_Lo.getValue(1));
    SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
    SDValue Sub1 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));

    SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
    SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);

    // TODO: Here and below portions of the code can be enclosed into if/endif.
    // Currently control flow is unconditional and we have 4 selects after
    // potential endif to substitute PHIs.

    // if C3 != 0 ...
    SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
                                  RHS_Hi, Sub1_Lo.getValue(1));
    SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
                                  Zero, Sub2_Lo.getValue(1));
    SDValue Sub2 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));

    SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);

    SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);

    // if (C6 != 0)
    SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);

    SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
                                  RHS_Hi, Sub2_Lo.getValue(1));
    SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
                                  Zero, Sub3_Lo.getValue(1));
    SDValue Sub3 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));

    // endif C6
    // endif C3

    SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
    SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);

    SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
    SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);

    Results.push_back(Div);
    Results.push_back(Rem);

    return;
  }

  // r600 expansion.
  // Get Speculative values
  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);

  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
  SDValue DIV_Lo = Zero;

  const unsigned halfBitWidth = HalfVT.getSizeInBits();

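  // What follows is a classic restoring long division: each iteration shifts
  // one bit of LHS_Lo into the remainder and produces one quotient bit.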
  for (unsigned i = 0; i < halfBitWidth; ++i) {
    const unsigned bitPos = halfBitWidth - i - 1;
    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
    // Get value of high bit
    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);

    // Shift
    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
    // Add LHS high bit
    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);

    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);

    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

    // Update REM
    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
  }

  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
  Results.push_back(DIV);
  Results.push_back(REM);
}

SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::i64) {
    SmallVector<SDValue, 2> Results;
    LowerUDIVREM64(Op, DAG, Results);
    return DAG.getMergeValues(Results, DL);
  }

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, false))
      return Res;
  }

  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);

  // One round of UNR.
  SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
  SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
  Z = DAG.getNode(ISD::ADD, DL, VT, Z,
                  DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));

  // Quotient/remainder estimate.
  SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
  SDValue R =
      DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));

  // First quotient/remainder refinement.
  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue One = DAG.getConstant(1, DL, VT);
  SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);

  return DAG.getMergeValues({Q, R}, DL);
}
2075 
2077  SelectionDAG &DAG) const {
2078  SDLoc DL(Op);
2079  EVT VT = Op.getValueType();
2080 
2081  SDValue LHS = Op.getOperand(0);
2082  SDValue RHS = Op.getOperand(1);
2083 
2084  SDValue Zero = DAG.getConstant(0, DL, VT);
2085  SDValue NegOne = DAG.getConstant(-1, DL, VT);
2086 
2087  if (VT == MVT::i32) {
2088  if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2089  return Res;
2090  }
2091 
2092  if (VT == MVT::i64 &&
2093  DAG.ComputeNumSignBits(LHS) > 32 &&
2094  DAG.ComputeNumSignBits(RHS) > 32) {
2095  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2096 
2097  // Hi/Lo split.
2098  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2099  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2100  SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2101  LHS_Lo, RHS_Lo);
2102  SDValue Res[2] = {
2103  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2104  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2105  };
2106  return DAG.getMergeValues(Res, DL);
2107  }
2108 
2109  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2110  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2111  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2112  SDValue RSign = LHSign; // Remainder sign is the same as LHS
2113 
2114  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2115  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2116 
2117  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2118  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2119 
2120  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2121  SDValue Rem = Div.getValue(1);
2122 
2123  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2124  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2125 
2126  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2127  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2128 
2129  SDValue Res[2] = {
2130  Div,
2131  Rem
2132  };
2133  return DAG.getMergeValues(Res, DL);
2134 }
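// Illustrative sketch (not part of this file): the sign handling above in
// plain C++ at i32, assuming <cstdint>. abs(x) == (x + s) ^ s and
// -x == (x ^ s) - s when s is the all-ones sign mask of x.
static void sdivrem32Sketch(int32_t LHS, int32_t RHS,
                            int32_t &Div, int32_t &Rem) {
  uint32_t LHSign = (uint32_t)(LHS >> 31); // all ones if negative, else 0
  uint32_t RHSign = (uint32_t)(RHS >> 31);
  uint32_t DSign = LHSign ^ RHSign;        // quotient sign
  uint32_t UL = ((uint32_t)LHS + LHSign) ^ LHSign; // |LHS|
  uint32_t UR = ((uint32_t)RHS + RHSign) ^ RHSign; // |RHS|
  uint32_t UQ = UL / UR;
  uint32_t URem = UL % UR;
  Div = (int32_t)((UQ ^ DSign) - DSign);     // re-apply the quotient sign
  Rem = (int32_t)((URem ^ LHSign) - LHSign); // remainder follows LHS
}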
2135 
2136 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2137 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2138  SDLoc SL(Op);
2139  EVT VT = Op.getValueType();
2140  auto Flags = Op->getFlags();
2141  SDValue X = Op.getOperand(0);
2142  SDValue Y = Op.getOperand(1);
2143 
2144  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2145  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2146  SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2147  // TODO: For f32 use FMAD instead if !hasFastFMA32?
2148  return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2149 }
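// Illustrative sketch (not part of this file): the frem expansion above,
// assuming std::fma and std::trunc from <cmath>. The fused multiply-add
// keeps x - trunc(x/y) * y to a single rounding.
static double fremSketch(double X, double Y) {
  return std::fma(-std::trunc(X / Y), Y, X);
}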
2150 
2151 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2152  SDLoc SL(Op);
2153  SDValue Src = Op.getOperand(0);
2154 
2155  // result = trunc(src)
2156  // if (src > 0.0 && src != result)
2157  // result += 1.0
2158 
2159  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2160 
2161  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2162  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2163 
2164  EVT SetCCVT =
2165      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2166 
2167  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2168  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2169  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2170 
2171  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2172  // TODO: Should this propagate fast-math-flags?
2173  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2174 }
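// Illustrative sketch (not part of this file): the select-based ceil above,
// written out with std::trunc from <cmath>.
static double fceilSketch(double Src) {
  double Result = std::trunc(Src);
  if (Src > 0.0 && Src != Result) // only round up when truncation moved down
    Result += 1.0;
  return Result;
}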
2175 
2176 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2177  SelectionDAG &DAG) {
2178  const unsigned FractBits = 52;
2179  const unsigned ExpBits = 11;
2180 
2181  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2182  Hi,
2183  DAG.getConstant(FractBits - 32, SL, MVT::i32),
2184  DAG.getConstant(ExpBits, SL, MVT::i32));
2185  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2186  DAG.getConstant(1023, SL, MVT::i32));
2187 
2188  return Exp;
2189 }
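// Illustrative sketch (not part of this file): the BFE above selects the
// 11 exponent bits [52..62] of the f64 and removes the IEEE-754 bias,
// assuming <cstdint>.
static int32_t extractF64ExponentSketch(uint64_t Bits) {
  return (int32_t)((Bits >> 52) & 0x7ff) - 1023;
}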
2190 
2191 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2192  SDLoc SL(Op);
2193  SDValue Src = Op.getOperand(0);
2194 
2195  assert(Op.getValueType() == MVT::f64);
2196 
2197  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2198 
2199  // Extract the upper half, since this is where we will find the sign and
2200  // exponent.
2201  SDValue Hi = getHiHalf64(Src, DAG);
2202 
2203  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2204 
2205  const unsigned FractBits = 52;
2206 
2207  // Extract the sign bit.
2208  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2209  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2210 
2211  // Extend back to 64-bits.
2212  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2213  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2214 
2215  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2216  const SDValue FractMask
2217  = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2218 
2219  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2220  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2221  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2222 
2223  EVT SetCCVT =
2224      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2225 
2226  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2227 
2228  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2229  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2230 
2231  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2232  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2233 
2234  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2235 }
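// Illustrative sketch (not part of this file): the mask-based trunc above in
// plain C++, assuming <cstdint> and std::memcpy from <cstring> for the
// bitcasts.
static double ftruncSketch(double Src) {
  uint64_t Bits;
  std::memcpy(&Bits, &Src, sizeof(Bits));
  int32_t Exp = (int32_t)((Bits >> 52) & 0x7ff) - 1023;
  if (Exp < 0) {
    Bits &= UINT64_C(1) << 63; // |Src| < 1.0: keep only the sign (+/-0.0)
  } else if (Exp <= 51) {
    uint64_t FractMask = (UINT64_C(1) << 52) - 1;
    Bits &= ~(FractMask >> Exp); // clear the fractional mantissa bits
  } // Exp > 51 (including inf/nan): already integral, leave as is
  double Out;
  std::memcpy(&Out, &Bits, sizeof(Out));
  return Out;
}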
2236 
2237 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2238  SDLoc SL(Op);
2239  SDValue Src = Op.getOperand(0);
2240 
2241  assert(Op.getValueType() == MVT::f64);
2242 
2243  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2244  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2245  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2246 
2247  // TODO: Should this propagate fast-math-flags?
2248 
2249  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2250  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2251 
2252  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2253 
2254  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2255  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2256 
2257  EVT SetCCVT =
2258      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2259  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2260 
2261  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2262 }
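// Illustrative sketch (not part of this file): the 2^52 trick above. Adding
// and subtracting 2^52 (with Src's sign) leaves no room for fraction bits,
// so the intermediate add rounds to an integer in the current mode. Assumes
// default FP semantics; fast-math may fold the add/sub pair away.
static double frintSketch(double Src) {
  double C = std::copysign(0x1.0p+52, Src);
  double Rounded = (Src + C) - C;
  // Magnitudes above 0x1.fffffffffffffp+51 are already integral.
  return std::fabs(Src) > 0x1.fffffffffffffp+51 ? Src : Rounded;
}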
2263 
2264 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2265  // FNEARBYINT and FRINT are the same, except in their handling of FP
2266  // exceptions. Those aren't really meaningful for us, and OpenCL only has
2267  // rint, so just treat them as equivalent.
2268  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2269 }
2270 
2271 // XXX - May require not supporting f32 denormals?
2272 
2273 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2274 // compare and vselect end up producing worse code than scalarizing the whole
2275 // operation.
2276 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2277  SDLoc SL(Op);
2278  SDValue X = Op.getOperand(0);
2279  EVT VT = Op.getValueType();
2280 
2281  SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2282 
2283  // TODO: Should this propagate fast-math-flags?
2284 
2285  SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2286 
2287  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2288 
2289  const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2290  const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2291  const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2292 
2293  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2294 
2295  EVT SetCCVT =
2296  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2297 
2298  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2299 
2300  SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2301 
2302  return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2303 }
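// Illustrative sketch (not part of this file): round-half-away-from-zero via
// trunc plus a copysign'd step, as the select above computes; assumes <cmath>.
static double froundSketch(double X) {
  double T = std::trunc(X);
  if (std::fabs(X - T) >= 0.5) // halfway and beyond: step away from zero
    T += std::copysign(1.0, X);
  return T;
}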
2304 
2305 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2306  SDLoc SL(Op);
2307  SDValue Src = Op.getOperand(0);
2308 
2309  // result = trunc(src);
2310  // if (src < 0.0 && src != result)
2311  // result += -1.0.
2312 
2313  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2314 
2315  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2316  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2317 
2318  EVT SetCCVT =
2319      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2320 
2321  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2322  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2323  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2324 
2325  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2326  // TODO: Should this propagate fast-math-flags?
2327  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2328 }
2329 
2330 SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2331  double Log2BaseInverted) const {
2332  EVT VT = Op.getValueType();
2333 
2334  SDLoc SL(Op);
2335  SDValue Operand = Op.getOperand(0);
2336  SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2337  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2338 
2339  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2340 }
2341 
2342 // exp2(M_LOG2E_F * f);
2343 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2344  EVT VT = Op.getValueType();
2345  SDLoc SL(Op);
2346  SDValue Src = Op.getOperand(0);
2347 
2348  const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2349  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2350  return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2351 }
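// Illustrative sketch (not part of this file): both lowerings reduce to the
// hardware's base-2 primitives, using log_b(x) = log2(x) * (1 / log2(b)) and
// e^x = exp2(x * log2(e)); assumes <cmath>.
static float flogSketch(float X, float Log2BaseInverted) {
  return std::log2(X) * Log2BaseInverted; // e.g. 1/log2(e) for ln
}
static float fexpSketch(float X) {
  return std::exp2(X * 1.4426950408889634f); // numbers::log2e
}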
2352 
2353 static bool isCtlzOpc(unsigned Opc) {
2354  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2355 }
2356 
2357 static bool isCttzOpc(unsigned Opc) {
2358  return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2359 }
2360 
2361 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2362  SDLoc SL(Op);
2363  SDValue Src = Op.getOperand(0);
2364 
2365  assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2366  bool Ctlz = isCtlzOpc(Op.getOpcode());
2367  unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2368 
2369  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2370  Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2371 
2372  if (Src.getValueType() == MVT::i32) {
2373  // (ctlz hi:lo) -> (umin (ffbh src), 32)
2374  // (cttz hi:lo) -> (umin (ffbl src), 32)
2375  // (ctlz_zero_undef src) -> (ffbh src)
2376  // (cttz_zero_undef src) -> (ffbl src)
2377  SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2378  if (!ZeroUndef) {
2379  const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2380  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2381  }
2382  return NewOpr;
2383  }
2384 
2385  SDValue Lo, Hi;
2386  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2387 
2388  SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2389  SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2390 
2391  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2392  // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2393  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2394  // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2395 
2396  unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2397  const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2398  if (Ctlz)
2399  OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2400  else
2401  OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2402 
2403  SDValue NewOpr;
2404  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2405  if (!ZeroUndef) {
2406  const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2407  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2408  }
2409 
2410  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2411 }
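// Illustrative sketch (not part of this file): the 64-bit ctlz path above in
// plain C++, with __builtin_clz (a GCC/Clang builtin) standing in for
// FFBH_U32; like the hardware, the stand-in yields all ones for a zero input.
static uint32_t ctlz64Sketch(uint64_t Src) {
  uint32_t Lo = (uint32_t)Src;
  uint32_t Hi = (uint32_t)(Src >> 32);
  uint32_t CntHi = Hi ? (uint32_t)__builtin_clz(Hi) : UINT32_MAX;
  uint32_t CntLo = Lo ? (uint32_t)__builtin_clz(Lo) : UINT32_MAX;
  // uaddsat(ffbh(lo), 32): saturate so a zero low word stays "not found".
  uint32_t LoPlus32 = CntLo > UINT32_MAX - 32 ? UINT32_MAX : CntLo + 32;
  uint32_t Min = LoPlus32 < CntHi ? LoPlus32 : CntHi;
  return Min < 64 ? Min : 64; // final umin with 64 for Src == 0
}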
2412 
2413 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2414  bool Signed) const {
2415  // The regular method of converting a 64-bit integer to float roughly consists of
2416  // 2 steps: normalization and rounding. In fact, after normalization, the
2417  // conversion from a 64-bit integer to a float is essentially the same as the
2418  // one from a 32-bit integer. The only difference is that it has more
2419  // trailing bits to be rounded. To leverage the native 32-bit conversion, a
2420  // 64-bit integer could be preprocessed and fit into a 32-bit integer then
2421  // converted into the correct float number. The basic steps for the unsigned
2422  // conversion are illustrated in the following pseudo code:
2423  //
2424  // f32 uitofp(i64 u) {
2425  // i32 hi, lo = split(u);
2426  // // Only count the leading zeros in hi as we have native support of the
2427  // // conversion from i32 to f32. If hi is all 0s, the conversion is
2428  // // reduced to a 32-bit one automatically.
2429  // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
2430  // u <<= shamt;
2431  // hi, lo = split(u);
2432  // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
2433  // // convert it as a 32-bit integer and scale the result back.
2434  // return uitofp(hi) * 2^(32 - shamt);
2435  // }
2436  //
2437  // The signed one follows the same principle but uses 'ffbh_i32' to count its
2438  // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
2439  // converted instead, followed by negation based on its sign bit.
2440 
2441  SDLoc SL(Op);
2442  SDValue Src = Op.getOperand(0);
2443 
2444  SDValue Lo, Hi;
2445  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2446  SDValue Sign;
2447  SDValue ShAmt;
2448  if (Signed && Subtarget->isGCN()) {
2449  // We also need to consider the sign bit in Lo if Hi has just sign bits,
2450  // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
2451  // account. That is, the maximal shift is
2452  // - 32 if Lo and Hi have opposite signs;
2453  // - 33 if Lo and Hi have the same sign.
2454  //
2455  // Or, MaxShAmt = 33 + OppositeSign, where
2456  //
2457  // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
2458  // - -1 if Lo and Hi have opposite signs; and
2459  // - 0 otherwise.
2460  //
2461  // All in all, ShAmt is calculated as
2462  //
2463  // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
2464  //
2465  // or
2466  //
2467  // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
2468  //
2469  // to reduce the critical path.
2470  SDValue OppositeSign = DAG.getNode(
2471  ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
2472  DAG.getConstant(31, SL, MVT::i32));
2473  SDValue MaxShAmt =
2474  DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2475  OppositeSign);
2476  // Count the leading sign bits.
2477  ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
2478  // Different from unsigned conversion, the shift should be one bit less to
2479  // preserve the sign bit.
2480  ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
2481  DAG.getConstant(1, SL, MVT::i32));
2482  ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
2483  } else {
2484  if (Signed) {
2485  // Without 'ffbh_i32', only leading zeros could be counted. Take the
2486  // absolute value first.
2487  Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
2488  DAG.getConstant(63, SL, MVT::i64));
2489  SDValue Abs =
2490  DAG.getNode(ISD::XOR, SL, MVT::i64,
2491  DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
2492  std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
2493  }
2494  // Count the leading zeros.
2495  ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
2496  // The shift amount for signed integers is [0, 32].
2497  }
2498  // Normalize the given 64-bit integer.
2499  SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
2500  // Split it again.
2501  std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
2502  // Calculate the adjust bit for rounding.
2503  // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
2504  SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
2505  DAG.getConstant(1, SL, MVT::i32), Lo);
2506  // Get the 32-bit normalized integer.
2507  Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
2508  // Convert the normalized 32-bit integer into f32.
2509  unsigned Opc =
2510  (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2511  SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
2512 
2513  // Finally, need to scale back the converted floating number as the original
2514  // 64-bit integer is converted as a 32-bit one.
2515  ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2516  ShAmt);
2517  // On GCN, use LDEXP directly.
2518  if (Subtarget->isGCN())
2519  return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
2520 
2521  // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
2522  // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
2523  // exponent is enough to avoid overflowing into the sign bit.
2524  SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
2525  DAG.getConstant(23, SL, MVT::i32));
2526  SDValue IVal =
2527  DAG.getNode(ISD::ADD, SL, MVT::i32,
2528  DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
2529  if (Signed) {
2530  // Set the sign bit.
2531  Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
2532  DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
2533  DAG.getConstant(31, SL, MVT::i32));
2534  IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
2535  }
2536  return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
2537 }
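// Illustrative sketch (not part of this file): a runnable version of the
// unsigned pseudo code above, assuming <cstdint>, std::ldexp from <cmath>,
// and __builtin_clz (a GCC/Clang builtin).
static float uitofp64Sketch(uint64_t U) {
  uint32_t Hi = (uint32_t)(U >> 32);
  uint32_t ShAmt = Hi ? (uint32_t)__builtin_clz(Hi) : 32; // clz of high word
  U <<= ShAmt; // normalize
  uint32_t NormHi = (uint32_t)(U >> 32);
  NormHi |= ((uint32_t)U != 0) ? 1 : 0; // sticky bit for correct rounding
  // Convert natively as 32-bit and scale the result back.
  return std::ldexp((float)NormHi, 32 - (int)ShAmt);
}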
2538 
2539 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2540  bool Signed) const {
2541  SDLoc SL(Op);
2542  SDValue Src = Op.getOperand(0);
2543 
2544  SDValue Lo, Hi;
2545  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2546 
2547  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2548  SL, MVT::f64, Hi);
2549 
2550  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2551 
2552  SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2553  DAG.getConstant(32, SL, MVT::i32));
2554  // TODO: Should this propagate fast-math-flags?
2555  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2556 }
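// Illustrative sketch (not part of this file): hi * 2^32 + lo with exactly
// one rounding, in the final add; assumes <cstdint> and std::ldexp.
static double itofp64Sketch(uint64_t Src, bool Signed) {
  uint32_t Lo = (uint32_t)Src;         // always treated as unsigned
  uint32_t Hi = (uint32_t)(Src >> 32); // carries the sign when Signed
  double CvtHi = Signed ? (double)(int32_t)Hi : (double)Hi;
  return std::ldexp(CvtHi, 32) + (double)Lo;
}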
2557 
2558 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2559  SelectionDAG &DAG) const {
2560  // TODO: Factor out code common with LowerSINT_TO_FP.
2561  EVT DestVT = Op.getValueType();
2562  SDValue Src = Op.getOperand(0);
2563  EVT SrcVT = Src.getValueType();
2564 
2565  if (SrcVT == MVT::i16) {
2566  if (DestVT == MVT::f16)
2567  return Op;
2568  SDLoc DL(Op);
2569 
2570  // Promote src to i32
2572  return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2573  }
2574 
2575  assert(SrcVT == MVT::i64 && "operation should be legal");
2576 
2577  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2578  SDLoc DL(Op);
2579 
2580  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2581  SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2582  SDValue FPRound =
2583  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2584 
2585  return FPRound;
2586  }
2587 
2588  if (DestVT == MVT::f32)
2589  return LowerINT_TO_FP32(Op, DAG, false);
2590 
2591  assert(DestVT == MVT::f64);
2592  return LowerINT_TO_FP64(Op, DAG, false);
2593 }
2594 
2595 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2596  SelectionDAG &DAG) const {
2597  EVT DestVT = Op.getValueType();
2598 
2599  SDValue Src = Op.getOperand(0);
2600  EVT SrcVT = Src.getValueType();
2601 
2602  if (SrcVT == MVT::i16) {
2603  if (DestVT == MVT::f16)
2604  return Op;
2605 
2606  SDLoc DL(Op);
2607  // Promote src to i32
2608  SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2609  return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2610  }
2611 
2612  assert(SrcVT == MVT::i64 && "operation should be legal");
2613 
2614  // TODO: Factor out code common with LowerUINT_TO_FP.
2615 
2616  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2617  SDLoc DL(Op);
2618  SDValue Src = Op.getOperand(0);
2619 
2620  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2621  SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2622  SDValue FPRound =
2623  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2624 
2625  return FPRound;
2626  }
2627 
2628  if (DestVT == MVT::f32)
2629  return LowerINT_TO_FP32(Op, DAG, true);
2630 
2631  assert(DestVT == MVT::f64);
2632  return LowerINT_TO_FP64(Op, DAG, true);
2633 }
2634 
2635 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2636  bool Signed) const {
2637  SDLoc SL(Op);
2638 
2639  SDValue Src = Op.getOperand(0);
2640  EVT SrcVT = Src.getValueType();
2641 
2642  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2643 
2644  // The basic idea of converting a floating point number into a pair of 32-bit
2645  // integers is illustrated as follows:
2646  //
2647  // tf := trunc(val);
2648  // hif := floor(tf * 2^-32);
2649  // lof := tf - hif * 2^32; // lof is always positive due to floor.
2650  // hi := fptoi(hif);
2651  // lo := fptoi(lof);
2652  //
2653  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2654  SDValue Sign;
2655  if (Signed && SrcVT == MVT::f32) {
2656  // However, a 32-bit floating point number has only a 23-bit mantissa and
2657  // that is not enough to hold all the significant bits of `lof` if val is
2658  // negative. To avoid the loss of precision, we need to take the absolute
2659  // value after truncating and flip the result back based on the original
2660  // signedness.
2661  Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2662  DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2663  DAG.getConstant(31, SL, MVT::i32));
2664  Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2665  }
2666 
2667  SDValue K0, K1;
2668  if (SrcVT == MVT::f64) {
2669  K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
2670  SL, SrcVT);
2671  K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
2672  SL, SrcVT);
2673  } else {
2674  K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
2675  SrcVT);
2676  K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
2677  SrcVT);
2678  }
2679  // TODO: Should this propagate fast-math-flags?
2680  SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2681 
2682  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2683 
2684  SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2685 
2686  SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2687  : ISD::FP_TO_UINT,
2688  SL, MVT::i32, FloorMul);
2689  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2690 
2691  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2692  DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2693 
2694  if (Signed && SrcVT == MVT::f32) {
2695  assert(Sign);
2696  // Flip the result based on the signedness, which is either all 0s or 1s.
2697  Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2698  DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2699  // r := xor(r, sign) - sign;
2700  Result =
2701  DAG.getNode(ISD::SUB, SL, MVT::i64,
2702  DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2703  }
2704 
2705  return Result;
2706 }
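// Illustrative sketch (not part of this file): the unsigned f64 path of the
// split above, assuming <cstdint>, <cmath>, and an in-range input (as the
// lowering itself does).
static uint64_t fpToUI64Sketch(double Val) {
  double Tf = std::trunc(Val);
  double Hif = std::floor(Tf * 0x1.0p-32);    // tf * 2^-32, floored
  double Lof = std::fma(Hif, -0x1.0p+32, Tf); // tf - hif * 2^32, exact
  uint32_t Hi = (uint32_t)Hif;
  uint32_t Lo = (uint32_t)Lof; // always non-negative thanks to the floor
  return ((uint64_t)Hi << 32) | Lo;
}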
2707 
2708 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2709  SDLoc DL(Op);
2710  SDValue N0 = Op.getOperand(0);
2711 
2712  // Convert to target node to get known bits
2713  if (N0.getValueType() == MVT::f32)
2714  return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2715 
2716  if (getTargetMachine().Options.UnsafeFPMath) {
2717  // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2718  return SDValue();
2719  }
2720 
2721  assert(N0.getSimpleValueType() == MVT::f64);
2722 
2723  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2724  const unsigned ExpMask = 0x7ff;
2725  const unsigned ExpBiasf64 = 1023;
2726  const unsigned ExpBiasf16 = 15;
2727  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2728  SDValue One = DAG.getConstant(1, DL, MVT::i32);
2729  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2730  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2731  DAG.getConstant(32, DL, MVT::i64));
2732  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2733  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2734  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2735  DAG.getConstant(20, DL, MVT::i64));
2736  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2737  DAG.getConstant(ExpMask, DL, MVT::i32));
2738  // Subtract the fp64 exponent bias (1023) to get the real exponent and
2739  // add the f16 bias (15) to get the biased exponent for the f16 format.
2740  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2741  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2742 
2743  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2744  DAG.getConstant(8, DL, MVT::i32));
2745  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2746  DAG.getConstant(0xffe, DL, MVT::i32));
2747 
2748  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2749  DAG.getConstant(0x1ff, DL, MVT::i32));
2750  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2751 
2752  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2753  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2754 
2755  // (M != 0 ? 0x0200 : 0) | 0x7c00;
2756  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2757  DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2758  Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2759 
2760  // N = M | (E << 12);
2761  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2762  DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2763  DAG.getConstant(12, DL, MVT::i32)));
2764 
2765  // B = clamp(1-E, 0, 13);
2766  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2767  One, E);
2768  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2769  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2770  DAG.getConstant(13, DL, MVT::i32));
2771 
2772  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2773  DAG.getConstant(0x1000, DL, MVT::i32));
2774 
2775  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2776  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2777  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2778  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2779 
2780  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2781  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2782  DAG.getConstant(0x7, DL, MVT::i32));
2783  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2784  DAG.getConstant(2, DL, MVT::i32));
2785  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2786  One, Zero, ISD::SETEQ);
2787  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2788  One, Zero, ISD::SETGT);
2789  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2790  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2791 
2792  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2793  DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2794  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2795  I, V, ISD::SETEQ);
2796 
2797  // Extract the sign bit.
2798  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2799  DAG.getConstant(16, DL, MVT::i32));
2800  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2801  DAG.getConstant(0x8000, DL, MVT::i32));
2802 
2803  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2804  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2805 }
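// Illustrative sketch (not part of this file): the DAG sequence above
// transcribed to scalar C++, assuming <cstdint>; U is the bit pattern of
// the f64 and the result is the bit pattern of the f16.
static uint16_t fpToFp16Sketch(uint64_t U) {
  uint32_t UH = (uint32_t)(U >> 32), UL = (uint32_t)U;
  int32_t E = (int32_t)((UH >> 20) & 0x7ff) - 1023 + 15; // rebias exponent
  uint32_t M = (UH >> 8) & 0xffe;         // top mantissa bits, bit 0 free
  uint32_t MaskedSig = (UH & 0x1ff) | UL; // all dropped mantissa bits
  M |= MaskedSig ? 1 : 0;                 // sticky bit
  uint32_t I = (M ? 0x0200 : 0) | 0x7c00; // inf / quiet-NaN pattern
  uint32_t N = M | ((uint32_t)E << 12);   // normal-number candidate
  int32_t B = 1 - E;                      // denormalizing shift, clamped
  B = B < 0 ? 0 : (B > 13 ? 13 : B);
  uint32_t SigSetHigh = M | 0x1000;       // make the implicit bit explicit
  uint32_t D = SigSetHigh >> B;
  D |= ((D << B) != SigSetHigh) ? 1 : 0;  // sticky for the shifted-out bits
  uint32_t V = (E < 1) ? D : N;           // denormal vs. normal result
  uint32_t VLow3 = V & 7;
  V >>= 2;
  V += (VLow3 == 3 || VLow3 > 5) ? 1 : 0; // round to nearest, ties to even
  V = (E > 30) ? 0x7c00 : V;              // overflow to infinity
  V = (E == 1039) ? I : V;                // source exponent was all ones
  return (uint16_t)(V | ((UH >> 16) & 0x8000)); // merge the sign bit
}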
2806 
2807 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2808  SelectionDAG &DAG) const {
2809  SDValue Src = Op.getOperand(0);
2810  unsigned OpOpcode = Op.getOpcode();
2811  EVT SrcVT = Src.getValueType();
2812  EVT DestVT = Op.getValueType();
2813 
2814  // Will be selected natively
2815  if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2816  return Op;
2817 
2818  // Promote i16 to i32
2819  if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2820  SDLoc DL(Op);
2821 
2822  SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2823  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2824  }
2825 
2826  if (SrcVT == MVT::f16 ||
2827  (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2828  SDLoc DL(Op);
2829 
2830  SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2831  unsigned Ext =
2832      OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2833  return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2834  }
2835 
2836  if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2837  return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2838 
2839  return SDValue();
2840 }
2841 
2842 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2843  SelectionDAG &DAG) const {
2844  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2845  MVT VT = Op.getSimpleValueType();
2846  MVT ScalarVT = VT.getScalarType();
2847 
2848  assert(VT.isVector());
2849 
2850  SDValue Src = Op.getOperand(0);
2851  SDLoc DL(Op);
2852 
2853  // TODO: Don't scalarize on Evergreen?
2854  unsigned NElts = VT.getVectorNumElements();
2855  SmallVector<SDValue, 8> Args;
2856  DAG.ExtractVectorElements(Src, Args, 0, NElts);
2857 
2858  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2859  for (unsigned I = 0; I < NElts; ++I)
2860  Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2861 
2862  return DAG.getBuildVector(VT, DL, Args);
2863 }
2864 
2865 //===----------------------------------------------------------------------===//
2866 // Custom DAG optimizations
2867 //===----------------------------------------------------------------------===//
2868 
2869 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2870  return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2871 }
2872 
2873 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2874  EVT VT = Op.getValueType();
2875  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2876  // as unsigned 24-bit values.
2877  AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
2878 }
2879 
2880 SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
2881  TargetLowering::DAGCombinerInfo &DCI) const {
2882  SelectionDAG &DAG = DCI.DAG;
2883  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2884  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2885 
2886  SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2887  SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2888  unsigned NewOpcode = Node24->getOpcode();
2889  if (IsIntrin) {
2890  unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2891  switch (IID) {
2892  case Intrinsic::amdgcn_mul_i24:
2893  NewOpcode = AMDGPUISD::MUL_I24;
2894  break;
2895  case Intrinsic::amdgcn_mul_u24:
2896  NewOpcode = AMDGPUISD::MUL_U24;
2897  break;
2898  case Intrinsic::amdgcn_mulhi_i24:
2899  NewOpcode = AMDGPUISD::MULHI_I24;
2900  break;
2901  case Intrinsic::amdgcn_mulhi_u24:
2902  NewOpcode = AMDGPUISD::MULHI_U24;
2903  break;
2904  default:
2905  llvm_unreachable("Expected 24-bit mul intrinsic");
2906  }
2907  }
2908 
2909  APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2910 
2911  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2912  // the operands to have other uses, but will only perform simplifications that
2913  // involve bypassing some nodes for this user.
2914  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2915  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2916  if (DemandedLHS || DemandedRHS)
2917  return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2918  DemandedLHS ? DemandedLHS : LHS,
2919  DemandedRHS ? DemandedRHS : RHS);
2920 
2921  // Now try SimplifyDemandedBits which can simplify the nodes used by our
2922  // operands if this node is the only user.
2923  if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2924  return SDValue(Node24, 0);
2925  if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2926  return SDValue(Node24, 0);
2927 
2928  return SDValue();
2929 }
2930 
2931 template <typename IntTy>
2932 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2933  uint32_t Width, const SDLoc &DL) {
2934  if (Width + Offset < 32) {
2935  uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2936  IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2937  return DAG.getConstant(Result, DL, MVT::i32);
2938  }
2939 
2940  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2941 }
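// Illustrative sketch (not part of this file): the signed instantiation of
// the fold above at i32, assuming <cstdint> and Width in [1, 31]. Shifting
// the field up to bit 31 and arithmetic-shifting back sign-extends it.
static int32_t bfeI32Sketch(int32_t Src0, uint32_t Offset, uint32_t Width) {
  if (Width + Offset < 32) {
    uint32_t Shl = (uint32_t)Src0 << (32 - Offset - Width);
    return (int32_t)Shl >> (32 - Width);
  }
  return Src0 >> Offset; // field reaches the top bit: one shift suffices
}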
2942 
2943 static bool hasVolatileUser(SDNode *Val) {
2944  for (SDNode *U : Val->uses()) {
2945  if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2946  if (M->isVolatile())
2947  return true;
2948  }
2949  }
2950 
2951  return false;
2952 }
2953 
2954 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2955  // i32 vectors are the canonical memory type.
2956  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2957  return false;
2958 
2959  if (!VT.isByteSized())
2960  return false;
2961 
2962  unsigned Size = VT.getStoreSize();
2963 
2964  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2965  return false;
2966 
2967  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2968  return false;
2969 
2970  return true;
2971 }
2972 
2973 // Replace load of an illegal type with a store of a bitcast to a friendlier
2974 // type.
2975 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2976  DAGCombinerInfo &DCI) const {
2977  if (!DCI.isBeforeLegalize())
2978  return SDValue();
2979 
2980  LoadSDNode *LN = cast<LoadSDNode>(N);
2981  if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2982  return SDValue();
2983 
2984  SDLoc SL(N);
2985  SelectionDAG &DAG = DCI.DAG;
2986  EVT VT = LN->getMemoryVT();
2987 
2988  unsigned Size = VT.getStoreSize();
2989  Align Alignment = LN->getAlign();
2990  if (Alignment < Size && isTypeLegal(VT)) {
2991  bool IsFast;
2992  unsigned AS = LN->getAddressSpace();
2993 
2994  // Expand unaligned loads earlier than legalization. Due to visitation order
2995  // problems during legalization, the emitted instructions to pack and unpack
2996  // the bytes again are not eliminated in the case of an unaligned copy.
2997  if (!allowsMisalignedMemoryAccesses(
2998  VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
2999  SDValue Ops[2];
3000 
3001  if (VT.isVector())
3002  std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG);
3003  else
3004  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3005 
3006  return DAG.getMergeValues(Ops, SDLoc(N));
3007  }
3008 
3009  if (!IsFast)
3010  return SDValue();
3011  }
3012 
3013  if (!shouldCombineMemoryType(VT))
3014  return SDValue();
3015 
3016  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3017 
3018  SDValue NewLoad
3019  = DAG.getLoad(NewVT, SL, LN->getChain(),
3020  LN->getBasePtr(), LN->getMemOperand());
3021 
3022  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3023  DCI.CombineTo(N, BC, NewLoad.getValue(1));
3024  return SDValue(N, 0);
3025 }
3026 
3027 // Replace store of an illegal type with a store of a bitcast to a friendlier
3028 // type.
3029 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3030  DAGCombinerInfo &DCI) const {
3031  if (!DCI.isBeforeLegalize())
3032  return SDValue();
3033 
3034  StoreSDNode *SN = cast<StoreSDNode>(N);
3035  if (!SN->isSimple() || !ISD::isNormalStore(SN))
3036  return SDValue();
3037 
3038  EVT VT = SN->getMemoryVT();
3039  unsigned Size = VT.getStoreSize();
3040 
3041  SDLoc SL(N);
3042  SelectionDAG &DAG = DCI.DAG;
3043  Align Alignment = SN->getAlign();
3044  if (Alignment < Size && isTypeLegal(VT)) {
3045  bool IsFast;
3046  unsigned AS = SN->getAddressSpace();
3047 
3048  // Expand unaligned stores earlier than legalization. Due to visitation
3049  // order problems during legalization, the emitted instructions to pack and
3050  // unpack the bytes again are not eliminated in the case of an unaligned
3051  // copy.
3052  if (!allowsMisalignedMemoryAccesses(
3053  VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3054  if (VT.isVector())
3055  return scalarizeVectorStore(SN, DAG);
3056 
3057  return expandUnalignedStore(SN, DAG);
3058  }
3059 
3060  if (!IsFast)
3061  return SDValue();
3062  }
3063 
3064  if (!shouldCombineMemoryType(VT))
3065  return SDValue();
3066 
3067  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3068  SDValue Val = SN->getValue();
3069 
3070  //DCI.AddToWorklist(Val.getNode());
3071 
3072  bool OtherUses = !Val.hasOneUse();
3073  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3074  if (OtherUses) {
3075  SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3076  DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3077  }
3078 
3079  return DAG.getStore(SN->getChain(), SL, CastVal,
3080  SN->getBasePtr(), SN->getMemOperand());
3081 }
3082 
3083 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3084 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3085 // issues.
3086 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3087  DAGCombinerInfo &DCI) const {
3088  SelectionDAG &DAG = DCI.DAG;
3089  SDValue N0 = N->getOperand(0);
3090 
3091  // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3092  // (vt2 (truncate (assertzext vt0:x, vt1)))
3093  if (N0.getOpcode() == ISD::TRUNCATE) {
3094  SDValue N1 = N->getOperand(1);
3095  EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3096  SDLoc SL(N);
3097 
3098  SDValue Src = N0.getOperand(0);
3099  EVT SrcVT = Src.getValueType();
3100  if (SrcVT.bitsGE(ExtVT)) {
3101  SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3102  return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3103  }
3104  }
3105 
3106  return SDValue();
3107 }
3108 
3109 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3110  SDNode *N, DAGCombinerInfo &DCI) const {
3111  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3112  switch (IID) {
3113  case Intrinsic::amdgcn_mul_i24:
3114  case Intrinsic::amdgcn_mul_u24:
3115  case Intrinsic::amdgcn_mulhi_i24:
3116  case Intrinsic::amdgcn_mulhi_u24:
3117  return simplifyMul24(N, DCI);
3118  case Intrinsic::amdgcn_fract:
3119  case Intrinsic::amdgcn_rsq:
3120  case Intrinsic::amdgcn_rcp_legacy:
3121  case Intrinsic::amdgcn_rsq_legacy:
3122  case Intrinsic::amdgcn_rsq_clamp:
3123  case Intrinsic::amdgcn_ldexp: {
3124  // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3125  SDValue Src = N->getOperand(1);
3126  return Src.isUndef() ? Src : SDValue();
3127  }
3128  default:
3129  return SDValue();
3130  }
3131 }
3132 
3133 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3134 /// binary operation \p Opc to it with the corresponding constant operands.
3135 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3136  DAGCombinerInfo &DCI, const SDLoc &SL,
3137  unsigned Opc, SDValue LHS,
3138  uint32_t ValLo, uint32_t ValHi) const {
3139  SelectionDAG &DAG = DCI.DAG;
3140  SDValue Lo, Hi;
3141  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3142 
3143  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3144  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3145 
3146  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3147  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3148 
3149  // Re-visit the ands. It's possible we eliminated one of them and it could
3150  // simplify the vector.
3151  DCI.AddToWorklist(Lo.getNode());
3152  DCI.AddToWorklist(Hi.getNode());
3153 
3154  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3155  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3156 }
3157 
3158 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3159  DAGCombinerInfo &DCI) const {
3160  EVT VT = N->getValueType(0);
3161 
3162  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3163  if (!RHS)
3164  return SDValue();
3165 
3166  SDValue LHS = N->getOperand(0);
3167  unsigned RHSVal = RHS->getZExtValue();
3168  if (!RHSVal)
3169  return LHS;
3170 
3171  SDLoc SL(N);
3172  SelectionDAG &DAG = DCI.DAG;
3173 
3174  switch (LHS->getOpcode()) {
3175  default:
3176  break;
3177  case ISD::ZERO_EXTEND:
3178  case ISD::SIGN_EXTEND:
3179  case ISD::ANY_EXTEND: {
3180  SDValue X = LHS->getOperand(0);
3181 
3182  if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3183  isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3184  // Prefer build_vector as the canonical form if packed types are legal.
3185  // (shl ([asz]ext i16:x), 16) -> (build_vector 0, x)
3186  SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3187  { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3188  return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3189  }
3190 
3191  // shl (ext x) => zext (shl x), if shift does not overflow int
3192  if (VT != MVT::i64)
3193  break;
3194  KnownBits Known = DAG.computeKnownBits(X);
3195  unsigned LZ = Known.countMinLeadingZeros();
3196  if (LZ < RHSVal)
3197  break;
3198  EVT XVT = X.getValueType();
3199  SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3200  return DAG.getZExtOrTrunc(Shl, SL, VT);
3201  }
3202  }
3203 
3204  if (VT != MVT::i64)
3205  return SDValue();
3206 
3207  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3208 
3209  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3210  // common case, splitting this into a move and a 32-bit shift is faster and
3211  // the same code size.
3212  if (RHSVal < 32)
3213  return SDValue();
3214 
3215  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3216 
3217  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3218  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3219 
3220  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3221 
3222  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3223  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3224 }
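// Illustrative sketch (not part of this file): for shift amounts >= 32 the
// low result word is zero, so one 32-bit shift of the low source word plus a
// build_pair reproduces the 64-bit shift; assumes <cstdint> and C in [32, 63].
static uint64_t shl64Sketch(uint64_t X, unsigned C) {
  uint32_t NewShift = (uint32_t)X << (C - 32);
  return (uint64_t)NewShift << 32; // build_pair 0, (shl lo(x), C - 32)
}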
3225 
3226 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3227  DAGCombinerInfo &DCI) const {
3228  if (N->getValueType(0) != MVT::i64)
3229  return SDValue();
3230 
3231  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3232  if (!RHS)
3233  return SDValue();
3234 
3235  SelectionDAG &DAG = DCI.DAG;
3236  SDLoc SL(N);
3237  unsigned RHSVal = RHS->getZExtValue();
3238 
3239  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3240  if (RHSVal == 32) {
3241  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3242  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3243  DAG.getConstant(31, SL, MVT::i32));
3244 
3245  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3246  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3247  }
3248 
3249  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3250  if (RHSVal == 63) {
3251  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3252  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3253  DAG.getConstant(31, SL, MVT::i32));
3254  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3255  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3256  }
3257 
3258  return SDValue();
3259 }
3260 
3261 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3262  DAGCombinerInfo &DCI) const {
3263  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3264  if (!RHS)
3265  return SDValue();
3266 
3267  EVT VT = N->getValueType(0);
3268  SDValue LHS = N->getOperand(0);
3269  unsigned ShiftAmt = RHS->getZExtValue();
3270  SelectionDAG &DAG = DCI.DAG;
3271  SDLoc SL(N);
3272 
3273  // fold (srl (and x, (c1 << c2)), c2) -> (and (srl x, c2), c1)
3274  // this improves the ability to match BFE patterns in isel.
3275  if (LHS.getOpcode() == ISD::AND) {
3276  if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3277  if (Mask->getAPIntValue().isShiftedMask() &&
3278  Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
3279  return DAG.getNode(
3280  ISD::AND, SL, VT,
3281  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3282  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3283  }
3284  }
3285  }
3286 
3287  if (VT != MVT::i64)
3288  return SDValue();
3289 
3290  if (ShiftAmt < 32)
3291  return SDValue();
3292 
3293  // srl i64:x, C for C >= 32
3294  // =>
3295  // build_pair (srl hi_32(x), C - 32), 0
3296  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3297 
3298  SDValue Hi = getHiHalf64(LHS, DAG);
3299 
3300  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3301  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3302 
3303  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3304 
3305  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3306 }
3307 
3308 SDValue AMDGPUTargetLowering::performTruncateCombine(
3309  SDNode *N, DAGCombinerInfo &DCI) const {
3310  SDLoc SL(N);
3311  SelectionDAG &DAG = DCI.DAG;
3312  EVT VT = N->getValueType(0);
3313  SDValue Src = N->getOperand(0);
3314 
3315  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3316  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3317  SDValue Vec = Src.getOperand(0);
3318  if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3319  SDValue Elt0 = Vec.getOperand(0);
3320  EVT EltVT = Elt0.getValueType();
3321  if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3322  if (EltVT.isFloatingPoint()) {
3323  Elt0 = DAG.getNode(ISD::BITCAST, SL,
3324  EltVT.changeTypeToInteger(), Elt0);
3325  }
3326 
3327  return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3328  }
3329  }
3330  }
3331 
3332  // Equivalent of above for accessing the high element of a vector as an
3333  // integer operation.
3334  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3335  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3336  if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3337  if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3338  SDValue BV = stripBitcast(Src.getOperand(0));
3339  if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3340  BV.getValueType().getVectorNumElements() == 2) {
3341  SDValue SrcElt = BV.getOperand(1);
3342  EVT SrcEltVT = SrcElt.getValueType();
3343  if (SrcEltVT.isFloatingPoint()) {
3344  SrcElt = DAG.getNode(ISD::BITCAST, SL,
3345  SrcEltVT.changeTypeToInteger(), SrcElt);
3346  }
3347 
3348  return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3349  }
3350  }
3351  }
3352  }
3353 
3354  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3355  //
3356  // i16 (trunc (srl i64:x, K)), K <= 16 ->
3357  // i16 (trunc (srl (i32 (trunc x), K)))
3358  if (VT.getScalarSizeInBits() < 32) {
3359  EVT SrcVT = Src.getValueType();
3360  if (SrcVT.getScalarSizeInBits() > 32 &&
3361  (Src.getOpcode() == ISD::SRL ||
3362  Src.getOpcode() == ISD::SRA ||
3363  Src.getOpcode() == ISD::SHL)) {
3364  SDValue Amt = Src.getOperand(1);
3365  KnownBits Known = DAG.computeKnownBits(Amt);
3366  unsigned Size = VT.getScalarSizeInBits();
3367  if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3368  (Known.countMaxActiveBits() <= Log2_32(Size))) {
3369  EVT MidVT = VT.isVector() ?
3370  EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3371  VT.getVectorNumElements()) : MVT::i32;
3372 
3373  EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3374  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3375  Src.getOperand(0));
3376  DCI.AddToWorklist(Trunc.getNode());
3377 
3378  if (Amt.getValueType() != NewShiftVT) {
3379  Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3380  DCI.AddToWorklist(Amt.getNode());
3381  }
3382 
3383  SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3384  Trunc, Amt);
3385  return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3386  }
3387  }
3388  }
3389 
3390  return SDValue();
3391 }
3392 
3393 // We need to specifically handle i64 mul here to avoid unnecessary conversion
3394 // instructions. If we only match on the legalized i64 mul expansion,
3395 // SimplifyDemandedBits will be unable to remove them because there will be
3396 // multiple uses due to the separate mul + mulh[su].
3397 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3398  SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3399  if (Size <= 32) {
3400  unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3401  return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3402  }
3403 
3404  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3405  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3406 
3407  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3408  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3409 
3410  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3411 }
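// Illustrative sketch (not part of this file): with both operands known to
// fit in 24 bits, the full product fits in 48 bits, so the mul24/mulhi24
// pair recovers the exact 64-bit result; MulLo/MulHi below model
// MUL_U24 / MULHI_U24. Assumes <cstdint>.
static uint64_t mulU24PairSketch(uint32_t N0, uint32_t N1) {
  uint64_t Full = (uint64_t)(N0 & 0xffffff) * (N1 & 0xffffff);
  uint32_t MulLo = (uint32_t)Full;         // MUL_U24
  uint32_t MulHi = (uint32_t)(Full >> 32); // MULHI_U24
  return ((uint64_t)MulHi << 32) | MulLo;  // BUILD_PAIR
}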
3412 
3413 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3414  DAGCombinerInfo &DCI) const {
3415  EVT VT = N->getValueType(0);
3416 
3417  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3418  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3419  // unnecessarily). isDivergent() is used as an approximation of whether the
3420  // value is in an SGPR.
3421  if (!N->isDivergent())
3422  return SDValue();
3423 
3424  unsigned Size = VT.getSizeInBits();
3425  if (VT.isVector() || Size > 64)
3426  return SDValue();
3427 
3428  // There are i16 integer mul/mad.
3429  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3430  return SDValue();
3431 
3432  SelectionDAG &DAG = DCI.DAG;
3433  SDLoc DL(N);
3434 
3435  SDValue N0 = N->getOperand(0);
3436  SDValue N1 = N->getOperand(1);
3437 
3438  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3439  // in the source into any_extends if the result of the mul is truncated. Since
3440  // we can assume the high bits are whatever we want, use the underlying value
3441  // to avoid the unknown high bits from interfering.
3442  if (N0.getOpcode() == ISD::ANY_EXTEND)
3443  N0 = N0.getOperand(0);
3444 
3445  if (N1.getOpcode() == ISD::ANY_EXTEND)
3446  N1 = N1.getOperand(0);
3447 
3448  SDValue Mul;
3449 
3450  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3451  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3452  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3453  Mul = getMul24(DAG, DL, N0, N1, Size, false);
3454  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3455  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3456  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3457  Mul = getMul24(DAG, DL, N0, N1, Size, true);
3458  } else {
3459  return SDValue();
3460  }
3461 
3462  // We need to use sext even for MUL_U24, because MUL_U24 is used
3463  // for signed multiply of 8 and 16-bit types.
3464  return DAG.getSExtOrTrunc(Mul, DL, VT);
3465 }
3466 
3467 SDValue
3468 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
3469  DAGCombinerInfo &DCI) const {
3470  if (N->getValueType(0) != MVT::i32)
3471  return SDValue();
3472 
3473  SelectionDAG &DAG = DCI.DAG;
3474  SDLoc DL(N);
3475 
3476  SDValue N0 = N->getOperand(0);
3477  SDValue N1 = N->getOperand(1);
3478 
3479  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3480  // in the source into any_extends if the result of the mul is truncated. Since
3481  // we can assume the high bits are whatever we want, use the underlying value
3482  // to avoid the unknown high bits from interfering.
3483  if (N0.getOpcode() == ISD::ANY_EXTEND)
3484  N0 = N0.getOperand(0);
3485  if (N1.getOpcode() == ISD::ANY_EXTEND)
3486  N1 = N1.getOperand(0);
3487 
3488  // Try to use two fast 24-bit multiplies (one for each half of the result)
3489  // instead of one slow extending multiply.
3490  unsigned LoOpcode, HiOpcode;
3491  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3492  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3493  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3494  LoOpcode = AMDGPUISD::MUL_U24;
3495  HiOpcode = AMDGPUISD::MULHI_U24;
3496  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3497  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3498  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3499  LoOpcode = AMDGPUISD::MUL_I24;
3500  HiOpcode = AMDGPUISD::MULHI_I24;
3501  } else {
3502  return SDValue();
3503  }
3504 
3505  SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
3506  SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
3507  DCI.CombineTo(N, Lo, Hi);
3508  return SDValue(N, 0);
3509 }
3510 
3511 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3512  DAGCombinerInfo &DCI) const {
3513  EVT VT = N->getValueType(0);
3514 
3515  if (!Subtarget->hasMulI24() || VT.isVector())
3516  return SDValue();
3517 
3518  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3519  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3520  // unnecessarily). isDivergent() is used as an approximation of whether the
3521  // value is in an SGPR.
3522  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3523  // valu op anyway)
3524  if (Subtarget->hasSMulHi() && !N->isDivergent())
3525  return SDValue();
3526 
3527  SelectionDAG &DAG = DCI.DAG;
3528  SDLoc DL(N);
3529 
3530  SDValue N0 = N->getOperand(0);
3531  SDValue N1 = N->getOperand(1);
3532 
3533  if (!isI24(N0, DAG) || !isI24(N1, DAG))
3534  return SDValue();
3535 
3536  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3537  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3538 
3539  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3540  DCI.AddToWorklist(Mulhi.getNode());
3541  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3542 }
3543 
3544 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3545  DAGCombinerInfo &DCI) const {
3546  EVT VT = N->getValueType(0);
3547 
3548  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3549  return SDValue();
3550 
3551  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3552  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3553  // unnecessarily). isDivergent() is used as an approximation of whether the
3554  // value is in an SGPR.
3555  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3556  // valu op anyway)
3557  if (Subtarget->hasSMulHi() && !N->isDivergent())
3558  return SDValue();
3559 
3560  SelectionDAG &DAG = DCI.DAG;
3561  SDLoc DL(N);
3562 
3563  SDValue N0 = N->getOperand(0);
3564  SDValue N1 = N->getOperand(1);
3565 
3566  if (!isU24(N0, DAG) || !isU24(N1, DAG))
3567  return SDValue();
3568 
3569  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3570  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3571 
3572  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3573  DCI.AddToWorklist(Mulhi.getNode());
3574  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3575 }
3576 
3577 static bool isNegativeOne(SDValue Val) {
3578  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3579  return C->isAllOnes();
3580  return false;
3581 }
3582 
3583 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3584  SDValue Op,
3585  const SDLoc &DL,
3586  unsigned Opc) const {
3587  EVT VT = Op.getValueType();
3588  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3589  if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3590  LegalVT != MVT::i16))
3591  return SDValue();
3592 
3593  if (VT != MVT::i32)
3594  Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3595 
3596  SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3597  if (VT != MVT::i32)
3598  FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3599 
3600  return FFBX;
3601 }
3602 
3603 // The native instructions return -1 on 0 input. Optimize out a select that
3604 // produces -1 on 0.
3605 //
3606 // TODO: If zero is not undef, we could also do this if the output is compared
3607 // against the bitwidth.
3608 //
3609 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3610 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3611  SDValue LHS, SDValue RHS,
3612  DAGCombinerInfo &DCI) const {
3613  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3614  if (!CmpRhs || !CmpRhs->isZero())
3615  return SDValue();
3616 
3617  SelectionDAG &DAG = DCI.DAG;
3618  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3619  SDValue CmpLHS = Cond.getOperand(0);
3620 
3621  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3622  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3623  if (CCOpcode == ISD::SETEQ &&
3624  (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3625  RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3626  unsigned Opc =
3627      isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3628  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3629  }
3630 
3631  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3632  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3633  if (CCOpcode == ISD::SETNE &&
3634  (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3635  LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3636  unsigned Opc =
3637      isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3638 
3639  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3640  }
3641 
3642  return SDValue();
3643 }
3644 
3645 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3646  unsigned Op,
3647  const SDLoc &SL,
3648  SDValue Cond,
3649  SDValue N1,
3650  SDValue N2) {
3651  SelectionDAG &DAG = DCI.DAG;
3652  EVT VT = N1.getValueType();
3653 
3654  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3655  N1.getOperand(0), N2.getOperand(0));
3656  DCI.AddToWorklist(NewSelect.getNode());
3657  return DAG.getNode(Op, SL, VT, NewSelect);
3658 }
3659 
3660 // Pull a free FP operation out of a select so it may fold into uses.
3661 //
3662 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3663 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3664 //
3665 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3666 // select c, (fabs x), +k -> fabs (select c, x, k)
3667 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3668  SDValue N) {
3669  SelectionDAG &DAG = DCI.DAG;
3670  SDValue Cond = N.getOperand(0);
3671  SDValue LHS = N.getOperand(1);
3672  SDValue RHS = N.getOperand(2);
3673 
3674  EVT VT = N.getValueType();
3675  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3676  (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3677  return