1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This is the parent TargetLowering class for hardware code gen
11 /// targets.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUISelLowering.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUMachineFunction.h"
19 #include "GCNSubtarget.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/CodeGen/Analysis.h"
22 #include "llvm/IR/DiagnosticInfo.h"
23 #include "llvm/IR/IntrinsicsAMDGPU.h"
24 #include "llvm/Support/CommandLine.h"
25 #include "llvm/Support/KnownBits.h"
26 #include "llvm/Target/TargetMachine.h"
27 
28 using namespace llvm;
29 
30 #include "AMDGPUGenCallingConv.inc"
31 
33  "amdgpu-bypass-slow-div",
34  cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
35  cl::init(true));
36 
37 // Find a larger type to do a load / store of a vector with.
38 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
39  unsigned StoreSize = VT.getStoreSizeInBits();
40  if (StoreSize <= 32)
41  return EVT::getIntegerVT(Ctx, StoreSize);
42 
43  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
44  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
45 }
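// e.g. an f16 store (16 bits) maps to i16, an f64 store (64 bits) maps to
// v2i32, and a v4f32 store (128 bits) maps to v4i32.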
46 
47 unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
48  EVT VT = Op.getValueType();
49  KnownBits Known = DAG.computeKnownBits(Op);
50  return VT.getSizeInBits() - Known.countMinLeadingZeros();
51 }
52 
53 unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
54  EVT VT = Op.getValueType();
55 
56  // In order for this to be a signed 24-bit value, bit 23 must be a sign
57  // bit.
58  return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
59 }
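// e.g. an i32 whose top nine bits all replicate the sign bit has
// ComputeNumSignBits(Op) == 9, so this returns 23: the value fits in a
// signed 24-bit integer (the property LowerDIVREM24 below checks for).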
60 
61 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
62  const AMDGPUSubtarget &STI)
63  : TargetLowering(TM), Subtarget(&STI) {
64  // Lower floating point store/load to integer store/load to reduce the number
65  // of patterns in tablegen.
68 
71 
74 
77 
80 
83 
86 
89 
92 
95 
98 
101 
104 
107 
110 
113 
116 
119 
122 
125 
128 
131 
132  // There are no 64-bit extloads. These should be done as a 32-bit extload and
133  // an extension to 64-bit.
134  for (MVT VT : MVT::integer_valuetypes()) {
135  setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
136  setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
137  setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
138  }
139 
140  for (MVT VT : MVT::integer_valuetypes()) {
141  if (VT == MVT::i64)
142  continue;
143 
148 
153 
158  }
159 
176  }
177 
185 
192 
199 
202 
205 
208 
211 
214 
217 
220 
223 
226 
229 
232 
235 
238 
241 
244 
247 
250 
253 
256 
259 
262 
265 
270 
275 
283 
286 
289 
294 
299 
302 
310 
315 
318 
319  // This is totally unsupported, just custom lower to produce an error.
321 
322  // Library functions. These default to Expand, but we have instructions
323  // for them.
334 
337 
341 
342 
345 
349 
350  // Expand to fneg + fadd.
352 
395 
399 
400  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
401  for (MVT VT : ScalarIntVTs) {
402  // These should use [SU]DIVREM, so set them to expand
407 
408  // GPU does not have divrem function for signed or unsigned.
411 
412  // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
415 
419 
420  // AMDGPU uses ADDC/SUBC/ADDE/SUBE
425  }
426 
427  // The hardware supports 32-bit FSHR, but not FSHL.
429 
430  // The hardware supports 32-bit ROTR, but not ROTL.
434 
437 
446 
451 
456 
457  static const MVT::SimpleValueType VectorIntTypes[] = {
459 
460  for (MVT VT : VectorIntTypes) {
461  // Expand the following operations for the current type by default.
496  }
497 
498  static const MVT::SimpleValueType FloatVectorTypes[] = {
500 
501  for (MVT VT : FloatVectorTypes) {
532  }
533 
534  // This causes an unrolled select operation to be used rather than expansion with
535  // bit operations. This is in general better, but the alternative using BFI
536  // instructions may be better if the select sources are SGPRs.
539 
542 
545 
548 
551 
554 
555  // There are no libcalls of any kind.
556  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
557  setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
558 
560  setJumpIsExpensive(true);
561 
562  // FIXME: This is only partially true. If we have to do vector compares, any
563  // SGPR pair can be a condition register. If we have a uniform condition, we
564  // are better off doing SALU operations, where there is only one SCC. For now,
565  // we don't have a way of knowing during instruction selection if a condition
566  // will be uniform and we always use vector compares. Assume we are using
567  // vector compares until that is fixed.
569 
572 
574 
575  // We want to find all load dependencies for long chains of stores to enable
576  // merging into very wide vectors. The problem is with vectors with > 4
577  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
578  // vectors are a legal type, even though we have to split the loads
579  // usually. When we can more precisely specify load legality per address
580  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
581  // smarter so that they can figure out what to do in 2 iterations without all
582  // N > 4 stores on the same chain.
584 
585  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
586  // about these during lowering.
587  MaxStoresPerMemcpy = 0xffffffff;
588  MaxStoresPerMemmove = 0xffffffff;
589  MaxStoresPerMemset = 0xffffffff;
590 
591  // The expansion for 64-bit division is enormous.
592  if (AMDGPUBypassSlowDiv)
593  addBypassSlowDiv(64, 32);
594 
613 }
614 
615 bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
616  if (getTargetMachine().Options.NoSignedZerosFPMath)
617  return true;
618 
619  const auto Flags = Op.getNode()->getFlags();
620  if (Flags.hasNoSignedZeros())
621  return true;
622 
623  return false;
624 }
625 
626 //===----------------------------------------------------------------------===//
627 // Target Information
628 //===----------------------------------------------------------------------===//
629 
630 LLVM_READNONE
631 static bool fnegFoldsIntoOp(unsigned Opc) {
632  switch (Opc) {
633  case ISD::FADD:
634  case ISD::FSUB:
635  case ISD::FMUL:
636  case ISD::FMA:
637  case ISD::FMAD:
638  case ISD::FMINNUM:
639  case ISD::FMAXNUM:
640  case ISD::FMINNUM_IEEE:
641  case ISD::FMAXNUM_IEEE:
642  case ISD::FSIN:
643  case ISD::FTRUNC:
644  case ISD::FRINT:
645  case ISD::FNEARBYINT:
646  case ISD::FCANONICALIZE:
647  case AMDGPUISD::RCP:
648  case AMDGPUISD::RCP_LEGACY:
649  case AMDGPUISD::RCP_IFLAG:
650  case AMDGPUISD::SIN_HW:
651  case AMDGPUISD::FMUL_LEGACY:
652  case AMDGPUISD::FMIN_LEGACY:
653  case AMDGPUISD::FMAX_LEGACY:
654  case AMDGPUISD::FMED3:
655  // TODO: handle llvm.amdgcn.fma.legacy
656  return true;
657  default:
658  return false;
659  }
660 }
661 
662 /// \returns true if the operation will definitely need to use a 64-bit
663 /// encoding, and thus will use a VOP3 encoding regardless of the source
664 /// modifiers.
665 LLVM_READNONE
666 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
667  return N->getNumOperands() > 2 || VT == MVT::f64;
668 }
669 
670 // Most FP instructions support source modifiers, but this could be refined
671 // slightly.
672 LLVM_READNONE
673 static bool hasSourceMods(const SDNode *N) {
674  if (isa<MemSDNode>(N))
675  return false;
676 
677  switch (N->getOpcode()) {
678  case ISD::CopyToReg:
679  case ISD::SELECT:
680  case ISD::FDIV:
681  case ISD::FREM:
682  case ISD::INLINEASM:
683  case ISD::INLINEASM_BR:
684  case AMDGPUISD::DIV_SCALE:
685  case ISD::INTRINSIC_W_CHAIN:
686 
687  // TODO: Should really be looking at the users of the bitcast. These are
688  // problematic because bitcasts are used to legalize all stores to integer
689  // types.
690  case ISD::BITCAST:
691  return false;
692  case ISD::INTRINSIC_WO_CHAIN: {
693  switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
694  case Intrinsic::amdgcn_interp_p1:
695  case Intrinsic::amdgcn_interp_p2:
696  case Intrinsic::amdgcn_interp_mov:
697  case Intrinsic::amdgcn_interp_p1_f16:
698  case Intrinsic::amdgcn_interp_p2_f16:
699  return false;
700  default:
701  return true;
702  }
703  }
704  default:
705  return true;
706  }
707 }
708 
709 bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
710  unsigned CostThreshold) {
711  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
712  // it is truly free to use a source modifier in all cases. If there are
713  // multiple users, and each one would necessitate using VOP3, there will be
714  // a code size increase. Try to avoid increasing code size unless we know it
715  // will save on the instruction count.
716  unsigned NumMayIncreaseSize = 0;
717  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
718 
719  // XXX - Should this limit number of uses to check?
720  for (const SDNode *U : N->uses()) {
721  if (!hasSourceMods(U))
722  return false;
723 
724  if (!opMustUseVOP3Encoding(U, VT)) {
725  if (++NumMayIncreaseSize > CostThreshold)
726  return false;
727  }
728  }
729 
730  return true;
731 }
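// e.g. an fneg whose only user is an FMA is free to fold, since the
// three-operand FMA is VOP3-encoded anyway; an fneg feeding many plain f32
// multiplies (VOP2-sized) may be rejected once the size threshold is hit.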
732 
733 EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
734  ISD::NodeType ExtendKind) const {
735  assert(!VT.isVector() && "only scalar expected");
736 
737  // Round to the next multiple of 32-bits.
738  unsigned Size = VT.getSizeInBits();
739  if (Size <= 32)
740  return MVT::i32;
741  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
742 }
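// e.g. i1, i8, i16, and i24 all round up to i32, while i33 through i64
// become i64 and i65 becomes i96.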
743 
744 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
745  return MVT::i32;
746 }
747 
748 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
749  return true;
750 }
751 
752 // The backend supports 32 and 64 bit floating point immediates.
753 // FIXME: Why are we reporting vectors of FP immediates as legal?
754 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
755  bool ForCodeSize) const {
756  EVT ScalarVT = VT.getScalarType();
757  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
758  (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
759 }
760 
761 // We don't want to shrink f64 / f32 constants.
762 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
763  EVT ScalarVT = VT.getScalarType();
764  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
765 }
766 
767 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
768  ISD::LoadExtType ExtTy,
769  EVT NewVT) const {
770  // TODO: This may be worth removing. Check regression tests for diffs.
771  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
772  return false;
773 
774  unsigned NewSize = NewVT.getStoreSizeInBits();
775 
776  // If we are reducing to a 32-bit load or a smaller multi-dword load,
777  // this is always better.
778  if (NewSize >= 32)
779  return true;
780 
781  EVT OldVT = N->getValueType(0);
782  unsigned OldSize = OldVT.getStoreSizeInBits();
783 
784  MemSDNode *MN = cast<MemSDNode>(N);
785  unsigned AS = MN->getAddressSpace();
786  // Do not shrink an aligned scalar load to sub-dword.
787  // Scalar engine cannot do sub-dword loads.
788  if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
789  (AS == AMDGPUAS::CONSTANT_ADDRESS ||
790  AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
791  (isa<LoadSDNode>(N) &&
792  AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
793  AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
794  return false;
795 
796  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
797  // extloads, so doing one requires using a buffer_load. In cases where we
798  // still couldn't use a scalar load, using the wider load shouldn't really
799  // hurt anything.
800 
801  // If the old size already had to be an extload, there's no harm in continuing
802  // to reduce the width.
803  return (OldSize < 32);
804 }
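// e.g. narrowing a 64-bit load to 32 bits is always accepted above, while
// narrowing an aligned, uniform 32-bit constant-address load to 16 bits is
// rejected because the scalar unit cannot do sub-dword loads.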
805 
806 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
807  const SelectionDAG &DAG,
808  const MachineMemOperand &MMO) const {
809 
810  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
811 
812  if (LoadTy.getScalarType() == MVT::i32)
813  return false;
814 
815  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
816  unsigned CastScalarSize = CastTy.getScalarSizeInBits();
817 
818  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
819  return false;
820 
821  bool Fast = false;
822  return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
823  CastTy, MMO, &Fast) &&
824  Fast;
825 }
826 
827 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
828 // profitable with the expansion for 64-bit since it's generally good to
829 // speculate things.
830 // FIXME: These should really have the size as a parameter.
831 bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
832  return true;
833 }
834 
835 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
836  return true;
837 }
838 
839 bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
840  switch (N->getOpcode()) {
841  case ISD::EntryToken:
842  case ISD::TokenFactor:
843  return true;
844  case ISD::INTRINSIC_WO_CHAIN: {
845  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
846  switch (IntrID) {
847  case Intrinsic::amdgcn_readfirstlane:
848  case Intrinsic::amdgcn_readlane:
849  return true;
850  }
851  return false;
852  }
853  case ISD::LOAD:
854  if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
855  AMDGPUAS::CONSTANT_ADDRESS_32BIT)
856  return true;
857  return false;
858  }
859  return false;
860 }
861 
862 SDValue AMDGPUTargetLowering::getNegatedExpression(
863  SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
864  NegatibleCost &Cost, unsigned Depth) const {
865 
866  switch (Op.getOpcode()) {
867  case ISD::FMA:
868  case ISD::FMAD: {
869  // Negating a fma is not free if it has users without source mods.
870  if (!allUsesHaveSourceMods(Op.getNode()))
871  return SDValue();
872  break;
873  }
874  default:
875  break;
876  }
877 
878  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
879  ForCodeSize, Cost, Depth);
880 }
881 
882 //===---------------------------------------------------------------------===//
883 // Target Properties
884 //===---------------------------------------------------------------------===//
885 
886 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
887  assert(VT.isFloatingPoint());
888 
889  // Packed operations do not have a fabs modifier.
890  return VT == MVT::f32 || VT == MVT::f64 ||
891  (Subtarget->has16BitInsts() && VT == MVT::f16);
892 }
893 
894 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
895  assert(VT.isFloatingPoint());
896  // Report this based on the end legalized type.
897  VT = VT.getScalarType();
898  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
899 }
900 
901 bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
902  unsigned NumElem,
903  unsigned AS) const {
904  return true;
905 }
906 
907 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
908  // There are few operations which truly have vector input operands. Any vector
909  // operation is going to involve operations on each component, and a
910  // build_vector will be a copy per element, so it always makes sense to use a
911  // build_vector input in place of the extracted element to avoid a copy into a
912  // super register.
913  //
914  // We should probably only do this if all users are extracts only, but this
915  // should be the common case.
916  return true;
917 }
918 
919 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
920  // Truncate is just accessing a subregister.
921 
922  unsigned SrcSize = Source.getSizeInBits();
923  unsigned DestSize = Dest.getSizeInBits();
924 
925  return DestSize < SrcSize && DestSize % 32 == 0;
926 }
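// e.g. i64 -> i32 and i128 -> i64 are free (just take the low subregister),
// but i64 -> i16 is not, since 16 is not a multiple of 32.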
927 
928 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
929  // Truncate is just accessing a subregister.
930 
931  unsigned SrcSize = Source->getScalarSizeInBits();
932  unsigned DestSize = Dest->getScalarSizeInBits();
933 
934  if (DestSize == 16 && Subtarget->has16BitInsts())
935  return SrcSize >= 32;
936 
937  return DestSize < SrcSize && DestSize % 32 == 0;
938 }
939 
940 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
941  unsigned SrcSize = Src->getScalarSizeInBits();
942  unsigned DestSize = Dest->getScalarSizeInBits();
943 
944  if (SrcSize == 16 && Subtarget->has16BitInsts())
945  return DestSize >= 32;
946 
947  return SrcSize == 32 && DestSize == 64;
948 }
949 
950 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
951  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
952  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
953  // this will enable reducing 64-bit operations to 32-bit, which is always
954  // good.
955 
956  if (Src == MVT::i16)
957  return Dest == MVT::i32 || Dest == MVT::i64;
958 
959  return Src == MVT::i32 && Dest == MVT::i64;
960 }
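// e.g. zext i32 -> i64 just writes a zero to the high register of the
// 64-bit pair, and zext i16 -> i32 is free because 16-bit values already
// live in 32-bit registers.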
961 
962 bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
963  return isZExtFree(Val.getValueType(), VT2);
964 }
965 
966 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
967  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
968  // limited number of native 64-bit operations. Shrinking an operation to fit
969  // in a single 32-bit register should always be helpful. As currently used,
970  // this is much less general than the name suggests, and is only used in
971  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
972  // not profitable, and may actually be harmful.
973  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
974 }
975 
976 //===---------------------------------------------------------------------===//
977 // TargetLowering Callbacks
978 //===---------------------------------------------------------------------===//
979 
980 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
981  bool IsVarArg) {
982  switch (CC) {
983  case CallingConv::AMDGPU_VS:
984  case CallingConv::AMDGPU_GS:
985  case CallingConv::AMDGPU_PS:
986  case CallingConv::AMDGPU_CS:
987  case CallingConv::AMDGPU_HS:
988  case CallingConv::AMDGPU_ES:
989  case CallingConv::AMDGPU_LS:
990  return CC_AMDGPU;
991  case CallingConv::C:
992  case CallingConv::Fast:
993  case CallingConv::Cold:
994  return CC_AMDGPU_Func;
995  case CallingConv::AMDGPU_Gfx:
996  return CC_SI_Gfx;
997  case CallingConv::AMDGPU_KERNEL:
998  case CallingConv::SPIR_KERNEL:
999  default:
1000  report_fatal_error("Unsupported calling convention for call");
1001  }
1002 }
1003 
1004 CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
1005  bool IsVarArg) {
1006  switch (CC) {
1007  case CallingConv::AMDGPU_KERNEL:
1008  case CallingConv::SPIR_KERNEL:
1009  llvm_unreachable("kernels should not be handled here");
1010  case CallingConv::AMDGPU_VS:
1011  case CallingConv::AMDGPU_GS:
1012  case CallingConv::AMDGPU_PS:
1013  case CallingConv::AMDGPU_CS:
1014  case CallingConv::AMDGPU_HS:
1015  case CallingConv::AMDGPU_ES:
1016  case CallingConv::AMDGPU_LS:
1017  return RetCC_SI_Shader;
1018  case CallingConv::AMDGPU_Gfx:
1019  return RetCC_SI_Gfx;
1020  case CallingConv::C:
1021  case CallingConv::Fast:
1022  case CallingConv::Cold:
1023  return RetCC_AMDGPU_Func;
1024  default:
1025  report_fatal_error("Unsupported calling convention.");
1026  }
1027 }
1028 
1029 /// The SelectionDAGBuilder will automatically promote function arguments
1030 /// with illegal types. However, this does not work for the AMDGPU targets
1031 /// since the function arguments are stored in memory as these illegal types.
1032 /// In order to handle this properly we need to get the original type sizes
1033 /// from the LLVM IR Function and fix up the ISD::InputArg values before
1034 /// passing them to AnalyzeFormalArguments()
1035 
1036 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1037 /// input values across multiple registers. Each item in the Ins array
1038 /// represents a single value that will be stored in registers. Ins[x].VT is
1039 /// the value type of the value that will be stored in the register, so
1040 /// whatever SDNode we lower the argument to needs to be this type.
1041 ///
1042 /// In order to correctly lower the arguments we need to know the size of each
1043 /// argument. Since Ins[x].VT gives us the size of the register that will
1044 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1045 /// for the original function argument so that we can deduce the correct memory
1046 /// type to use for Ins[x]. In most cases the correct memory type will be
1047 /// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1048 /// we have a kernel argument of type v8i8, this argument will be split into
1049 /// 8 parts and each part will be represented by its own item in the Ins array.
1050 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1051 /// the argument before it was split. From this, we deduce that the memory type
1052 /// for each individual part is i8. We pass the memory type as LocVT to the
1053 /// calling convention analysis function and the register type (Ins[x].VT) as
1054 /// the ValVT.
1055 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1056  CCState &State,
1057  const SmallVectorImpl<ISD::InputArg> &Ins) const {
1058  const MachineFunction &MF = State.getMachineFunction();
1059  const Function &Fn = MF.getFunction();
1060  LLVMContext &Ctx = Fn.getParent()->getContext();
1062  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
1063  CallingConv::ID CC = Fn.getCallingConv();
1064 
1065  Align MaxAlign = Align(1);
1066  uint64_t ExplicitArgOffset = 0;
1067  const DataLayout &DL = Fn.getParent()->getDataLayout();
1068 
1069  unsigned InIndex = 0;
1070 
1071  for (const Argument &Arg : Fn.args()) {
1072  const bool IsByRef = Arg.hasByRefAttr();
1073  Type *BaseArgTy = Arg.getType();
1074  Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1075  MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
1076  if (!Alignment)
1077  Alignment = DL.getABITypeAlign(MemArgTy);
1078  MaxAlign = max(Alignment, MaxAlign);
1079  uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1080 
1081  uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1082  ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1083 
1084  // We're basically throwing away everything passed into us and starting over
1085  // to get accurate in-memory offsets. The "PartOffset" is completely useless
1086  // to us as computed in Ins.
1087  //
1088  // We also need to figure out what type legalization is trying to do to get
1089  // the correct memory offsets.
1090 
1091  SmallVector<EVT, 16> ValueVTs;
1092  SmallVector<uint64_t, 16> Offsets;
1093  ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1094 
1095  for (unsigned Value = 0, NumValues = ValueVTs.size();
1096  Value != NumValues; ++Value) {
1097  uint64_t BasePartOffset = Offsets[Value];
1098 
1099  EVT ArgVT = ValueVTs[Value];
1100  EVT MemVT = ArgVT;
1101  MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1102  unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1103 
1104  if (NumRegs == 1) {
1105  // This argument is not split, so the IR type is the memory type.
1106  if (ArgVT.isExtended()) {
1107  // We have an extended type, like i24, so we should just use the
1108  // register type.
1109  MemVT = RegisterVT;
1110  } else {
1111  MemVT = ArgVT;
1112  }
1113  } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1114  ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1115  assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1116  // We have a vector value which has been split into a vector with
1117  // the same scalar type, but fewer elements. This should handle
1118  // all the floating-point vector types.
1119  MemVT = RegisterVT;
1120  } else if (ArgVT.isVector() &&
1121  ArgVT.getVectorNumElements() == NumRegs) {
1122  // This arg has been split so that each element is stored in a separate
1123  // register.
1124  MemVT = ArgVT.getScalarType();
1125  } else if (ArgVT.isExtended()) {
1126  // We have an extended type, like i65.
1127  MemVT = RegisterVT;
1128  } else {
1129  unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1130  assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1131  if (RegisterVT.isInteger()) {
1132  MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1133  } else if (RegisterVT.isVector()) {
1134  assert(!RegisterVT.getScalarType().isFloatingPoint());
1135  unsigned NumElements = RegisterVT.getVectorNumElements();
1136  assert(MemoryBits % NumElements == 0);
1137  // This vector type has been split into another vector type with
1138  // a different elements size.
1139  EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1140  MemoryBits / NumElements);
1141  MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1142  } else {
1143  llvm_unreachable("cannot deduce memory type.");
1144  }
1145  }
1146 
1147  // Convert one element vectors to scalar.
1148  if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1149  MemVT = MemVT.getScalarType();
1150 
1151  // Round up vec3/vec5 argument.
1152  if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1153  assert(MemVT.getVectorNumElements() == 3 ||
1154  MemVT.getVectorNumElements() == 5);
1155  MemVT = MemVT.getPow2VectorType(State.getContext());
1156  } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1157  MemVT = MemVT.getRoundIntegerType(State.getContext());
1158  }
1159 
1160  unsigned PartOffset = 0;
1161  for (unsigned i = 0; i != NumRegs; ++i) {
1162  State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1163  BasePartOffset + PartOffset,
1164  MemVT.getSimpleVT(),
1165  CCValAssign::Full));
1166  PartOffset += MemVT.getStoreSize();
1167  }
1168  }
1169  }
1170 }
1171 
1172 SDValue AMDGPUTargetLowering::LowerReturn(
1173  SDValue Chain, CallingConv::ID CallConv,
1174  bool isVarArg,
1175  const SmallVectorImpl<ISD::OutputArg> &Outs,
1176  const SmallVectorImpl<SDValue> &OutVals,
1177  const SDLoc &DL, SelectionDAG &DAG) const {
1178  // FIXME: Fails for r600 tests
1179  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1180  // "wave terminate should not have return values");
1181  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1182 }
1183 
1184 //===---------------------------------------------------------------------===//
1185 // Target specific lowering
1186 //===---------------------------------------------------------------------===//
1187 
1188 /// Selects the correct CCAssignFn for a given CallingConvention value.
1189 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1190  bool IsVarArg) {
1191  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1192 }
1193 
1194 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1195  bool IsVarArg) {
1196  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1197 }
1198 
1199 SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1200  SelectionDAG &DAG,
1201  MachineFrameInfo &MFI,
1202  int ClobberedFI) const {
1203  SmallVector<SDValue, 8> ArgChains;
1204  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1205  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1206 
1207  // Include the original chain at the beginning of the list. When this is
1208  // used by target LowerCall hooks, this helps legalize find the
1209  // CALLSEQ_BEGIN node.
1210  ArgChains.push_back(Chain);
1211 
1212  // Add a chain value for each stack argument.
1213  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1214  UE = DAG.getEntryNode().getNode()->use_end();
1215  U != UE; ++U) {
1216  if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
1217  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1218  if (FI->getIndex() < 0) {
1219  int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1220  int64_t InLastByte = InFirstByte;
1221  InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1222 
1223  if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1224  (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1225  ArgChains.push_back(SDValue(L, 1));
1226  }
1227  }
1228  }
1229  }
1230 
1231  // Build a tokenfactor for all the chains.
1232  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1233 }
1234 
1235 SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1236  SmallVectorImpl<SDValue> &InVals,
1237  StringRef Reason) const {
1238  SDValue Callee = CLI.Callee;
1239  SelectionDAG &DAG = CLI.DAG;
1240 
1241  const Function &Fn = DAG.getMachineFunction().getFunction();
1242 
1243  StringRef FuncName("<unknown>");
1244 
1245  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1246  FuncName = G->getSymbol();
1247  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1248  FuncName = G->getGlobal()->getName();
1249 
1250  DiagnosticInfoUnsupported NoCalls(
1251  Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1252  DAG.getContext()->diagnose(NoCalls);
1253 
1254  if (!CLI.IsTailCall) {
1255  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1256  InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1257  }
1258 
1259  return DAG.getEntryNode();
1260 }
1261 
1262 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1263  SmallVectorImpl<SDValue> &InVals) const {
1264  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1265 }
1266 
1267 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1268  SelectionDAG &DAG) const {
1269  const Function &Fn = DAG.getMachineFunction().getFunction();
1270 
1271  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1272  SDLoc(Op).getDebugLoc());
1273  DAG.getContext()->diagnose(NoDynamicAlloca);
1274  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1275  return DAG.getMergeValues(Ops, SDLoc());
1276 }
1277 
1278 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1279  SelectionDAG &DAG) const {
1280  switch (Op.getOpcode()) {
1281  default:
1282  Op->print(errs(), &DAG);
1283  llvm_unreachable("Custom lowering code for this "
1284  "instruction is not implemented yet!");
1285  break;
1286  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1287  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1288  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1289  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1290  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1291  case ISD::FREM: return LowerFREM(Op, DAG);
1292  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1293  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1294  case ISD::FRINT: return LowerFRINT(Op, DAG);
1295  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1296  case ISD::FROUND: return LowerFROUND(Op, DAG);
1297  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1298  case ISD::FLOG:
1299  return LowerFLOG(Op, DAG, numbers::ln2f);
1300  case ISD::FLOG10:
1301  return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1302  case ISD::FEXP:
1303  return lowerFEXP(Op, DAG);
1304  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1305  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1306  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1307  case ISD::FP_TO_SINT:
1308  case ISD::FP_TO_UINT:
1309  return LowerFP_TO_INT(Op, DAG);
1310  case ISD::CTTZ:
1311  case ISD::CTTZ_ZERO_UNDEF:
1312  case ISD::CTLZ:
1313  case ISD::CTLZ_ZERO_UNDEF:
1314  return LowerCTLZ_CTTZ(Op, DAG);
1315  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1316  }
1317  return Op;
1318 }
1319 
1320 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1321  SmallVectorImpl<SDValue> &Results,
1322  SelectionDAG &DAG) const {
1323  switch (N->getOpcode()) {
1324  case ISD::SIGN_EXTEND_INREG:
1325  // Different parts of legalization seem to interpret which type of
1326  // sign_extend_inreg is the one to check for custom lowering. The extended
1327  // from type is what really matters, but some places check for custom
1328  // lowering of the result type. This results in trying to use
1329  // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1330  // nothing here and let the illegal result integer be handled normally.
1331  return;
1332  default:
1333  return;
1334  }
1335 }
1336 
1337 static bool hasDefinedInitializer(const GlobalValue *GV) {
1338  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
1339  if (!GVar || !GVar->hasInitializer())
1340  return false;
1341 
1342  return !isa<UndefValue>(GVar->getInitializer());
1343 }
1344 
1345 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
1346  SDValue Op,
1347  SelectionDAG &DAG) const {
1348 
1349  const DataLayout &DL = DAG.getDataLayout();
1350  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1351  const GlobalValue *GV = G->getGlobal();
1352 
1353  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1354  G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1355  if (!MFI->isModuleEntryFunction() &&
1356  !GV->getName().equals("llvm.amdgcn.module.lds")) {
1357  SDLoc DL(Op);
1358  const Function &Fn = DAG.getMachineFunction().getFunction();
1359  DiagnosticInfoUnsupported BadLDSDecl(
1360  Fn, "local memory global used by non-kernel function",
1361  DL.getDebugLoc(), DS_Warning);
1362  DAG.getContext()->diagnose(BadLDSDecl);
1363 
1364  // We currently don't have a way to correctly allocate LDS objects that
1365  // aren't directly associated with a kernel. We do force inlining of
1366  // functions that use local objects. However, if these dead functions are
1367  // not eliminated, we don't want a compile time error. Just emit a warning
1368  // and a trap, since there should be no callable path here.
1369  SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1370  SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1371  Trap, DAG.getRoot());
1372  DAG.setRoot(OutputChain);
1373  return DAG.getUNDEF(Op.getValueType());
1374  }
1375 
1376  // XXX: What does the value of G->getOffset() mean?
1377  assert(G->getOffset() == 0 &&
1378  "Do not know what to do with an non-zero offset");
1379 
1380  // TODO: We could emit code to handle the initialization somewhere.
1381  // We ignore the initializer for now and legalize it to allow selection.
1382  // The initializer will anyway get errored out during assembly emission.
1383  unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1384  return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1385  }
1386  return SDValue();
1387 }
1388 
1389 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1390  SelectionDAG &DAG) const {
1391  SmallVector<SDValue, 8> Args;
1392 
1393  EVT VT = Op.getValueType();
1394  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1395  SDLoc SL(Op);
1396  SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1397  SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1398 
1399  SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1400  return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1401  }
1402 
1403  for (const SDUse &U : Op->ops())
1404  DAG.ExtractVectorElements(U.get(), Args);
1405 
1406  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1407 }
1408 
1409 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1410  SelectionDAG &DAG) const {
1411 
1412  SmallVector<SDValue, 8> Args;
1413  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1414  EVT VT = Op.getValueType();
1415  EVT SrcVT = Op.getOperand(0).getValueType();
1416 
1417  // For these types, we have some TableGen patterns except if the index is 1
1418  if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
1419  (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
1420  Start != 1)
1421  return Op;
1422 
1423  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1424  VT.getVectorNumElements());
1425 
1426  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1427 }
1428 
1429 /// Generate Min/Max node
1430 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1431  SDValue LHS, SDValue RHS,
1432  SDValue True, SDValue False,
1433  SDValue CC,
1434  DAGCombinerInfo &DCI) const {
1435  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1436  return SDValue();
1437 
1438  SelectionDAG &DAG = DCI.DAG;
1439  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1440  switch (CCOpcode) {
1441  case ISD::SETOEQ:
1442  case ISD::SETONE:
1443  case ISD::SETUNE:
1444  case ISD::SETNE:
1445  case ISD::SETUEQ:
1446  case ISD::SETEQ:
1447  case ISD::SETFALSE:
1448  case ISD::SETFALSE2:
1449  case ISD::SETTRUE:
1450  case ISD::SETTRUE2:
1451  case ISD::SETUO:
1452  case ISD::SETO:
1453  break;
1454  case ISD::SETULE:
1455  case ISD::SETULT: {
1456  if (LHS == True)
1457  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1458  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1459  }
1460  case ISD::SETOLE:
1461  case ISD::SETOLT:
1462  case ISD::SETLE:
1463  case ISD::SETLT: {
1464  // Ordered. Assume ordered for undefined.
1465 
1466  // Only do this after legalization to avoid interfering with other combines
1467  // which might occur.
1468  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1469  !DCI.isCalledByLegalizer())
1470  return SDValue();
1471 
1472  // We need to permute the operands to get the correct NaN behavior. The
1473  // selected operand is the second one based on the failing compare with NaN,
1474  // so permute it based on the compare type the hardware uses.
1475  if (LHS == True)
1476  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1477  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1478  }
1479  case ISD::SETUGE:
1480  case ISD::SETUGT: {
1481  if (LHS == True)
1482  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1483  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1484  }
1485  case ISD::SETGT:
1486  case ISD::SETGE:
1487  case ISD::SETOGE:
1488  case ISD::SETOGT: {
1489  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1490  !DCI.isCalledByLegalizer())
1491  return SDValue();
1492 
1493  if (LHS == True)
1494  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1495  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1496  }
1497  case ISD::SETCC_INVALID:
1498  llvm_unreachable("Invalid setcc condcode!");
1499  }
1500  return SDValue();
1501 }
1502 
1503 std::pair<SDValue, SDValue>
1504 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1505  SDLoc SL(Op);
1506 
1507  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1508 
1509  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1510  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1511 
1512  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1513  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1514 
1515  return std::make_pair(Lo, Hi);
1516 }
1517 
1518 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1519  SDLoc SL(Op);
1520 
1521  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1522  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1523  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1524 }
1525 
1526 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1527  SDLoc SL(Op);
1528 
1529  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1530  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1531  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1532 }
1533 
1534 // Split a vector type into two parts. The first part is a power of two vector.
1535 // The second part is whatever is left over, and is a scalar if it would
1536 // otherwise be a 1-vector.
1537 std::pair<EVT, EVT>
1538 AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1539  EVT LoVT, HiVT;
1540  EVT EltVT = VT.getVectorElementType();
1541  unsigned NumElts = VT.getVectorNumElements();
1542  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1543  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1544  HiVT = NumElts - LoNumElts == 1
1545  ? EltVT
1546  : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1547  return std::make_pair(LoVT, HiVT);
1548 }
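// e.g. v3f32 splits into (v2f32, f32), v5i32 splits into (v4i32, i32), and
// an even power of two such as v4i32 splits into (v2i32, v2i32).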
1549 
1550 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1551 // scalar.
1552 std::pair<SDValue, SDValue>
1553 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1554  const EVT &LoVT, const EVT &HiVT,
1555  SelectionDAG &DAG) const {
1556  assert(LoVT.getVectorNumElements() +
1557  (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1558  N.getValueType().getVectorNumElements() &&
1559  "More vector elements requested than available!");
1560  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1561  DAG.getVectorIdxConstant(0, DL));
1562  SDValue Hi = DAG.getNode(
1563  HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1564  HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1565  return std::make_pair(Lo, Hi);
1566 }
1567 
1568 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1569  SelectionDAG &DAG) const {
1570  LoadSDNode *Load = cast<LoadSDNode>(Op);
1571  EVT VT = Op.getValueType();
1572  SDLoc SL(Op);
1573 
1574 
1575  // If this is a 2 element vector, we really want to scalarize and not create
1576  // weird 1 element vectors.
1577  if (VT.getVectorNumElements() == 2) {
1578  SDValue Ops[2];
1579  std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1580  return DAG.getMergeValues(Ops, SL);
1581  }
1582 
1583  SDValue BasePtr = Load->getBasePtr();
1584  EVT MemVT = Load->getMemoryVT();
1585 
1586  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1587 
1588  EVT LoVT, HiVT;
1589  EVT LoMemVT, HiMemVT;
1590  SDValue Lo, Hi;
1591 
1592  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1593  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1594  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1595 
1596  unsigned Size = LoMemVT.getStoreSize();
1597  unsigned BaseAlign = Load->getAlignment();
1598  unsigned HiAlign = MinAlign(BaseAlign, Size);
1599 
1600  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1601  Load->getChain(), BasePtr, SrcValue, LoMemVT,
1602  BaseAlign, Load->getMemOperand()->getFlags());
1603  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
1604  SDValue HiLoad =
1605  DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1606  HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1607  HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1608 
1609  SDValue Join;
1610  if (LoVT == HiVT) {
1611  // This is the case that the vector is power of two so was evenly split.
1612  Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1613  } else {
1614  Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1615  DAG.getVectorIdxConstant(0, SL));
1616  Join = DAG.getNode(
1617  HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1618  VT, Join, HiLoad,
1619  DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1620  }
1621 
1622  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1623  LoLoad.getValue(1), HiLoad.getValue(1))};
1624 
1625  return DAG.getMergeValues(Ops, SL);
1626 }
1627 
1628 SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1629  SelectionDAG &DAG) const {
1630  LoadSDNode *Load = cast<LoadSDNode>(Op);
1631  EVT VT = Op.getValueType();
1632  SDValue BasePtr = Load->getBasePtr();
1633  EVT MemVT = Load->getMemoryVT();
1634  SDLoc SL(Op);
1635  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1636  unsigned BaseAlign = Load->getAlignment();
1637  unsigned NumElements = MemVT.getVectorNumElements();
1638 
1639  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1640  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1641  if (NumElements != 3 ||
1642  (BaseAlign < 8 &&
1643  !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1644  return SplitVectorLoad(Op, DAG);
1645 
1646  assert(NumElements == 3);
1647 
1648  EVT WideVT =
1649  EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1650  EVT WideMemVT =
1651  EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1652  SDValue WideLoad = DAG.getExtLoad(
1653  Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1654  WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1655  return DAG.getMergeValues(
1656  {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1657  DAG.getVectorIdxConstant(0, SL)),
1658  WideLoad.getValue(1)},
1659  SL);
1660 }
1661 
1662 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1663  SelectionDAG &DAG) const {
1664  StoreSDNode *Store = cast<StoreSDNode>(Op);
1665  SDValue Val = Store->getValue();
1666  EVT VT = Val.getValueType();
1667 
1668  // If this is a 2 element vector, we really want to scalarize and not create
1669  // weird 1 element vectors.
1670  if (VT.getVectorNumElements() == 2)
1671  return scalarizeVectorStore(Store, DAG);
1672 
1673  EVT MemVT = Store->getMemoryVT();
1674  SDValue Chain = Store->getChain();
1675  SDValue BasePtr = Store->getBasePtr();
1676  SDLoc SL(Op);
1677 
1678  EVT LoVT, HiVT;
1679  EVT LoMemVT, HiMemVT;
1680  SDValue Lo, Hi;
1681 
1682  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1683  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1684  std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1685 
1686  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1687 
1688  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1689  unsigned BaseAlign = Store->getAlignment();
1690  unsigned Size = LoMemVT.getStoreSize();
1691  unsigned HiAlign = MinAlign(BaseAlign, Size);
1692 
1693  SDValue LoStore =
1694  DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1695  Store->getMemOperand()->getFlags());
1696  SDValue HiStore =
1697  DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1698  HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1699 
1700  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1701 }
1702 
1703 // This is a shortcut for integer division because we have fast i32<->f32
1704 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1705 // float is enough to accurately represent up to a 24-bit signed integer.
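// In scalar form, the unsigned path below is roughly:
//   float fa = (float)x, fb = (float)y;
//   float fq = trunc(fa * rcp(fb));          // quotient estimate
//   float fr = fabs(fma(-fq, fb, fa));       // |x - fq*y|
//   uint  q  = (uint)fq + (fr >= fabs(fb));  // correct a one-off underestimate
//   uint  r  = x - q * y;
// e.g. 100/7: fq = trunc(100 * rcp(7)) = 14.0 and fr = |100 - 98| = 2 < 7,
// so no correction is applied and q == 14, r == 2.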
1706 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1707  bool Sign) const {
1708  SDLoc DL(Op);
1709  EVT VT = Op.getValueType();
1710  SDValue LHS = Op.getOperand(0);
1711  SDValue RHS = Op.getOperand(1);
1712  MVT IntVT = MVT::i32;
1713  MVT FltVT = MVT::f32;
1714 
1715  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1716  if (LHSSignBits < 9)
1717  return SDValue();
1718 
1719  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1720  if (RHSSignBits < 9)
1721  return SDValue();
1722 
1723  unsigned BitSize = VT.getSizeInBits();
1724  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1725  unsigned DivBits = BitSize - SignBits;
1726  if (Sign)
1727  ++DivBits;
1728 
1729  unsigned ToFp = Sign ? (unsigned)ISD::SINT_TO_FP : (unsigned)ISD::UINT_TO_FP;
1730  unsigned ToInt = Sign ? (unsigned)ISD::FP_TO_SINT : (unsigned)ISD::FP_TO_UINT;
1731 
1732  SDValue jq = DAG.getConstant(1, DL, IntVT);
1733 
1734  if (Sign) {
1735  // char|short jq = ia ^ ib;
1736  jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1737 
1738  // jq = jq >> (bitsize - 2)
1739  jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1740  DAG.getConstant(BitSize - 2, DL, VT));
1741 
1742  // jq = jq | 0x1
1743  jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1744  }
1745 
1746  // int ia = (int)LHS;
1747  SDValue ia = LHS;
1748 
1749  // int ib = (int)RHS;
1750  SDValue ib = RHS;
1751 
1752  // float fa = (float)ia;
1753  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1754 
1755  // float fb = (float)ib;
1756  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1757 
1758  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1759  fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1760 
1761  // fq = trunc(fq);
1762  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1763 
1764  // float fqneg = -fq;
1765  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1766 
1767  MachineFunction &MF = DAG.getMachineFunction();
1768  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1769 
1770  // float fr = mad(fqneg, fb, fa);
1771  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
1772  (unsigned)ISD::FMA :
1773  !MFI->getMode().allFP32Denormals() ?
1774  (unsigned)ISD::FMAD :
1775  (unsigned)AMDGPUISD::FMAD_FTZ;
1776  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1777 
1778  // int iq = (int)fq;
1779  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1780 
1781  // fr = fabs(fr);
1782  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1783 
1784  // fb = fabs(fb);
1785  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1786 
1787  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1788 
1789  // int cv = fr >= fb;
1790  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1791 
1792  // jq = (cv ? jq : 0);
1793  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1794 
1795  // dst = iq + jq;
1796  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1797 
1798  // Rem needs compensation; it's easier to recompute it.
1799  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1800  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1801 
1802  // Truncate to number of bits this divide really is.
1803  if (Sign) {
1804  SDValue InRegSize
1805  = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1806  Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1807  Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1808  } else {
1809  SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1810  Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1811  Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1812  }
1813 
1814  return DAG.getMergeValues({ Div, Rem }, DL);
1815 }
1816 
1817 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1818  SelectionDAG &DAG,
1819  SmallVectorImpl<SDValue> &Results) const {
1820  SDLoc DL(Op);
1821  EVT VT = Op.getValueType();
1822 
1823  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1824 
1825  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1826 
1827  SDValue One = DAG.getConstant(1, DL, HalfVT);
1828  SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1829 
1830  //HiLo split
1831  SDValue LHS = Op.getOperand(0);
1832  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1833  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1834 
1835  SDValue RHS = Op.getOperand(1);
1836  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1837  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1838 
1839  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1840  DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1841 
1842  SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1843  LHS_Lo, RHS_Lo);
1844 
1845  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1846  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1847 
1848  Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1849  Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1850  return;
1851  }
1852 
1853  if (isTypeLegal(MVT::i64)) {
1854  MachineFunction &MF = DAG.getMachineFunction();
1855  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1856 
1857  // Compute denominator reciprocal.
1858  unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
1859  (unsigned)ISD::FMA :
1860  !MFI->getMode().allFP32Denormals() ?
1861  (unsigned)ISD::FMAD :
1862  (unsigned)AMDGPUISD::FMAD_FTZ;
1863 
1864  SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1865  SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1866  SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1867  DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1868  Cvt_Lo);
1869  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1870  SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1871  DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1872  SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1873  DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1874  SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1875  SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1876  DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1877  Mul1);
1878  SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1879  SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1880  SDValue Rcp64 = DAG.getBitcast(VT,
1881  DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1882 
1883  SDValue Zero64 = DAG.getConstant(0, DL, VT);
1884  SDValue One64 = DAG.getConstant(1, DL, VT);
1885  SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1886  SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1887 
1888  SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1889  SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1890  SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1891  SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1892  Zero);
1893  SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1894  One);
1895 
1896  SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1897  Mulhi1_Lo, Zero1);
1898  SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1899  Mulhi1_Hi, Add1_Lo.getValue(1));
1900  SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
1901  SDValue Add1 = DAG.getBitcast(VT,
1902  DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1903 
1904  SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1905  SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1906  SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1907  Zero);
1908  SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1909  One);
1910 
1911  SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1912  Mulhi2_Lo, Zero1);
1913  SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
1914  Mulhi2_Hi, Add1_Lo.getValue(1));
1915  SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
1916  Zero, Add2_Lo.getValue(1));
1917  SDValue Add2 = DAG.getBitcast(VT,
1918  DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1919  SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1920 
1921  SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1922 
1923  SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1924  SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1925  SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1926  Mul3_Lo, Zero1);
1927  SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1928  Mul3_Hi, Sub1_Lo.getValue(1));
1929  SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1930  SDValue Sub1 = DAG.getBitcast(VT,
1931  DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1932 
1933  SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1934  SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1935  ISD::SETUGE);
1936  SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1937  ISD::SETUGE);
1938  SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1939 
1940  // TODO: Here and below portions of the code can be enclosed into if/endif.
1941  // Currently control flow is unconditional and we have 4 selects after
1942  // potential endif to substitute PHIs.
1943 
1944  // if C3 != 0 ...
1945  SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1946  RHS_Lo, Zero1);
1947  SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1948  RHS_Hi, Sub1_Lo.getValue(1));
1949  SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1950  Zero, Sub2_Lo.getValue(1));
1951  SDValue Sub2 = DAG.getBitcast(VT,
1952  DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1953 
1954  SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1955 
1956  SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1957  ISD::SETUGE);
1958  SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1959  ISD::SETUGE);
1960  SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1961 
1962  // if (C6 != 0)
1963  SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1964 
1965  SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1966  RHS_Lo, Zero1);
1967  SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1968  RHS_Hi, Sub2_Lo.getValue(1));
1969  SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1970  Zero, Sub3_Lo.getValue(1));
1971  SDValue Sub3 = DAG.getBitcast(VT,
1972  DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1973 
1974  // endif C6
1975  // endif C3
1976 
1977  SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1978  SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1979 
1980  SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1981  SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1982 
1983  Results.push_back(Div);
1984  Results.push_back(Rem);
1985 
1986  return;
1987  }
1988 
1989  // r600 expansion.
1990  // Get Speculative values
1991  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1992  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1993 
1994  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1995  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1996  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1997 
1998  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1999  SDValue DIV_Lo = Zero;
2000 
2001  const unsigned halfBitWidth = HalfVT.getSizeInBits();
2002 
2003  for (unsigned i = 0; i < halfBitWidth; ++i) {
2004  const unsigned bitPos = halfBitWidth - i - 1;
2005  SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2006  // Get value of high bit
2007  SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2008  HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2009  HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2010 
2011  // Shift
2012  REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2013  // Add LHS high bit
2014  REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2015 
2016  SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2017  SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2018 
2019  DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2020 
2021  // Update REM
2022  SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2023  REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2024  }
2025 
2026  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2027  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2028  Results.push_back(DIV);
2029  Results.push_back(REM);
2030 }
2031 
2032 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2033  SelectionDAG &DAG) const {
2034  SDLoc DL(Op);
2035  EVT VT = Op.getValueType();
2036 
2037  if (VT == MVT::i64) {
2038  SmallVector<SDValue, 2> Results;
2039  LowerUDIVREM64(Op, DAG, Results);
2040  return DAG.getMergeValues(Results, DL);
2041  }
2042 
2043  if (VT == MVT::i32) {
2044  if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2045  return Res;
2046  }
2047 
2048  SDValue X = Op.getOperand(0);
2049  SDValue Y = Op.getOperand(1);
2050 
2051  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2052  // algorithm used here.
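// In scalar form, the scheme used below is roughly:
//   z  = urecip(y);              // initial estimate of 2^32 / y
//   z += mulhi(z, z * -y);       // one Newton-Raphson refinement step
//   q  = mulhi(x, z);            // quotient estimate, may be up to 2 too low
//   r  = x - q * y;
//   if (r >= y) { ++q; r -= y; } // first refinement
//   if (r >= y) { ++q; r -= y; } // second refinement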
2053 
2054  // Initial estimate of inv(y).
2055  SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2056 
2057  // One round of UNR.
2058  SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2059  SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2060  Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2061  DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2062 
2063  // Quotient/remainder estimate.
2064  SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2065  SDValue R =
2066  DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2067 
2068  // First quotient/remainder refinement.
2069  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2070  SDValue One = DAG.getConstant(1, DL, VT);
2071  SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2072  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2073  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2074  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2075  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2076 
2077  // Second quotient/remainder refinement.
2078  Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2079  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2080  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2081  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2082  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2083 
2084  return DAG.getMergeValues({Q, R}, DL);
2085 }
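// A C-style sketch of the 32-bit path above (mirroring the algorithm in
// AMDGPUCodeGenPrepare::expandDivRem32); urecip() and mulhu() stand in for
// AMDGPUISD::URECIP and ISD::MULHU:
//
//   uint32_t z = urecip(y);              // initial estimate of 2^32 / y
//   z += mulhu(z, -y * z);               // one Newton-Raphson refinement
//   uint32_t q = mulhu(x, z);            // quotient estimate, low by at most 2
//   uint32_t r = x - q * y;
//   if (r >= y) { ++q; r -= y; }         // first correction
//   if (r >= y) { ++q; r -= y; }         // second correction
//   // q == x / y, r == x % y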
2086 
2087 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2088  SelectionDAG &DAG) const {
2089  SDLoc DL(Op);
2090  EVT VT = Op.getValueType();
2091 
2092  SDValue LHS = Op.getOperand(0);
2093  SDValue RHS = Op.getOperand(1);
2094 
2095  SDValue Zero = DAG.getConstant(0, DL, VT);
2096  SDValue NegOne = DAG.getConstant(-1, DL, VT);
2097 
2098  if (VT == MVT::i32) {
2099  if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2100  return Res;
2101  }
2102 
2103  if (VT == MVT::i64 &&
2104  DAG.ComputeNumSignBits(LHS) > 32 &&
2105  DAG.ComputeNumSignBits(RHS) > 32) {
2106  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2107 
2108  // Hi/Lo split
2109  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2110  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2111  SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2112  LHS_Lo, RHS_Lo);
2113  SDValue Res[2] = {
2114  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2115  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2116  };
2117  return DAG.getMergeValues(Res, DL);
2118  }
2119 
2120  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2121  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2122  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2123  SDValue RSign = LHSign; // Remainder sign is the same as LHS
2124 
2125  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2126  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2127 
2128  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2129  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2130 
2131  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2132  SDValue Rem = Div.getValue(1);
2133 
2134  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2135  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2136 
2137  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2138  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2139 
2140  SDValue Res[2] = {
2141  Div,
2142  Rem
2143  };
2144  return DAG.getMergeValues(Res, DL);
2145 }
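// The add/xor pairs compute |v| branchlessly: with s = v >> 63 (all ones for
// negative v, zero otherwise), (v + s) ^ s equals |v|, and (u ^ s) - s undoes
// it. For example, LHS = -7, RHS = 3: LHSign = -1, RHSign = 0, so
// DSign = RSign = -1; the operands become 7 and 3, UDIVREM yields (2, 1), and
// flipping back gives Div = (2 ^ -1) - (-1) = -2 and
// Rem = (1 ^ -1) - (-1) = -1, matching C semantics: -7 = (-2) * 3 + (-1).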
2146 
2147 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2148 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2149  SDLoc SL(Op);
2150  EVT VT = Op.getValueType();
2151  auto Flags = Op->getFlags();
2152  SDValue X = Op.getOperand(0);
2153  SDValue Y = Op.getOperand(1);
2154 
2155  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2156  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2157  SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2158  // TODO: For f32 use FMAD instead if !hasFastFMA32?
2159  return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2160 }
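// For example, frem(5.5, 2.0): fdiv gives 2.75, ftrunc gives 2.0, and
// fma(-2.0, 2.0, 5.5) = 5.5 - 4.0 = 1.5, the remainder with the sign of x.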
2161 
2162 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2163  SDLoc SL(Op);
2164  SDValue Src = Op.getOperand(0);
2165 
2166  // result = trunc(src)
2167  // if (src > 0.0 && src != result)
2168  // result += 1.0
2169 
2170  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2171 
2172  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2173  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2174 
2175  EVT SetCCVT =
2176  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2177 
2178  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2179  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2180  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2181 
2182  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2183  // TODO: Should this propagate fast-math-flags?
2184  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2185 }
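// For example, ceil(2.3): trunc = 2.0, and since 2.3 > 0.0 and 2.3 != 2.0,
// the result is 2.0 + 1.0 = 3.0. For ceil(-2.3), trunc = -2.0 already is the
// ceiling, so the select contributes 0.0.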
2186 
2187 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2188  SelectionDAG &DAG) {
2189  const unsigned FractBits = 52;
2190  const unsigned ExpBits = 11;
2191 
2192  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2193  Hi,
2194  DAG.getConstant(FractBits - 32, SL, MVT::i32),
2195  DAG.getConstant(ExpBits, SL, MVT::i32));
2196  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2197  DAG.getConstant(1023, SL, MVT::i32));
2198 
2199  return Exp;
2200 }
2201 
2202 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2203  SDLoc SL(Op);
2204  SDValue Src = Op.getOperand(0);
2205 
2206  assert(Op.getValueType() == MVT::f64);
2207 
2208  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2209 
2210  // Extract the upper half, since this is where we will find the sign and
2211  // exponent.
2212  SDValue Hi = getHiHalf64(Src, DAG);
2213 
2214  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2215 
2216  const unsigned FractBits = 52;
2217 
2218  // Extract the sign bit.
2219  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2220  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2221 
2222  // Extend back to 64-bits.
2223  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2224  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2225 
2226  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2227  const SDValue FractMask
2228  = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2229 
2230  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2231  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2232  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2233 
2234  EVT SetCCVT =
2235  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2236 
2237  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2238 
2239  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2240  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2241 
2242  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2243  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2244 
2245  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2246 }
2247 
2248 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2249  SDLoc SL(Op);
2250  SDValue Src = Op.getOperand(0);
2251 
2252  assert(Op.getValueType() == MVT::f64);
2253 
2254  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2255  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2256  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2257 
2258  // TODO: Should this propagate fast-math-flags?
2259 
2260  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2261  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2262 
2263  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2264 
2265  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2266  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2267 
2268  EVT SetCCVT =
2269  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2270  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2271 
2272  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2273 }
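// This is the classic 2^52 trick: for |x| < 2^52 the ulp of x + 2^52 is 1.0,
// so the FADD/FSUB pair rounds x to an integer in the current
// (round-to-nearest-even) mode. E.g. rint(2.5): 2.5 + 2^52 is a tie and
// rounds to 2^52 + 2, and subtracting gives 2.0. Values with
// |x| > 0x1.fffffffffffffp+51 are already integral, so the select returns
// them unchanged (this also passes NaNs through).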
2274 
2275 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2276  // FNEARBYINT and FRINT are the same, except in their handling of FP
2277  // exceptions. Those aren't really meaningful for us, and OpenCL only has
2278  // rint, so just treat them as equivalent.
2279  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2280 }
2281 
2282 // XXX - May require not supporting f32 denormals?
2283 
2284 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2285 // compare and vselect end up producing worse code than scalarizing the whole
2286 // operation.
2287 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2288  SDLoc SL(Op);
2289  SDValue X = Op.getOperand(0);
2290  EVT VT = Op.getValueType();
2291 
2292  SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2293 
2294  // TODO: Should this propagate fast-math-flags?
2295 
2296  SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2297 
2298  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2299 
2300  const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2301  const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2302  const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2303 
2304  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2305 
2306  EVT SetCCVT =
2307  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2308 
2309  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2310 
2311  SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2312 
2313  return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2314 }
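// This implements round-half-away-from-zero. E.g. round(2.5): T = 2.0,
// |diff| = 0.5 >= 0.5, so Sel = copysign(1.0, 2.5) = 1.0 and the result is
// 3.0; round(-2.5) symmetrically gives -3.0, whereas rint(2.5) above gives
// 2.0.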
2315 
2316 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2317  SDLoc SL(Op);
2318  SDValue Src = Op.getOperand(0);
2319 
2320  // result = trunc(src);
2321  // if (src < 0.0 && src != result)
2322  // result += -1.0.
2323 
2324  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2325 
2326  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2327  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2328 
2329  EVT SetCCVT =
2330  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2331 
2332  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2333  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2334  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2335 
2336  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2337  // TODO: Should this propagate fast-math-flags?
2338  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2339 }
2340 
2341 SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2342  double Log2BaseInverted) const {
2343  EVT VT = Op.getValueType();
2344 
2345  SDLoc SL(Op);
2346  SDValue Operand = Op.getOperand(0);
2347  SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2348  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2349 
2350  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2351 }
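// This uses log_b(x) = log2(x) * (1 / log2(b)). E.g. the FLOG lowering passes
// Log2BaseInverted = ln(2) ~= 0.693147, since ln(x) = log2(x) * ln(2), and
// FLOG10 passes log10(2) ~= 0.301030.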
2352 
2353 // exp2(M_LOG2E_F * f);
2354 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2355  EVT VT = Op.getValueType();
2356  SDLoc SL(Op);
2357  SDValue Src = Op.getOperand(0);
2358 
2359  const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2360  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2361  return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2362 }
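// For example, exp(1.0) becomes exp2(1.0 * log2(e)) = 2^1.442695 ~= 2.71828,
// i.e. e^1.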
2363 
2364 static bool isCtlzOpc(unsigned Opc) {
2365  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2366 }
2367 
2368 static bool isCttzOpc(unsigned Opc) {
2369  return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2370 }
2371 
2372 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2373  SDLoc SL(Op);
2374  SDValue Src = Op.getOperand(0);
2375 
2376  assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2377  bool Ctlz = isCtlzOpc(Op.getOpcode());
2378  unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2379 
2380  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2381  Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2382 
2383  if (Src.getValueType() == MVT::i32) {
2384  // (ctlz hi:lo) -> (umin (ffbh src), 32)
2385  // (cttz hi:lo) -> (umin (ffbl src), 32)
2386  // (ctlz_zero_undef src) -> (ffbh src)
2387  // (cttz_zero_undef src) -> (ffbl src)
2388  SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2389  if (!ZeroUndef) {
2390  const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2391  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2392  }
2393  return NewOpr;
2394  }
2395 
2396  SDValue Lo, Hi;
2397  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2398 
2399  SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2400  SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2401 
2402  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2403  // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2404  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2405  // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2406 
2407  unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2408  const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2409  if (Ctlz)
2410  OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2411  else
2412  OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2413 
2414  SDValue NewOpr;
2415  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2416  if (!ZeroUndef) {
2417  const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2418  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2419  }
2420 
2421  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2422 }
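// FFBH/FFBL return -1 (all ones) for a zero input, which UADDSAT clamps to
// UINT32_MAX so that UMIN picks the other half. E.g. ctlz(i64 5):
// hi = 0, lo = 5, so ffbh(hi) = 0xffffffff and
// uaddsat(ffbh(lo), 32) = uaddsat(29, 32) = 61;
// umin3(0xffffffff, 61, 64) = 61, the correct count.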
2423 
2424 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2425  bool Signed) const {
2426  // The regular method of converting a 64-bit integer to float roughly consists of
2427  // 2 steps: normalization and rounding. In fact, after normalization, the
2428  // conversion from a 64-bit integer to a float is essentially the same as the
2429  // one from a 32-bit integer. The only difference is that it has more
2430  // trailing bits to be rounded. To leverage the native 32-bit conversion, a
2431  // 64-bit integer could be preprocessed and fit into a 32-bit integer then
2432  // converted into the correct float number. The basic steps for the unsigned
2433  // conversion are illustrated in the following pseudo code:
2434  //
2435  // f32 uitofp(i64 u) {
2436  // i32 hi, lo = split(u);
2437  // // Only count the leading zeros in hi as we have native support of the
2438  // // conversion from i32 to f32. If hi is all 0s, the conversion is
2439  // // reduced to a 32-bit one automatically.
2440  // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
2441  // u <<= shamt;
2442  // hi, lo = split(u);
2443  // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
2444  // // convert it as a 32-bit integer and scale the result back.
2445  // return uitofp(hi) * 2^(32 - shamt);
2446  // }
2447  //
2448  // The signed one follows the same principle but uses 'ffbh_i32' to count its
2449  // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
2450  // converted instead, followed by negation based on its sign bit.
2451 
2452  SDLoc SL(Op);
2453  SDValue Src = Op.getOperand(0);
2454 
2455  SDValue Lo, Hi;
2456  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2457  SDValue Sign;
2458  SDValue ShAmt;
2459  if (Signed && Subtarget->isGCN()) {
2460  // We also need to consider the sign bit in Lo if Hi has just sign bits,
2461  // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
2462  // account. That is, the maximal shift is
2463  // - 32 if Lo and Hi have opposite signs;
2464  // - 33 if Lo and Hi have the same sign.
2465  //
2466  // Or, MaxShAmt = 33 + OppositeSign, where
2467  //
2468  // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
2469  // - -1 if Lo and Hi have opposite signs; and
2470  // - 0 otherwise.
2471  //
2472  // All in all, ShAmt is calculated as
2473  //
2474  // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
2475  //
2476  // or
2477  //
2478  // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
2479  //
2480  // to reduce the critical path.
2481  SDValue OppositeSign = DAG.getNode(
2482  ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
2483  DAG.getConstant(31, SL, MVT::i32));
2484  SDValue MaxShAmt =
2485  DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2486  OppositeSign);
2487  // Count the leading sign bits.
2488  ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
2489  // Different from unsigned conversion, the shift should be one bit less to
2490  // preserve the sign bit.
2491  ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
2492  DAG.getConstant(1, SL, MVT::i32));
2493  ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
2494  } else {
2495  if (Signed) {
2496  // Without 'ffbh_i32', only leading zeros could be counted. Take the
2497  // absolute value first.
2498  Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
2499  DAG.getConstant(63, SL, MVT::i64));
2500  SDValue Abs =
2501  DAG.getNode(ISD::XOR, SL, MVT::i64,
2502  DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
2503  std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
2504  }
2505  // Count the leading zeros.
2506  ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
2507  // The shift amount for signed integers is [0, 32].
2508  }
2509  // Normalize the given 64-bit integer.
2510  SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
2511  // Split it again.
2512  std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
2513  // Calculate the adjust bit for rounding.
2514  // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
2515  SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
2516  DAG.getConstant(1, SL, MVT::i32), Lo);
2517  // Get the 32-bit normalized integer.
2518  Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
2519  // Convert the normalized 32-bit integer into f32.
2520  unsigned Opc =
2521  (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2522  SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
2523 
2524  // Finally, we need to scale back the converted floating-point number, as the
2525  // original 64-bit integer was converted as a 32-bit one.
2526  ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2527  ShAmt);
2528  // On GCN, use LDEXP directly.
2529  if (Subtarget->isGCN())
2530  return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
2531 
2532  // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
2533  // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
2534  // exponent is enough to avoid overflowing into the sign bit.
2535  SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
2536  DAG.getConstant(23, SL, MVT::i32));
2537  SDValue IVal =
2538  DAG.getNode(ISD::ADD, SL, MVT::i32,
2539  DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
2540  if (Signed) {
2541  // Set the sign bit.
2542  Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
2543  DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
2544  DAG.getConstant(31, SL, MVT::i32));
2545  IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
2546  }
2547  return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
2548 }
2549 
2550 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2551  bool Signed) const {
2552  SDLoc SL(Op);
2553  SDValue Src = Op.getOperand(0);
2554 
2555  SDValue Lo, Hi;
2556  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2557 
2558  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2559  SL, MVT::f64, Hi);
2560 
2561  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2562 
2563  SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2564  DAG.getConstant(32, SL, MVT::i32));
2565  // TODO: Should this propagate fast-math-flags?
2566  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2567 }
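// Both 32-bit halves convert to f64 exactly (f64 has a 53-bit mantissa), so
// uitofp(u64) = ldexp(uitofp(hi), 32) + uitofp(lo) with a single rounding in
// the final FADD. E.g. u = 2^40 + 7: hi = 256, lo = 7, and
// ldexp(256.0, 32) + 7.0 = 2^40 + 7 exactly.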
2568 
2569 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2570  SelectionDAG &DAG) const {
2571  // TODO: Factor out code common with LowerSINT_TO_FP.
2572  EVT DestVT = Op.getValueType();
2573  SDValue Src = Op.getOperand(0);
2574  EVT SrcVT = Src.getValueType();
2575 
2576  if (SrcVT == MVT::i16) {
2577  if (DestVT == MVT::f16)
2578  return Op;
2579  SDLoc DL(Op);
2580 
2581  // Promote src to i32
2582  SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2583  return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2584  }
2585 
2586  assert(SrcVT == MVT::i64 && "operation should be legal");
2587 
2588  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2589  SDLoc DL(Op);
2590 
2591  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2592  SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2593  SDValue FPRound =
2594  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2595 
2596  return FPRound;
2597  }
2598 
2599  if (DestVT == MVT::f32)
2600  return LowerINT_TO_FP32(Op, DAG, false);
2601 
2602  assert(DestVT == MVT::f64);
2603  return LowerINT_TO_FP64(Op, DAG, false);
2604 }
2605 
2606 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2607  SelectionDAG &DAG) const {
2608  EVT DestVT = Op.getValueType();
2609 
2610  SDValue Src = Op.getOperand(0);
2611  EVT SrcVT = Src.getValueType();
2612 
2613  if (SrcVT == MVT::i16) {
2614  if (DestVT == MVT::f16)
2615  return Op;
2616 
2617  SDLoc DL(Op);
2618  // Promote src to i32
2619  SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2620  return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2621  }
2622 
2623  assert(SrcVT == MVT::i64 && "operation should be legal");
2624 
2625  // TODO: Factor out code common with LowerUINT_TO_FP.
2626 
2627  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2628  SDLoc DL(Op);
2629  SDValue Src = Op.getOperand(0);
2630 
2631  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2632  SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2633  SDValue FPRound =
2634  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2635 
2636  return FPRound;
2637  }
2638 
2639  if (DestVT == MVT::f32)
2640  return LowerINT_TO_FP32(Op, DAG, true);
2641 
2642  assert(DestVT == MVT::f64);
2643  return LowerINT_TO_FP64(Op, DAG, true);
2644 }
2645 
2646 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2647  bool Signed) const {
2648  SDLoc SL(Op);
2649 
2650  SDValue Src = Op.getOperand(0);
2651  EVT SrcVT = Src.getValueType();
2652 
2653  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2654 
2655  // The basic idea of converting a floating point number into a pair of 32-bit
2656  // integers is illustrated as follows:
2657  //
2658  // tf := trunc(val);
2659  // hif := floor(tf * 2^-32);
2660  // lof := tf - hif * 2^32; // lof is always positive due to floor.
2661  // hi := fptoi(hif);
2662  // lo := fptoi(lof);
2663  //
2664  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2665  SDValue Sign;
2666  if (Signed && SrcVT == MVT::f32) {
2667  // However, a 32-bit floating point number has only a 23-bit mantissa and
2668  // it's not enough to hold all the significant bits of `lof` if val is
2669  // negative. To avoid the loss of precision, we need to take the absolute
2670  // value after truncating and flip the result back based on the original
2671  // signedness.
2672  Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2673  DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2674  DAG.getConstant(31, SL, MVT::i32));
2675  Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2676  }
2677 
2678  SDValue K0, K1;
2679  if (SrcVT == MVT::f64) {
2680  K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
2681  SL, SrcVT);
2682  K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
2683  SL, SrcVT);
2684  } else {
2685  K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
2686  SrcVT);
2687  K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
2688  SrcVT);
2689  }
2690  // TODO: Should this propagate fast-math-flags?
2691  SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2692 
2693  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2694 
2695  SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2696 
2697  SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2698  : ISD::FP_TO_UINT,
2699  SL, MVT::i32, FloorMul);
2700  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2701 
2702  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2703  DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2704 
2705  if (Signed && SrcVT == MVT::f32) {
2706  assert(Sign);
2707  // Flip the result based on the signedness, which is either all 0s or 1s.
2708  Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2709  DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2710  // r := xor(r, sign) - sign;
2711  Result =
2712  DAG.getNode(ISD::SUB, SL, MVT::i64,
2713  DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2714  }
2715 
2716  return Result;
2717 }
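// E.g. fptoui(f64 2^33 + 5.0): tf = 2^33 + 5, hif = floor(tf * 2^-32) = 2.0,
// lof = fma(hif, -2^32, tf) = 5.0, so hi = 2, lo = 5 and the rebuilt i64 is
// 2^33 + 5. Scaling by the power-of-two constants is exact, and the FMA
// rounds only once, so no precision is lost splitting the value.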
2718 
2719 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2720  SDLoc DL(Op);
2721  SDValue N0 = Op.getOperand(0);
2722 
2723  // Convert to target node to get known bits
2724  if (N0.getValueType() == MVT::f32)
2725  return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2726 
2727  if (getTargetMachine().Options.UnsafeFPMath) {
2728  // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2729  return SDValue();
2730  }
2731 
2732  assert(N0.getSimpleValueType() == MVT::f64);
2733 
2734  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2735  const unsigned ExpMask = 0x7ff;
2736  const unsigned ExpBiasf64 = 1023;
2737  const unsigned ExpBiasf16 = 15;
2738  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2739  SDValue One = DAG.getConstant(1, DL, MVT::i32);
2740  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2741  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2742  DAG.getConstant(32, DL, MVT::i64));
2743  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2744  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2745  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2746  DAG.getConstant(20, DL, MVT::i64));
2747  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2748  DAG.getConstant(ExpMask, DL, MVT::i32));
2749  // Subtract the fp64 exponent bias (1023) to get the real exponent and
2750  // add the f16 bias (15) to get the biased exponent for the f16 format.
2751  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2752  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2753 
2754  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2755  DAG.getConstant(8, DL, MVT::i32));
2756  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2757  DAG.getConstant(0xffe, DL, MVT::i32));
2758 
2759  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2760  DAG.getConstant(0x1ff, DL, MVT::i32));
2761  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2762 
2763  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2764  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2765 
2766  // (M != 0 ? 0x0200 : 0) | 0x7c00;
2767  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2768  DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2769  Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2770 
2771  // N = M | (E << 12);
2772  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2773  DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2774  DAG.getConstant(12, DL, MVT::i32)));
2775 
2776  // B = clamp(1-E, 0, 13);
2777  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2778  One, E);
2779  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2780  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2781  DAG.getConstant(13, DL, MVT::i32));
2782 
2783  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2784  DAG.getConstant(0x1000, DL, MVT::i32));
2785 
2786  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2787  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2788  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2789  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2790 
2791  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2792  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2793  DAG.getConstant(0x7, DL, MVT::i32));
2794  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2795  DAG.getConstant(2, DL, MVT::i32));
2796  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2797  One, Zero, ISD::SETEQ);
2798  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2799  One, Zero, ISD::SETGT);
2800  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2801  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2802 
2803  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2804  DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2805  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2806  I, V, ISD::SETEQ);
2807 
2808  // Extract the sign bit.
2809  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2810  DAG.getConstant(16, DL, MVT::i32));
2811  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2812  DAG.getConstant(0x8000, DL, MVT::i32));
2813 
2814  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2815  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2816 }
2817 
2818 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2819  SelectionDAG &DAG) const {
2820  SDValue Src = Op.getOperand(0);
2821  unsigned OpOpcode = Op.getOpcode();
2822  EVT SrcVT = Src.getValueType();
2823  EVT DestVT = Op.getValueType();
2824 
2825  // Will be selected natively
2826  if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2827  return Op;
2828 
2829  // Promote i16 to i32
2830  if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2831  SDLoc DL(Op);
2832 
2833  SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2834  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2835  }
2836 
2837  if (SrcVT == MVT::f16 ||
2838  (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2839  SDLoc DL(Op);
2840 
2841  SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2842  unsigned Ext =
2843  OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2844  return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2845  }
2846 
2847  if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2848  return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2849 
2850  return SDValue();
2851 }
2852 
2853 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2854  SelectionDAG &DAG) const {
2855  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2856  MVT VT = Op.getSimpleValueType();
2857  MVT ScalarVT = VT.getScalarType();
2858 
2859  assert(VT.isVector());
2860 
2861  SDValue Src = Op.getOperand(0);
2862  SDLoc DL(Op);
2863 
2864  // TODO: Don't scalarize on Evergreen?
2865  unsigned NElts = VT.getVectorNumElements();
2866  SmallVector<SDValue, 8> Args;
2867  DAG.ExtractVectorElements(Src, Args, 0, NElts);
2868 
2869  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2870  for (unsigned I = 0; I < NElts; ++I)
2871  Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2872 
2873  return DAG.getBuildVector(VT, DL, Args);
2874 }
2875 
2876 //===----------------------------------------------------------------------===//
2877 // Custom DAG optimizations
2878 //===----------------------------------------------------------------------===//
2879 
2880 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2881  return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2882 }
2883 
2884 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2885  EVT VT = Op.getValueType();
2886  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2887  // as unsigned 24-bit values.
2888  AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
2889 }
2890 
2891 SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
2892  DAGCombinerInfo &DCI) const {
2893  SelectionDAG &DAG = DCI.DAG;
2894  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2895  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2896 
2897  SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2898  SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2899  unsigned NewOpcode = Node24->getOpcode();
2900  if (IsIntrin) {
2901  unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2902  NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
2903  AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2904  }
2905 
2906  APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2907 
2908  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2909  // the operands to have other uses, but will only perform simplifications that
2910  // involve bypassing some nodes for this user.
2911  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2912  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2913  if (DemandedLHS || DemandedRHS)
2914  return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2915  DemandedLHS ? DemandedLHS : LHS,
2916  DemandedRHS ? DemandedRHS : RHS);
2917 
2918  // Now try SimplifyDemandedBits which can simplify the nodes used by our
2919  // operands if this node is the only user.
2920  if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2921  return SDValue(Node24, 0);
2922  if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2923  return SDValue(Node24, 0);
2924 
2925  return SDValue();
2926 }
2927 
2928 template <typename IntTy>
2929 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2930  uint32_t Width, const SDLoc &DL) {
2931  if (Width + Offset < 32) {
2932  uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2933  IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2934  return DAG.getConstant(Result, DL, MVT::i32);
2935  }
2936 
2937  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2938 }
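// The shl/shr pair replicates the hardware bitfield extract, including sign
// extension when IntTy is signed. E.g. Src0 = 0xF0, Offset = 4, Width = 4:
// Shl = 0xF0 << 24 = 0xF0000000; an arithmetic shift right by 28 yields -1
// (the field 0xF sign-extended), while the unsigned variant yields 0xF.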
2939 
2940 static bool hasVolatileUser(SDNode *Val) {
2941  for (SDNode *U : Val->uses()) {
2942  if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2943  if (M->isVolatile())
2944  return true;
2945  }
2946  }
2947 
2948  return false;
2949 }
2950 
2951 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2952  // i32 vectors are the canonical memory type.
2953  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2954  return false;
2955 
2956  if (!VT.isByteSized())
2957  return false;
2958 
2959  unsigned Size = VT.getStoreSize();
2960 
2961  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2962  return false;
2963 
2964  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2965  return false;
2966 
2967  return true;
2968 }
2969 
2970 // Replace a load of an illegal type with a load of a bitcast to a friendlier
2971 // type.
2972 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2973  DAGCombinerInfo &DCI) const {
2974  if (!DCI.isBeforeLegalize())
2975  return SDValue();
2976 
2977  LoadSDNode *LN = cast<LoadSDNode>(N);
2978  if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2979  return SDValue();
2980 
2981  SDLoc SL(N);
2982  SelectionDAG &DAG = DCI.DAG;
2983  EVT VT = LN->getMemoryVT();
2984 
2985  unsigned Size = VT.getStoreSize();
2986  Align Alignment = LN->getAlign();
2987  if (Alignment < Size && isTypeLegal(VT)) {
2988  bool IsFast;
2989  unsigned AS = LN->getAddressSpace();
2990 
2991  // Expand unaligned loads earlier than legalization. Due to visitation order
2992  // problems during legalization, the emitted instructions to pack and unpack
2993  // the bytes again are not eliminated in the case of an unaligned copy.
2994  if (!allowsMisalignedMemoryAccesses(
2995  VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
2996  SDValue Ops[2];
2997 
2998  if (VT.isVector())
2999  std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG);
3000  else
3001  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3002 
3003  return DAG.getMergeValues(Ops, SDLoc(N));
3004  }
3005 
3006  if (!IsFast)
3007  return SDValue();
3008  }
3009 
3010  if (!shouldCombineMemoryType(VT))
3011  return SDValue();
3012 
3013  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3014 
3015  SDValue NewLoad
3016  = DAG.getLoad(NewVT, SL, LN->getChain(),
3017  LN->getBasePtr(), LN->getMemOperand());
3018 
3019  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3020  DCI.CombineTo(N, BC, NewLoad.getValue(1));
3021  return SDValue(N, 0);
3022 }
3023 
3024 // Replace store of an illegal type with a store of a bitcast to a friendlier
3025 // type.
3026 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3027  DAGCombinerInfo &DCI) const {
3028  if (!DCI.isBeforeLegalize())
3029  return SDValue();
3030 
3031  StoreSDNode *SN = cast<StoreSDNode>(N);
3032  if (!SN->isSimple() || !ISD::isNormalStore(SN))
3033  return SDValue();
3034 
3035  EVT VT = SN->getMemoryVT();
3036  unsigned Size = VT.getStoreSize();
3037 
3038  SDLoc SL(N);
3039  SelectionDAG &DAG = DCI.DAG;
3040  Align Alignment = SN->getAlign();
3041  if (Alignment < Size && isTypeLegal(VT)) {
3042  bool IsFast;
3043  unsigned AS = SN->getAddressSpace();
3044 
3045  // Expand unaligned stores earlier than legalization. Due to visitation
3046  // order problems during legalization, the emitted instructions to pack and
3047  // unpack the bytes again are not eliminated in the case of an unaligned
3048  // copy.
3049  if (!allowsMisalignedMemoryAccesses(
3050  VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3051  if (VT.isVector())
3052  return scalarizeVectorStore(SN, DAG);
3053 
3054  return expandUnalignedStore(SN, DAG);
3055  }
3056 
3057  if (!IsFast)
3058  return SDValue();
3059  }
3060 
3061  if (!shouldCombineMemoryType(VT))
3062  return SDValue();
3063 
3064  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3065  SDValue Val = SN->getValue();
3066 
3067  //DCI.AddToWorklist(Val.getNode());
3068 
3069  bool OtherUses = !Val.hasOneUse();
3070  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3071  if (OtherUses) {
3072  SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3073  DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3074  }
3075 
3076  return DAG.getStore(SN->getChain(), SL, CastVal,
3077  SN->getBasePtr(), SN->getMemOperand());
3078 }
3079 
3080 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3081 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3082 // issues.
3083 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3084  DAGCombinerInfo &DCI) const {
3085  SelectionDAG &DAG = DCI.DAG;
3086  SDValue N0 = N->getOperand(0);
3087 
3088  // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3089  // (vt2 (truncate (assertzext vt0:x, vt1)))
3090  if (N0.getOpcode() == ISD::TRUNCATE) {
3091  SDValue N1 = N->getOperand(1);
3092  EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3093  SDLoc SL(N);
3094 
3095  SDValue Src = N0.getOperand(0);
3096  EVT SrcVT = Src.getValueType();
3097  if (SrcVT.bitsGE(ExtVT)) {
3098  SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3099  return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3100  }
3101  }
3102 
3103  return SDValue();
3104 }
3105 
3106 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3107  SDNode *N, DAGCombinerInfo &DCI) const {
3108  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3109  switch (IID) {
3110  case Intrinsic::amdgcn_mul_i24:
3111  case Intrinsic::amdgcn_mul_u24:
3112  return simplifyMul24(N, DCI);
3113  case Intrinsic::amdgcn_fract:
3114  case Intrinsic::amdgcn_rsq:
3115  case Intrinsic::amdgcn_rcp_legacy:
3116  case Intrinsic::amdgcn_rsq_legacy:
3117  case Intrinsic::amdgcn_rsq_clamp:
3118  case Intrinsic::amdgcn_ldexp: {
3119  // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3120  SDValue Src = N->getOperand(1);
3121  return Src.isUndef() ? Src : SDValue();
3122  }
3123  default:
3124  return SDValue();
3125  }
3126 }
3127 
3128 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3129 /// binary operation \p Opc to it with the corresponding constant operands.
3130 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3131  DAGCombinerInfo &DCI, const SDLoc &SL,
3132  unsigned Opc, SDValue LHS,
3133  uint32_t ValLo, uint32_t ValHi) const {
3134  SelectionDAG &DAG = DCI.DAG;
3135  SDValue Lo, Hi;
3136  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3137 
3138  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3139  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3140 
3141  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3142  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3143 
3144  // Re-visit the ands. It's possible we eliminated one of them and it could
3145  // simplify the vector.
3146  DCI.AddToWorklist(Lo.getNode());
3147  DCI.AddToWorklist(Hi.getNode());
3148 
3149  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3150  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3151 }
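// E.g. (and i64:x, 0xFFFF00000000FFFF) becomes
// (and lo_32(x), 0x0000FFFF) and (and hi_32(x), 0xFFFF0000), letting either
// half potentially fold away (all ones for AND, all zeros for OR/XOR).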
3152 
3153 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3154  DAGCombinerInfo &DCI) const {
3155  EVT VT = N->getValueType(0);
3156 
3157  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3158  if (!RHS)
3159  return SDValue();
3160 
3161  SDValue LHS = N->getOperand(0);
3162  unsigned RHSVal = RHS->getZExtValue();
3163  if (!RHSVal)
3164  return LHS;
3165 
3166  SDLoc SL(N);
3167  SelectionDAG &DAG = DCI.DAG;
3168 
3169  switch (LHS->getOpcode()) {
3170  default:
3171  break;
3172  case ISD::ZERO_EXTEND:
3173  case ISD::SIGN_EXTEND:
3174  case ISD::ANY_EXTEND: {
3175  SDValue X = LHS->getOperand(0);
3176 
3177  if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3178  isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3179  // Prefer build_vector as the canonical form if packed types are legal.
3180  // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3181  SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3182  { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3183  return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3184  }
3185 
3186  // shl (ext x) => zext (shl x), if shift does not overflow int
3187  if (VT != MVT::i64)
3188  break;
3189  KnownBits Known = DAG.computeKnownBits(X);
3190  unsigned LZ = Known.countMinLeadingZeros();
3191  if (LZ < RHSVal)
3192  break;
3193  EVT XVT = X.getValueType();
3194  SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3195  return DAG.getZExtOrTrunc(Shl, SL, VT);
3196  }
3197  }
3198 
3199  if (VT != MVT::i64)
3200  return SDValue();
3201 
3202  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3203 
3204  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3205  // common case, splitting this into a move and a 32-bit shift is faster and
3206  // the same code size.
3207  if (RHSVal < 32)
3208  return SDValue();
3209 
3210  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3211 
3212  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3213  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3214 
3215  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3216 
3217  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3218  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3219 }
3220 
3221 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3222  DAGCombinerInfo &DCI) const {
3223  if (N->getValueType(0) != MVT::i64)
3224  return SDValue();
3225 
3226  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3227  if (!RHS)
3228  return SDValue();
3229 
3230  SelectionDAG &DAG = DCI.DAG;
3231  SDLoc SL(N);
3232  unsigned RHSVal = RHS->getZExtValue();
3233 
3234  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3235  if (RHSVal == 32) {
3236  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3237  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3238  DAG.getConstant(31, SL, MVT::i32));
3239 
3240  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3241  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3242  }
3243 
3244  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3245  if (RHSVal == 63) {
3246  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3247  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3248  DAG.getConstant(31, SL, MVT::i32));
3249  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3250  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3251  }
3252 
3253  return SDValue();
3254 }
3255 
3256 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3257  DAGCombinerInfo &DCI) const {
3258  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3259  if (!RHS)
3260  return SDValue();
3261 
3262  EVT VT = N->getValueType(0);
3263  SDValue LHS = N->getOperand(0);
3264  unsigned ShiftAmt = RHS->getZExtValue();
3265  SelectionDAG &DAG = DCI.DAG;
3266  SDLoc SL(N);
3267 
3268  // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
3269  // this improves the ability to match BFE patterns in isel.
3270  if (LHS.getOpcode() == ISD::AND) {
3271  if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3272  if (Mask->getAPIntValue().isShiftedMask() &&
3273  Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
3274  return DAG.getNode(
3275  ISD::AND, SL, VT,
3276  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3277  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3278  }
3279  }
3280  }
3281 
3282  if (VT != MVT::i64)
3283  return SDValue();
3284 
3285  if (ShiftAmt < 32)
3286  return SDValue();
3287 
3288  // srl i64:x, C for C >= 32
3289  // =>
3290  // build_pair (srl hi_32(x), C - 32), 0
3291  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3292 
3293  SDValue Hi = getHiHalf64(LHS, DAG);
3294 
3295  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3296  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3297 
3298  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3299 
3300  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3301 }
3302 
3303 SDValue AMDGPUTargetLowering::performTruncateCombine(
3304  SDNode *N, DAGCombinerInfo &DCI) const {
3305  SDLoc SL(N);
3306  SelectionDAG &DAG = DCI.DAG;
3307  EVT VT = N->getValueType(0);
3308  SDValue Src = N->getOperand(0);
3309 
3310  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3311  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3312  SDValue Vec = Src.getOperand(0);
3313  if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3314  SDValue Elt0 = Vec.getOperand(0);
3315  EVT EltVT = Elt0.getValueType();
3316  if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3317  if (EltVT.isFloatingPoint()) {
3318  Elt0 = DAG.getNode(ISD::BITCAST, SL,
3319  EltVT.changeTypeToInteger(), Elt0);
3320  }
3321 
3322  return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3323  }
3324  }
3325  }
3326 
3327  // Equivalent of above for accessing the high element of a vector as an
3328  // integer operation.
3329  // trunc (srl (bitcast (build_vector x, y)), 16) -> trunc (bitcast y)
3330  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3331  if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3332  if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3333  SDValue BV = stripBitcast(Src.getOperand(0));
3334  if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3335  BV.getValueType().getVectorNumElements() == 2) {
3336  SDValue SrcElt = BV.getOperand(1);
3337  EVT SrcEltVT = SrcElt.getValueType();
3338  if (SrcEltVT.isFloatingPoint()) {
3339  SrcElt = DAG.getNode(ISD::BITCAST, SL,
3340  SrcEltVT.changeTypeToInteger(), SrcElt);
3341  }
3342 
3343  return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3344  }
3345  }
3346  }
3347  }
3348 
3349  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3350  //
3351  // i16 (trunc (srl i64:x, K)), K <= 16 ->
3352  // i16 (trunc (srl (i32 (trunc x), K)))
3353  if (VT.getScalarSizeInBits() < 32) {
3354  EVT SrcVT = Src.getValueType();
3355  if (SrcVT.getScalarSizeInBits() > 32 &&
3356  (Src.getOpcode() == ISD::SRL ||
3357  Src.getOpcode() == ISD::SRA ||
3358  Src.getOpcode() == ISD::SHL)) {
3359  SDValue Amt = Src.getOperand(1);
3360  KnownBits Known = DAG.computeKnownBits(Amt);
3361  unsigned Size = VT.getScalarSizeInBits();
3362  if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3363  (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
3364  EVT MidVT = VT.isVector() ?
3365  EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3366  VT.getVectorNumElements()) : MVT::i32;
3367 
3368  EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3369  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3370  Src.getOperand(0));
3371  DCI.AddToWorklist(Trunc.getNode());
3372 
3373  if (Amt.getValueType() != NewShiftVT) {
3374  Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3375  DCI.AddToWorklist(Amt.getNode());
3376  }
3377 
3378  SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3379  Trunc, Amt);
3380  return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3381  }
3382  }
3383  }
3384 
3385  return SDValue();
3386 }
3387 
3388 // We need to specifically handle i64 mul here to avoid unnecessary conversion
3389 // instructions. If we only match on the legalized i64 mul expansion,
3390 // SimplifyDemandedBits will be unable to remove them because there will be
3391 // multiple uses due to the separate mul + mulh[su].
3392 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3393  SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3394  if (Size <= 32) {
3395  unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3396  return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3397  }
3398 
3399  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3400  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3401 
3402  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3403  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3404 
3405  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3406 }
3407 
3408 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3409  DAGCombinerInfo &DCI) const {
3410  EVT VT = N->getValueType(0);
3411 
3412  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3413  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3414  // unnecessarily). isDivergent() is used as an approximation of whether the
3415  // value is in an SGPR.
3416  if (!N->isDivergent())
3417  return SDValue();
3418 
3419  unsigned Size = VT.getSizeInBits();
3420  if (VT.isVector() || Size > 64)
3421  return SDValue();
3422 
3423  // There are i16 integer mul/mad.
3424  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3425  return SDValue();
3426 
3427  SelectionDAG &DAG = DCI.DAG;
3428  SDLoc DL(N);
3429 
3430  SDValue N0 = N->getOperand(0);
3431  SDValue N1 = N->getOperand(1);
3432 
3433  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3434  // in the source into any_extends if the result of the mul is truncated. Since
3435  // we can assume the high bits are whatever we want, use the underlying value
3436  // to avoid the unknown high bits from interfering.
3437  if (N0.getOpcode() == ISD::ANY_EXTEND)
3438  N0 = N0.getOperand(0);
3439 
3440  if (N1.getOpcode() == ISD::ANY_EXTEND)
3441  N1 = N1.getOperand(0);
3442 
3443  SDValue Mul;
3444 
3445  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3446  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3447  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3448  Mul = getMul24(DAG, DL, N0, N1, Size, false);
3449  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3450  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3451  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3452  Mul = getMul24(DAG, DL, N0, N1, Size, true);
3453  } else {
3454  return SDValue();
3455  }
3456 
3457  // We need to use sext even for MUL_U24, because MUL_U24 is used
3458  // for signed multiply of 8 and 16-bit types.
3459  return DAG.getSExtOrTrunc(Mul, DL, VT);
3460 }
3461 
3462 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3463  DAGCombinerInfo &DCI) const {
3464  EVT VT = N->getValueType(0);
3465 
3466  if (!Subtarget->hasMulI24() || VT.isVector())
3467  return SDValue();
3468 
3469  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3470  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3471  // unnecessarily). isDivergent() is used as an approximation of whether the
3472  // value is in an SGPR.
3473  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3474  // valu op anyway)
3475  if (Subtarget->hasSMulHi() && !N->isDivergent())
3476  return SDValue();
3477 
3478  SelectionDAG &DAG = DCI.DAG;
3479  SDLoc DL(N);
3480 
3481  SDValue N0 = N->getOperand(0);
3482  SDValue N1 = N->getOperand(1);
3483 
3484  if (!isI24(N0, DAG) || !isI24(N1, DAG))
3485  return SDValue();
3486 
3487  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3488  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3489 
3490  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3491  DCI.AddToWorklist(Mulhi.getNode());
3492  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3493 }
3494 
3495 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3496  DAGCombinerInfo &DCI) const {
3497  EVT VT = N->getValueType(0);
3498 
3499  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3500  return SDValue();
3501 
3502  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3503  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3504  // unnecessarily). isDivergent() is used as an approximation of whether the
3505  // value is in an SGPR.
3506  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3507  // valu op anyway)
3508  if (Subtarget->hasSMulHi() && !N->isDivergent())
3509  return SDValue();
3510 
3511  SelectionDAG &DAG = DCI.DAG;
3512  SDLoc DL(N);
3513 
3514  SDValue N0 = N->getOperand(0);
3515  SDValue N1 = N->getOperand(1);
3516 
3517  if (!isU24(N0, DAG) || !isU24(N1, DAG))
3518  return SDValue();
3519 
3520  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3521  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3522 
3523  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3524  DCI.AddToWorklist(Mulhi.getNode());
3525  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3526 }
3527 
3528 static bool isNegativeOne(SDValue Val) {
3529  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3530  return C->isAllOnes();
3531  return false;
3532 }
3533 
3534 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3535  SDValue Op,
3536  const SDLoc &DL,
3537  unsigned Opc) const {
3538  EVT VT = Op.getValueType();
3539  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3540  if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3541  LegalVT != MVT::i16))
3542  return SDValue();
3543 
3544  if (VT != MVT::i32)
3545  Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3546 
3547  SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3548  if (VT != MVT::i32)
3549  FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3550 
3551  return FFBX;
3552 }
3553 
3554 // The native instructions return -1 on 0 input. Optimize out a select that
3555 // produces -1 on 0.
3556 //
3557 // TODO: If zero is not undef, we could also do this if the output is compared
3558 // against the bitwidth.
3559 //
3560 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3561 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3562  SDValue LHS, SDValue RHS,
3563  DAGCombinerInfo &DCI) const {
3564  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3565  if (!CmpRhs || !CmpRhs->isZero())
3566  return SDValue();
3567 
3568  SelectionDAG &DAG = DCI.DAG;
3569  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3570  SDValue CmpLHS = Cond.getOperand(0);
3571 
3572  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3573  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3574  if (CCOpcode == ISD::SETEQ &&
3575  (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3576  RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3577  unsigned Opc =
3578  isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3579  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3580  }
3581 
3582  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3583  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3584  if (CCOpcode == ISD::SETNE &&
3585  (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3586  LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3587  unsigned Opc =
3588  isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3589 
3590  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3591  }
3592 
3593  return SDValue();
3594 }
3595 
3596 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3597  unsigned Op,
3598  const SDLoc &SL,
3599  SDValue Cond,
3600  SDValue N1,
3601  SDValue N2) {
3602  SelectionDAG &DAG = DCI.DAG;
3603  EVT VT = N1.getValueType();
3604 
3605  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3606  N1.getOperand(0), N2.getOperand(0));
3607  DCI.AddToWorklist(NewSelect.getNode());
3608  return DAG.getNode(Op, SL, VT, NewSelect);
3609 }
3610 
3611 // Pull a free FP operation out of a select so it may fold into uses.
3612 //
3613 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3614 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3615 //
3616 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3617 // select c, (fabs x), +k -> fabs (select c, x, k)
3618 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3619  SDValue N) {
3620  SelectionDAG &DAG = DCI.DAG;
3621  SDValue Cond = N.getOperand(0);
3622  SDValue LHS = N.getOperand(1);
3623  SDValue RHS = N.getOperand(2);
3624 
3625  EVT VT = N.getValueType();
3626  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3627  (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3628  return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3629  SDLoc(N), Cond, LHS, RHS);
3630  }
3631 
3632  bool Inv = false;
3633  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3634  std::swap(LHS, RHS);
3635  Inv = true;
3636  }
3637 
3638  // TODO: Support vector constants.
3639  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3640  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3641  SDLoc SL(N);
3642  // If one side is an fneg/fabs and the other is a constant, we can push the
3643  // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3644  SDValue NewLHS = LHS.getOperand(0);
3645  SDValue NewRHS = RHS;
3646 
3647  // Careful: if the neg can be folded up, don't try to pull it back down.
3648  bool ShouldFoldNeg = true;
3649 
3650  if (NewLHS.hasOneUse()) {
3651  unsigned Opc = NewLHS.getOpcode();
3652  if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3653  ShouldFoldNeg = false;
3654  if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3655  ShouldFoldNeg = false;
3656  }
3657 
3658  if (ShouldFoldNeg) {
3659  if (LHS.getOpcode() == ISD::FNEG)
3660  NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3661  else if (CRHS->isNegative())
3662  return SDValue();
3663 
3664  if (Inv)
3665  std::swap(NewLHS, NewRHS);
3666 
3667  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3668  Cond, NewLHS, NewRHS);
3669  DCI.AddToWorklist(NewSelect.getNode());
3670  return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3671  }
3672  }
3673 
3674  return SDValue();
3675 }
3676 
3677 
3678 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3679  DAGCombinerInfo &DCI) const {
3680  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3681  return Folded;
3682 
3683  SDValue Cond = N->getOperand(0);
3684  if (Cond.getOpcode() != ISD::SETCC)
3685  return SDValue();
3686 
3687  EVT VT = N->getValueType(0);
3688  SDValue LHS = Cond.getOperand(0);
3689  SDValue RHS = Cond.getOperand(1);
3690  SDValue CC = Cond.getOperand(2);
3691 
3692  SDValue True = N->getOperand(1);
3693  SDValue False = N->getOperand(2);
3694 
3695  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3696  SelectionDAG &DAG = DCI.DAG;
3697  if (DAG.isConstantValueOfAnyType(True) &&
3698  !DAG.isConstantValueOfAnyType(False)) {
3699  // Swap cmp + select pair to move constant to false input.
3700  // This will allow using VOPC cndmasks more often.
3701  // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3702 
3703  SDLoc SL(N);
3704  ISD::CondCode NewCC =
3705  getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3706 
3707  SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3708  return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3709  }
3710 
3711  if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3712  SDValue MinMax
3713  = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3714  // Revisit this node so we can catch min3/max3/med3 patterns.
3715  //DCI.AddToWorklist(MinMax.getNode());
3716  return MinMax;