LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUISelLowering.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 1459 1728 84.4 %
Date: 2018-02-23 15:42:53 Functions: 110 115 95.7 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// \brief This is the parent TargetLowering class for hardware code gen
      12             : /// targets.
      13             : //
      14             : //===----------------------------------------------------------------------===//
      15             : 
      16             : #define AMDGPU_LOG2E_F     1.44269504088896340735992468100189214f
      17             : #define AMDGPU_LN2_F       0.693147180559945309417232121458176568f
      18             : #define AMDGPU_LN10_F      2.30258509299404568401799145468436421f
      19             : 
      20             : #include "AMDGPUISelLowering.h"
      21             : #include "AMDGPU.h"
      22             : #include "AMDGPUCallLowering.h"
      23             : #include "AMDGPUFrameLowering.h"
      24             : #include "AMDGPUIntrinsicInfo.h"
      25             : #include "AMDGPURegisterInfo.h"
      26             : #include "AMDGPUSubtarget.h"
      27             : #include "AMDGPUTargetMachine.h"
      28             : #include "R600MachineFunctionInfo.h"
      29             : #include "SIInstrInfo.h"
      30             : #include "SIMachineFunctionInfo.h"
      31             : #include "llvm/CodeGen/CallingConvLower.h"
      32             : #include "llvm/CodeGen/MachineFunction.h"
      33             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      34             : #include "llvm/CodeGen/SelectionDAG.h"
      35             : #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
      36             : #include "llvm/IR/DataLayout.h"
      37             : #include "llvm/IR/DiagnosticInfo.h"
      38             : #include "llvm/Support/KnownBits.h"
      39             : using namespace llvm;
      40             : 
      41       39715 : static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
      42             :                             CCValAssign::LocInfo LocInfo,
      43             :                             ISD::ArgFlagsTy ArgFlags, CCState &State) {
      44       39715 :   MachineFunction &MF = State.getMachineFunction();
      45       39715 :   AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
      46             : 
      47       39715 :   uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
      48             :                                          ArgFlags.getOrigAlign());
      49       79430 :   State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
      50       39715 :   return true;
      51             : }
      52             : 
      53        1253 : static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
      54             :                            CCValAssign::LocInfo LocInfo,
      55             :                            ISD::ArgFlagsTy ArgFlags, CCState &State,
      56             :                            const TargetRegisterClass *RC,
      57             :                            unsigned NumRegs) {
      58        2506 :   ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
      59        1253 :   unsigned RegResult = State.AllocateReg(RegList);
      60        1253 :   if (RegResult == AMDGPU::NoRegister)
      61             :     return false;
      62             : 
      63        2434 :   State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
      64             :   return true;
      65             : }
      66             : 
      67         264 : static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
      68             :                               CCValAssign::LocInfo LocInfo,
      69             :                               ISD::ArgFlagsTy ArgFlags, CCState &State) {
      70         264 :   switch (LocVT.SimpleTy) {
      71         264 :   case MVT::i64:
      72             :   case MVT::f64:
      73             :   case MVT::v2i32:
      74             :   case MVT::v2f32: {
      75             :     // Up to SGPR0-SGPR39
      76             :     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
      77         264 :                           &AMDGPU::SGPR_64RegClass, 20);
      78             :   }
      79             :   default:
      80             :     return false;
      81             :   }
      82             : }
      83             : 
      84             : // Allocate up to VGPR31.
      85             : //
      86             : // TODO: Since there are no VGPR alignment requirements would it be better to
      87             : // split into individual scalar registers?
      88         989 : static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
      89             :                               CCValAssign::LocInfo LocInfo,
      90             :                               ISD::ArgFlagsTy ArgFlags, CCState &State) {
      91         989 :   switch (LocVT.SimpleTy) {
      92         555 :   case MVT::i64:
      93             :   case MVT::f64:
      94             :   case MVT::v2i32:
      95             :   case MVT::v2f32: {
      96             :     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
      97         555 :                           &AMDGPU::VReg_64RegClass, 31);
      98             :   }
      99         245 :   case MVT::v4i32:
     100             :   case MVT::v4f32:
     101             :   case MVT::v2i64:
     102             :   case MVT::v2f64: {
     103             :     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
     104         245 :                           &AMDGPU::VReg_128RegClass, 29);
     105             :   }
     106          43 :   case MVT::v8i32:
     107             :   case MVT::v8f32: {
     108             :     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
     109          43 :                           &AMDGPU::VReg_256RegClass, 25);
     110             : 
     111             :   }
     112         146 :   case MVT::v16i32:
     113             :   case MVT::v16f32: {
     114             :     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
     115         146 :                           &AMDGPU::VReg_512RegClass, 17);
     116             : 
     117             :   }
     118             :   default:
     119             :     return false;
     120             :   }
     121             : }
     122             : 
     123             : #include "AMDGPUGenCallingConv.inc"
     124             : 
     125             : // Find a larger type to do a load / store of a vector with.
     126        4452 : EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
     127             :   unsigned StoreSize = VT.getStoreSizeInBits();
     128        4452 :   if (StoreSize <= 32)
     129        1873 :     return EVT::getIntegerVT(Ctx, StoreSize);
     130             : 
     131             :   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
     132        5158 :   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
     133             : }
     134             : 
     135        8641 : unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
     136        8641 :   KnownBits Known;
     137       17282 :   EVT VT = Op.getValueType();
     138        8641 :   DAG.computeKnownBits(Op, Known);
     139             : 
     140       25923 :   return VT.getSizeInBits() - Known.countMinLeadingZeros();
     141             : }
     142             : 
     143        4284 : unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
     144        8568 :   EVT VT = Op.getValueType();
     145             : 
     146             :   // In order for this to be a signed 24-bit value, bit 23 must
     147             :   // be a sign bit.
     148        4284 :   return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
     149             : }
     150             : 
     151        2326 : AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     152        2326 :                                            const AMDGPUSubtarget &STI)
     153        2326 :     : TargetLowering(TM), Subtarget(&STI) {
     154        2326 :   AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
     155             :   // Lower floating point store/load to integer store/load to reduce the number
     156             :   // of patterns in tablegen.
     157             :   setOperationAction(ISD::LOAD, MVT::f32, Promote);
     158             :   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
     159             : 
     160             :   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
     161             :   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
     162             : 
     163             :   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
     164             :   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
     165             : 
     166             :   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
     167             :   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
     168             : 
     169             :   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
     170             :   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
     171             : 
     172             :   setOperationAction(ISD::LOAD, MVT::i64, Promote);
     173             :   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
     174             : 
     175             :   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
     176             :   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
     177             : 
     178             :   setOperationAction(ISD::LOAD, MVT::f64, Promote);
     179             :   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
     180             : 
     181             :   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
     182             :   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
     183             : 
     184             :   // There are no 64-bit extloads. These should be done as a 32-bit extload and
     185             :   // an extension to 64-bit.
     186       16282 :   for (MVT VT : MVT::integer_valuetypes()) {
     187             :     setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
     188             :     setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
     189             :     setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
     190             :   }
     191             : 
     192       16282 :   for (MVT VT : MVT::integer_valuetypes()) {
     193       13956 :     if (VT == MVT::i64)
     194        2326 :       continue;
     195             : 
     196             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
     197             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
     198             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
     199             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
     200             : 
     201             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
     202             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
     203             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
     204             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
     205             : 
     206             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
     207             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
     208             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
     209             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
     210             :   }
     211             : 
     212      167472 :   for (MVT VT : MVT::integer_vector_valuetypes()) {
     213             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
     214             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
     215             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
     216             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
     217             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
     218             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
     219             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
     220             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
     221             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
     222             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
     223             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
     224             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
     225             :   }
     226             : 
     227        2326 :   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
     228             :   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
     229             :   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
     230             :   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
     231             : 
     232             :   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
     233             :   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
     234             :   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
     235             :   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
     236             : 
     237             :   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
     238             :   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
     239             :   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
     240             :   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
     241             : 
     242             :   setOperationAction(ISD::STORE, MVT::f32, Promote);
     243             :   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
     244             : 
     245             :   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
     246             :   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
     247             : 
     248             :   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
     249             :   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
     250             : 
     251             :   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
     252             :   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
     253             : 
     254             :   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
     255             :   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
     256             : 
     257             :   setOperationAction(ISD::STORE, MVT::i64, Promote);
     258             :   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
     259             : 
     260             :   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
     261             :   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
     262             : 
     263             :   setOperationAction(ISD::STORE, MVT::f64, Promote);
     264             :   AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
     265             : 
     266             :   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
     267             :   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
     268             : 
     269             :   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
     270             :   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
     271             :   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
     272             :   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
     273             : 
     274             :   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
     275             :   setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
     276             :   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
     277             :   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
     278             : 
     279             :   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
     280             :   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
     281             :   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
     282             :   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
     283             : 
     284             :   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
     285             :   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
     286             : 
     287             :   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
     288             :   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
     289             : 
     290             :   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
     291             :   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
     292             : 
     293             :   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
     294             :   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
     295             : 
     296             : 
     297             :   setOperationAction(ISD::Constant, MVT::i32, Legal);
     298             :   setOperationAction(ISD::Constant, MVT::i64, Legal);
     299             :   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
     300             :   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
     301             : 
     302             :   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
     303             :   setOperationAction(ISD::BRIND, MVT::Other, Expand);
     304             : 
     305             :   // This is totally unsupported, just custom lower to produce an error.
     306             :   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
     307             : 
     308             :   // Library functions.  These default to Expand, but we have instructions
     309             :   // for them.
     310             :   setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
     311             :   setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
     312             :   setOperationAction(ISD::FPOW,   MVT::f32, Legal);
     313             :   setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
     314             :   setOperationAction(ISD::FABS,   MVT::f32, Legal);
     315             :   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
     316             :   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
     317             :   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
     318             :   setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
     319             :   setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
     320             : 
     321             :   setOperationAction(ISD::FROUND, MVT::f32, Custom);
     322             :   setOperationAction(ISD::FROUND, MVT::f64, Custom);
     323             : 
     324             :   setOperationAction(ISD::FLOG, MVT::f32, Custom);
     325             :   setOperationAction(ISD::FLOG10, MVT::f32, Custom);
     326             : 
     327        2326 :   if (Subtarget->has16BitInsts()) {
     328             :     setOperationAction(ISD::FLOG, MVT::f16, Custom);
     329             :     setOperationAction(ISD::FLOG10, MVT::f16, Custom);
     330             :   }
     331             : 
     332             :   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
     333             :   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
     334             : 
     335             :   setOperationAction(ISD::FREM, MVT::f32, Custom);
     336             :   setOperationAction(ISD::FREM, MVT::f64, Custom);
     337             : 
     338             :   // v_mad_f32 does not support denormals according to some sources.
     339        2326 :   if (!Subtarget->hasFP32Denormals())
     340             :     setOperationAction(ISD::FMAD, MVT::f32, Legal);
     341             : 
     342             :   // Expand to fneg + fadd.
     343             :   setOperationAction(ISD::FSUB, MVT::f64, Expand);
     344             : 
     345             :   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
     346             :   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
     347             :   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
     348             :   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
     349             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
     350             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
     351             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
     352             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
     353             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
     354             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
     355             : 
     356        2326 :   if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
     357             :     setOperationAction(ISD::FCEIL, MVT::f64, Custom);
     358             :     setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
     359             :     setOperationAction(ISD::FRINT, MVT::f64, Custom);
     360             :     setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
     361             :   }
     362             : 
     363        2326 :   if (!Subtarget->hasBFI()) {
     364             :     // fcopysign can be done in a single instruction with BFI.
     365             :     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
     366             :     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
     367             :   }
     368             : 
     369             :   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
     370             :   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
     371             :   setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
     372             : 
     373        2326 :   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
     374       11630 :   for (MVT VT : ScalarIntVTs) {
     375             :     // These should use [SU]DIVREM, so set them to expand
     376             :     setOperationAction(ISD::SDIV, VT, Expand);
     377             :     setOperationAction(ISD::UDIV, VT, Expand);
     378             :     setOperationAction(ISD::SREM, VT, Expand);
     379             :     setOperationAction(ISD::UREM, VT, Expand);
     380             : 
     381             :     // GPU does not have divrem function for signed or unsigned.
     382             :     setOperationAction(ISD::SDIVREM, VT, Custom);
     383             :     setOperationAction(ISD::UDIVREM, VT, Custom);
     384             : 
     385             :     // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
     386             :     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
     387             :     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
     388             : 
     389             :     setOperationAction(ISD::BSWAP, VT, Expand);
     390             :     setOperationAction(ISD::CTTZ, VT, Expand);
     391             :     setOperationAction(ISD::CTLZ, VT, Expand);
     392             :   }
     393             : 
     394        2326 :   if (!Subtarget->hasBCNT(32))
     395             :     setOperationAction(ISD::CTPOP, MVT::i32, Expand);
     396             : 
     397        2326 :   if (!Subtarget->hasBCNT(64))
     398             :     setOperationAction(ISD::CTPOP, MVT::i64, Expand);
     399             : 
     400             :   // The hardware supports 32-bit ROTR, but not ROTL.
     401             :   setOperationAction(ISD::ROTL, MVT::i32, Expand);
     402             :   setOperationAction(ISD::ROTL, MVT::i64, Expand);
     403             :   setOperationAction(ISD::ROTR, MVT::i64, Expand);
     404             : 
     405             :   setOperationAction(ISD::MUL, MVT::i64, Expand);
     406             :   setOperationAction(ISD::MULHU, MVT::i64, Expand);
     407             :   setOperationAction(ISD::MULHS, MVT::i64, Expand);
     408             :   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
     409             :   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
     410             :   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
     411             :   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
     412             :   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
     413             : 
     414             :   setOperationAction(ISD::SMIN, MVT::i32, Legal);
     415             :   setOperationAction(ISD::UMIN, MVT::i32, Legal);
     416             :   setOperationAction(ISD::SMAX, MVT::i32, Legal);
     417             :   setOperationAction(ISD::UMAX, MVT::i32, Legal);
     418             : 
     419        2326 :   if (Subtarget->hasFFBH())
     420             :     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
     421             : 
     422        2326 :   if (Subtarget->hasFFBL())
     423             :     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
     424             : 
     425             :   setOperationAction(ISD::CTTZ, MVT::i64, Custom);
     426             :   setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
     427             :   setOperationAction(ISD::CTLZ, MVT::i64, Custom);
     428             :   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
     429             : 
     430             :   // We only really have 32-bit BFE instructions (and 16-bit on VI).
     431             :   //
     432             :   // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
     433             :   // effort to match them now. We want this to be false for i64 cases when the
     434             :   // extraction isn't restricted to the upper or lower half. Ideally we would
     435             :   // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
     436             :   // span the midpoint are probably relatively rare, so don't worry about them
     437             :   // for now.
     438        2326 :   if (Subtarget->hasBFE())
     439             :     setHasExtractBitsInsn(true);
     440             : 
     441             :   static const MVT::SimpleValueType VectorIntTypes[] = {
     442             :     MVT::v2i32, MVT::v4i32
     443             :   };
     444             : 
     445       11630 :   for (MVT VT : VectorIntTypes) {
     446             :     // Expand the following operations for the current type by default.
     447             :     setOperationAction(ISD::ADD,  VT, Expand);
     448             :     setOperationAction(ISD::AND,  VT, Expand);
     449             :     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
     450             :     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
     451             :     setOperationAction(ISD::MUL,  VT, Expand);
     452             :     setOperationAction(ISD::MULHU, VT, Expand);
     453             :     setOperationAction(ISD::MULHS, VT, Expand);
     454             :     setOperationAction(ISD::OR,   VT, Expand);
     455             :     setOperationAction(ISD::SHL,  VT, Expand);
     456             :     setOperationAction(ISD::SRA,  VT, Expand);
     457             :     setOperationAction(ISD::SRL,  VT, Expand);
     458             :     setOperationAction(ISD::ROTL, VT, Expand);
     459             :     setOperationAction(ISD::ROTR, VT, Expand);
     460             :     setOperationAction(ISD::SUB,  VT, Expand);
     461             :     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
     462             :     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
     463             :     setOperationAction(ISD::SDIV, VT, Expand);
     464             :     setOperationAction(ISD::UDIV, VT, Expand);
     465             :     setOperationAction(ISD::SREM, VT, Expand);
     466             :     setOperationAction(ISD::UREM, VT, Expand);
     467             :     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
     468             :     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
     469             :     setOperationAction(ISD::SDIVREM, VT, Custom);
     470             :     setOperationAction(ISD::UDIVREM, VT, Expand);
     471             :     setOperationAction(ISD::ADDC, VT, Expand);
     472             :     setOperationAction(ISD::SUBC, VT, Expand);
     473             :     setOperationAction(ISD::ADDE, VT, Expand);
     474             :     setOperationAction(ISD::SUBE, VT, Expand);
     475             :     setOperationAction(ISD::SELECT, VT, Expand);
     476             :     setOperationAction(ISD::VSELECT, VT, Expand);
     477             :     setOperationAction(ISD::SELECT_CC, VT, Expand);
     478             :     setOperationAction(ISD::XOR,  VT, Expand);
     479             :     setOperationAction(ISD::BSWAP, VT, Expand);
     480             :     setOperationAction(ISD::CTPOP, VT, Expand);
     481             :     setOperationAction(ISD::CTTZ, VT, Expand);
     482             :     setOperationAction(ISD::CTLZ, VT, Expand);
     483             :     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
     484             :     setOperationAction(ISD::SETCC, VT, Expand);
     485             :   }
     486             : 
     487             :   static const MVT::SimpleValueType FloatVectorTypes[] = {
     488             :     MVT::v2f32, MVT::v4f32
     489             :   };
     490             : 
     491       11630 :   for (MVT VT : FloatVectorTypes) {
     492             :     setOperationAction(ISD::FABS, VT, Expand);
     493             :     setOperationAction(ISD::FMINNUM, VT, Expand);
     494             :     setOperationAction(ISD::FMAXNUM, VT, Expand);
     495             :     setOperationAction(ISD::FADD, VT, Expand);
     496             :     setOperationAction(ISD::FCEIL, VT, Expand);
     497             :     setOperationAction(ISD::FCOS, VT, Expand);
     498             :     setOperationAction(ISD::FDIV, VT, Expand);
     499             :     setOperationAction(ISD::FEXP2, VT, Expand);
     500             :     setOperationAction(ISD::FLOG2, VT, Expand);
     501             :     setOperationAction(ISD::FREM, VT, Expand);
     502             :     setOperationAction(ISD::FLOG, VT, Expand);
     503             :     setOperationAction(ISD::FLOG10, VT, Expand);
     504             :     setOperationAction(ISD::FPOW, VT, Expand);
     505             :     setOperationAction(ISD::FFLOOR, VT, Expand);
     506             :     setOperationAction(ISD::FTRUNC, VT, Expand);
     507             :     setOperationAction(ISD::FMUL, VT, Expand);
     508             :     setOperationAction(ISD::FMA, VT, Expand);
     509             :     setOperationAction(ISD::FRINT, VT, Expand);
     510             :     setOperationAction(ISD::FNEARBYINT, VT, Expand);
     511             :     setOperationAction(ISD::FSQRT, VT, Expand);
     512             :     setOperationAction(ISD::FSIN, VT, Expand);
     513             :     setOperationAction(ISD::FSUB, VT, Expand);
     514             :     setOperationAction(ISD::FNEG, VT, Expand);
     515             :     setOperationAction(ISD::VSELECT, VT, Expand);
     516             :     setOperationAction(ISD::SELECT_CC, VT, Expand);
     517             :     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
     518             :     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
     519             :     setOperationAction(ISD::SETCC, VT, Expand);
     520             :   }
     521             : 
     522             :   // This causes using an unrolled select operation rather than expansion with
     523             :   // bit operations. This is in general better, but the alternative using BFI
     524             :   // instructions may be better if the select sources are SGPRs.
     525             :   setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
     526             :   AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
     527             : 
     528             :   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
     529             :   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
     530             : 
     531             :   // There are no libcalls of any kind.
     532     2151550 :   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
     533             :     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
     534             : 
     535             :   setBooleanContents(ZeroOrNegativeOneBooleanContent);
     536             :   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
     537             : 
     538             :   setSchedulingPreference(Sched::RegPressure);
     539        2326 :   setJumpIsExpensive(true);
     540             : 
     541             :   // FIXME: This is only partially true. If we have to do vector compares, any
     542             :   // SGPR pair can be a condition register. If we have a uniform condition, we
     543             :   // are better off doing SALU operations, where there is only one SCC. For now,
     544             :   // we don't have a way of knowing during instruction selection if a condition
     545             :   // will be uniform and we always use vector compares. Assume we are using
     546             :   // vector compares until that is fixed.
     547             :   setHasMultipleConditionRegisters(true);
     548             : 
     549             :   // SI at least has hardware support for floating point exceptions, but no way
     550             :   // of using or handling them is implemented. They are also optional in OpenCL
     551             :   // (Section 7.3)
     552        2326 :   setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
     553             : 
     554        2326 :   PredictableSelectIsExpensive = false;
     555             : 
     556             :   // We want to find all load dependencies for long chains of stores to enable
     557             :   // merging into very wide vectors. The problem is with vectors with > 4
     558             :   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
     559             :   // vectors are a legal type, even though we have to split the loads
     560             :   // usually. When we can more precisely specify load legality per address
     561             :   // space, we should be able to make FindBetterChain/MergeConsecutiveStores
     562             :   // smarter so that they can figure out what to do in 2 iterations without all
     563             :   // N > 4 stores on the same chain.
     564        2326 :   GatherAllAliasesMaxDepth = 16;
     565             : 
     566             :   // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
     567             :   // about these during lowering.
     568        2326 :   MaxStoresPerMemcpy  = 0xffffffff;
     569        2326 :   MaxStoresPerMemmove = 0xffffffff;
     570        2326 :   MaxStoresPerMemset  = 0xffffffff;
     571             : 
     572             :   setTargetDAGCombine(ISD::BITCAST);
     573             :   setTargetDAGCombine(ISD::SHL);
     574             :   setTargetDAGCombine(ISD::SRA);
     575             :   setTargetDAGCombine(ISD::SRL);
     576             :   setTargetDAGCombine(ISD::MUL);
     577             :   setTargetDAGCombine(ISD::MULHU);
     578             :   setTargetDAGCombine(ISD::MULHS);
     579             :   setTargetDAGCombine(ISD::SELECT);
     580             :   setTargetDAGCombine(ISD::SELECT_CC);
     581             :   setTargetDAGCombine(ISD::STORE);
     582             :   setTargetDAGCombine(ISD::FADD);
     583             :   setTargetDAGCombine(ISD::FSUB);
     584             :   setTargetDAGCombine(ISD::FNEG);
     585             :   setTargetDAGCombine(ISD::FABS);
     586             :   setTargetDAGCombine(ISD::AssertZext);
     587             :   setTargetDAGCombine(ISD::AssertSext);
     588        2326 : }
     589             : 
     590             : //===----------------------------------------------------------------------===//
     591             : // Target Information
     592             : //===----------------------------------------------------------------------===//
     593             : 
     594             : LLVM_READNONE
     595         750 : static bool fnegFoldsIntoOp(unsigned Opc) {
                         // Opcode allow-list: returns true only for the ISD / AMDGPUISD opcodes
                         // enumerated below (going by the function name, the operations an fneg
                         // can be folded into); every other opcode takes the default branch and
                         // yields false.
     596         750 :   switch (Opc) {
     597             :   case ISD::FADD:
     598             :   case ISD::FSUB:
     599             :   case ISD::FMUL:
     600             :   case ISD::FMA:
     601             :   case ISD::FMAD:
     602             :   case ISD::FMINNUM:
     603             :   case ISD::FMAXNUM:
     604             :   case ISD::FSIN:
     605             :   case ISD::FTRUNC:
     606             :   case ISD::FRINT:
     607             :   case ISD::FNEARBYINT:
     608             :   case AMDGPUISD::RCP:
     609             :   case AMDGPUISD::RCP_LEGACY:
     610             :   case AMDGPUISD::SIN_HW:
     611             :   case AMDGPUISD::FMUL_LEGACY:
     612             :   case AMDGPUISD::FMIN_LEGACY:
     613             :   case AMDGPUISD::FMAX_LEGACY:
     614             :     return true;
     615         535 :   default:
     616         535 :     return false;
     617             :   }
     618             : }
     619             : 
     620             : /// \returns true if the operation will definitely need to use a 64-bit
     621             : /// encoding, and thus will use a VOP3 encoding regardless of the source
     622             : /// modifiers.
     623             : LLVM_READONLY
     624             : static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
                         // A node with more than two operands, or an f64 \p VT, is treated as
                         // always requiring the 64-bit encoding.
     625        2049 :   return N->getNumOperands() > 2 || VT == MVT::f64;
     626             : }
     627             : 
     628             : // Most FP instructions support source modifiers, but this could be refined
     629             : // slightly.
     630             : LLVM_READONLY
     631        2648 : static bool hasSourceMods(const SDNode *N) {
                         // Memory nodes are never reported as accepting source modifiers.
     632        2648 :   if (isa<MemSDNode>(N))
     633             :     return false;
     634             : 
                         // Explicit deny-list: any opcode not named here is assumed to support
                         // source modifiers (the default branch returns true).
     635        4786 :   switch (N->getOpcode()) {
     636             :   case ISD::CopyToReg:
     637             :   case ISD::SELECT:
     638             :   case ISD::FDIV:
     639             :   case ISD::FREM:
     640             :   case ISD::INLINEASM:
     641             :   case AMDGPUISD::INTERP_P1:
     642             :   case AMDGPUISD::INTERP_P2:
     643             :   case AMDGPUISD::DIV_SCALE:
     644             : 
     645             :   // TODO: Should really be looking at the users of the bitcast. These are
     646             :   // problematic because bitcasts are used to legalize all stores to integer
     647             :   // types.
     648             :   case ISD::BITCAST:
     649             :     return false;
     650        2049 :   default:
     651        2049 :     return true;
     652             :   }
     653             : }
     654             : 
     655        2553 : bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
     656             :                                                  unsigned CostThreshold) {
                         // Returns true only if every user of N passes hasSourceMods() and at
                         // most CostThreshold of those users are not already forced into a VOP3
                         // encoding (see opMustUseVOP3Encoding()).
                         //
     657             :   // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
     658             :   // it is truly free to use a source modifier in all cases. If there are
     659             :   // multiple users and each one would be forced into VOP3 by the modifier,
     660             :   // there will be a code size increase. Try to avoid increasing code size
     661             :   // unless we know it will save on the instruction count.
     662             :   unsigned NumMayIncreaseSize = 0;
                         // Scalar element type of N's first result; only used for the f64 check
                         // inside opMustUseVOP3Encoding().
     663        5106 :   MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
     664             : 
     665             :   // XXX - Should this limit number of uses to check?
     666        3643 :   for (const SDNode *U : N->uses()) {
     667        2648 :     if (!hasSourceMods(U))
     668             :       return false;
     669             : 
                           // Count the users that are not already forced to VOP3 and give up
                           // once that count exceeds the caller's threshold.
     670             :     if (!opMustUseVOP3Encoding(U, VT)) {
     671        1081 :       if (++NumMayIncreaseSize > CostThreshold)
     672             :         return false;
     673             :     }
     674             :   }
     675             : 
     676             :   return true;
     677             : }
     678             : 
     679       94595 : MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
                         // Vector indices are always i32; the DataLayout argument is ignored.
     680       94595 :   return MVT::i32;
     681             : }
     682             : 
     683        1537 : bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
                         // Every SelectSupportKind is reported as supported; SelType is unused.
     684        1537 :   return true;
     685             : }
     686             : 
     687             : // The backend supports 32 and 64 bit floating point immediates.
     688             : // FIXME: Why are we reporting vectors of FP immediates as legal?
     689           0 : bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
                         // Legality is decided purely from the scalar element type of VT; the
                         // value of Imm is not inspected. f16 is additionally accepted when the
                         // subtarget reports 16-bit instructions.
     690           0 :   EVT ScalarVT = VT.getScalarType();
     691             :   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
     692           0 :          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
     693             : }
     694             : 
     695             : // We don't want to shrink f64 / f32 constants.
     696           0 : bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
                         // The check is made on the scalar element type, so vectors of f32 / f64
                         // are excluded from shrinking as well.
     697           0 :   EVT ScalarVT = VT.getScalarType();
     698           0 :   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
     699             : }
     700             : 
     701        1617 : bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
     702             :                                                  ISD::LoadExtType,
     703             :                                                  EVT NewVT) const {
                         // Decides whether the load N should be narrowed to NewVT. The extension
                         // kind argument is unnamed and not consulted; only the store sizes of
                         // the old and new types matter.
     704             : 
     705             :   unsigned NewSize = NewVT.getStoreSizeInBits();
     706             : 
     707             :   // If we are reducing to a 32-bit load, this is always better.
     708        1617 :   if (NewSize == 32)
     709             :     return true;
     710             : 
     711        2586 :   EVT OldVT = N->getValueType(0);
     712             :   unsigned OldSize = OldVT.getStoreSizeInBits();
     713             : 
     714             :   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
     715             :   // extloads, so doing one requires using a buffer_load. In cases where we
     716             :   // still couldn't use a scalar load, using the wider load shouldn't really
     717             :   // hurt anything.
     718             : 
     719             :   // If the old size already had to be an extload, there's no harm in continuing
     720             :   // to reduce the width.
     721        1293 :   return (OldSize < 32);
     722             : }
     723             : 
     724       13208 : bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
     725             :                                                    EVT CastTy) const {
                         // Decides whether a load of LoadTy followed by a bitcast to CastTy
                         // should become a load of CastTy. Both types must have the same total
                         // size (asserted below).
     726             : 
     727             :   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
     728             : 
                         // A load whose scalar type is already i32 is never worth re-typing.
     729       26416 :   if (LoadTy.getScalarType() == MVT::i32)
     730             :     return false;
     731             : 
     732             :   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
     733             :   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
     734             : 
                         // Beneficial when the cast widens the scalar element, or when the cast
                         // scalar element is at least 32 bits wide.
     735        1687 :   return (LScalarSize < CastScalarSize) ||
     736        1687 :          (CastScalarSize >= 32);
     737             : }
     738             : 
     739             : // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
     740             : // profitable with the expansion for 64-bit since it's generally good to
     741             : // speculate things.
     742             : // FIXME: These should really have the size as a parameter.
     743          27 : bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
                         // Unconditionally cheap; no type or size information is available here.
     744          27 :   return true;
     745             : }
     746             : 
     747          60 : bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
                         // Unconditionally cheap, mirroring isCheapToSpeculateCttz().
     748          60 :   return true;
     749             : }
     750             : 
     751             : //===---------------------------------------------------------------------===//
     752             : // Target Properties
     753             : //===---------------------------------------------------------------------===//
     754             : 
     755        1936 : bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
                         // fabs is reported as free for scalar f32 / f64, and for scalar f16 when
                         // the subtarget has 16-bit instructions. VT must be a floating-point
                         // type (asserted).
     756             :   assert(VT.isFloatingPoint());
     757             : 
     758             :   // Packed operations do not have a fabs modifier.
     759         438 :   return VT == MVT::f32 || VT == MVT::f64 ||
     760        2374 :          (Subtarget->has16BitInsts() && VT == MVT::f16);
     761             : }
     762             : 
     763        3368 : bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
                         // fneg is reported as free for f32 / f64, for f16 when the subtarget has
                         // 16-bit instructions, and, unlike isFAbsFree(), also for v2f16 when
                         // the subtarget has VOP3P instructions.
     764             :   assert(VT.isFloatingPoint());
     765         535 :   return VT == MVT::f32 || VT == MVT::f64 ||
     766         757 :          (Subtarget->has16BitInsts() && VT == MVT::f16) ||
     767        3590 :          (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
     768             : }
     769             : 
     770        4205 : bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
     771             :                                                          unsigned NumElem,
     772             :                                                          unsigned AS) const {
                         // Always reported as cheap; MemVT, NumElem and AS are not consulted.
     773        4205 :   return true;
     774             : }
     775             : 
     776       23173 : bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
     777             :   // There are few operations which truly have vector input operands. Any vector
     778             :   // operation is going to involve operations on each component, and a
     779             :   // build_vector will be a copy per element, so it always makes sense to use a
     780             :   // build_vector input in place of the extracted element to avoid a copy into a
     781             :   // super register.
     782             :   //
     783             :   // We should probably only do this if all users are extracts only, but this
     784             :   // should be the common case.
                         //
                         // The answer does not depend on VecVT.
     785       23173 :   return true;
     786             : }
     787             : 
     788       12518 : bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
     789             :   // Truncate is just accessing a subregister.
     790             : 
     791       12518 :   unsigned SrcSize = Source.getSizeInBits();
     792       12518 :   unsigned DestSize = Dest.getSizeInBits();
     793             : 
                         // Free only when actually narrowing, and only to a size that is a
                         // multiple of 32 bits.
     794       12518 :   return DestSize < SrcSize && DestSize % 32 == 0 ;
     795             : }
     796             : 
     797         588 : bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
     798             :   // Truncate is just accessing a subregister.
     799             : 
                         // IR-type overload: compares scalar sizes, so vector types are judged by
                         // their element type.
     800         588 :   unsigned SrcSize = Source->getScalarSizeInBits();
     801         588 :   unsigned DestSize = Dest->getScalarSizeInBits();
     802             : 
                         // With 16-bit instructions, truncating to 16 bits is free from any
                         // source of at least 32 bits.
     803         588 :   if (DestSize== 16 && Subtarget->has16BitInsts())
     804          20 :     return SrcSize >= 32;
     805             : 
                         // Otherwise the same rule as the EVT overload: narrowing to a multiple
                         // of 32 bits.
     806         568 :   return DestSize < SrcSize && DestSize % 32 == 0;
     807             : }
     808             : 
     809          28 : bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
                         // IR-type overload: works on scalar sizes, so vector types are judged by
                         // their element type.
     810          28 :   unsigned SrcSize = Src->getScalarSizeInBits();
     811          28 :   unsigned DestSize = Dest->getScalarSizeInBits();
     812             : 
                         // With 16-bit instructions, extending from 16 bits to 32 bits or more is
                         // reported as free.
     813          28 :   if (SrcSize == 16 && Subtarget->has16BitInsts())
     814           0 :     return DestSize >= 32;
     815             : 
                         // Otherwise only the 32-bit to 64-bit extension is free.
     816          28 :   return SrcSize == 32 && DestSize == 64;
     817             : }
     818             : 
     819        6174 : bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
     820             :   // Any register load of a 64-bit value really requires 2 32-bit moves. For all
     821             :   // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
     822             :   // this will enable reducing 64-bit operations to 32-bit, which is always
     823             :   // good.
     824             : 
                         // i16 sources are free to extend to i32 or i64; unlike the Type*
                         // overload, this is not conditional on the subtarget.
     825             :   if (Src == MVT::i16)
     826             :     return Dest == MVT::i32 ||Dest == MVT::i64 ;
     827             : 
     828             :   return Src == MVT::i32 && Dest == MVT::i64;
     829             : }
     830             : 
     831        6046 : bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
                         // Forwards to the (EVT, EVT) overload using the value's own type.
     832       12092 :   return isZExtFree(Val.getValueType(), VT2);
     833             : }
     834             : 
     835             : // v_mad_mix* support a conversion from f16 to f32.
     836             : //
     837             : // There is only one special case where this is OK to use when denormals are
     838             : // enabled, and we don't currently handle it.
     839          24 : bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
     840             :                                            EVT DestVT, EVT SrcVT) const {
                         // Foldable only for FMAD on subtargets with mad-mix instructions, when
                         // extending f16 elements to f32 elements with f32 denormals disabled.
     841          22 :   return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() &&
     842          70 :          DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
     843          59 :          SrcVT.getScalarType() == MVT::f16;
     844             : }
     845             : 
     846        6686 : bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
     847             :   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
     848             :   // limited number of native 64-bit operations. Shrinking an operation to fit
     849             :   // in a single 32-bit register should always be helpful. As currently used,
     850             :   // this is much less general than the name suggests, and is only used in
     851             :   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
     852             :   // not profitable, and may actually be harmful.
                         //
                         // Hence: profitable only when going from more than 32 bits to exactly
                         // 32 bits.
     853        6686 :   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
     854             : }
     855             : 
     856             : //===---------------------------------------------------------------------===//
     857             : // TargetLowering Callbacks
     858             : //===---------------------------------------------------------------------===//
     859             : 
     860        2631 : CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
     861             :                                                   bool IsVarArg) {
                         // Maps a calling convention to the CCAssignFn used for call arguments.
                         // IsVarArg is not consulted. Conventions not listed below are a fatal
                         // error.
     862        2631 :   switch (CC) {
                         // Kernel entry points.
     863             :   case CallingConv::AMDGPU_KERNEL:
     864             :   case CallingConv::SPIR_KERNEL:
     865             :     return CC_AMDGPU_Kernel;
                         // AMDGPU_* conventions other than the kernel one.
     866         858 :   case CallingConv::AMDGPU_VS:
     867             :   case CallingConv::AMDGPU_GS:
     868             :   case CallingConv::AMDGPU_PS:
     869             :   case CallingConv::AMDGPU_CS:
     870             :   case CallingConv::AMDGPU_HS:
     871             :   case CallingConv::AMDGPU_ES:
     872             :   case CallingConv::AMDGPU_LS:
     873         858 :     return CC_AMDGPU;
                         // Ordinary callable functions.
     874        1747 :   case CallingConv::C:
     875             :   case CallingConv::Fast:
     876             :   case CallingConv::Cold:
     877        1747 :     return CC_AMDGPU_Func;
     878           0 :   default:
     879           0 :     report_fatal_error("Unsupported calling convention.");
     880             :   }
     881             : }
     882             : 
     883        4017 : CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
     884             :                                                     bool IsVarArg) {
                         // Maps a calling convention to the CCAssignFn used for return values.
                         // IsVarArg is not consulted. Conventions not listed below are a fatal
                         // error.
     885        4017 :   switch (CC) {
                         // Kernels use the same assignment function here as for arguments.
     886             :   case CallingConv::AMDGPU_KERNEL:
     887             :   case CallingConv::SPIR_KERNEL:
     888             :     return CC_AMDGPU_Kernel;
                         // AMDGPU_* conventions other than the kernel one.
     889         803 :   case CallingConv::AMDGPU_VS:
     890             :   case CallingConv::AMDGPU_GS:
     891             :   case CallingConv::AMDGPU_PS:
     892             :   case CallingConv::AMDGPU_CS:
     893             :   case CallingConv::AMDGPU_HS:
     894             :   case CallingConv::AMDGPU_ES:
     895             :   case CallingConv::AMDGPU_LS:
     896         803 :     return RetCC_SI_Shader;
                         // Ordinary callable functions.
     897        3214 :   case CallingConv::C:
     898             :   case CallingConv::Fast:
     899             :   case CallingConv::Cold:
     900        3214 :     return RetCC_AMDGPU_Func;
     901           0 :   default:
     902           0 :     report_fatal_error("Unsupported calling convention.");
     903             :   }
     904             : }
     905             : 
     906             : /// The SelectionDAGBuilder will automatically promote function arguments
     907             : /// with illegal types.  However, this does not work for the AMDGPU targets
     908             : /// since the function arguments are stored in memory as these illegal types.
     909             : /// In order to handle this properly we need to get the original type sizes
     910             : /// from the LLVM IR Function and fix up the ISD::InputArg values before
     911             : /// passing them to AnalyzeFormalArguments().
     912             : ///
     913             : /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
     914             : /// input values across multiple registers.  Each item in the Ins array
     915             : /// represents a single value that will be stored in registers.  Ins[x].VT is
     916             : /// the value type of the value that will be stored in the register, so
     917             : /// whatever SDNode we lower the argument to needs to be this type.
     918             : ///
     919             : /// In order to correctly lower the arguments we need to know the size of each
     920             : /// argument.  Since Ins[x].VT gives us the size of the register that will
     921             : /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
     922             : /// for the original function argument so that we can deduce the correct memory
     923             : /// type to use for Ins[x].  In most cases the correct memory type will be
     924             : /// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
     925             : /// we have a kernel argument of type v8i8, this argument will be split into
     926             : /// 8 parts and each part will be represented by its own item in the Ins array.
     927             : /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
     928             : /// the argument before it was split.  From this, we deduce that the memory type
     929             : /// for each individual part is i8.  We pass the memory type as LocVT to the
     930             : /// calling convention analysis function and the register type (Ins[x].VT) as
     931             : /// the ValVT.
     932       16735 : void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
     933             :                              const SmallVectorImpl<ISD::InputArg> &Ins) const {
                         // For every entry of Ins, deduce its in-memory type (MemVT) and pass it,
                         // together with the register type In.VT, to allocateKernArg().
     934       56424 :   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
     935       39689 :     const ISD::InputArg &In = Ins[i];
     936       39689 :     EVT MemVT;
     937             : 
                           // Number of registers the full original argument type occupies.
     938       39689 :     unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
     939             : 
     940       79378 :     if (!Subtarget->isAmdHsaOS() &&
     941             :         (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
     942             :       // The ABI says the caller will extend these values to 32-bits.
     943        1422 :       MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
     944       38978 :     } else if (NumRegs == 1) {
     945             :       // This argument is not split, so the IR type is the memory type.
     946             :       assert(!In.Flags.isSplit());
     947       35669 :       if (In.ArgVT.isExtended()) {
     948             :         // We have an extended type, like i24, so we should just use the register type
     949          84 :         MemVT = In.VT;
     950             :       } else {
     951       35585 :         MemVT = In.ArgVT;
     952             :       }
     953        9339 :     } else if (In.ArgVT.isVector() && In.VT.isVector() &&
     954        4753 :                In.ArgVT.getScalarType() == In.VT.getScalarType()) {
     955             :       assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
     956             :       // We have a vector value which has been split into a vector with
     957             :       // the same scalar type, but fewer elements.  This should handle
     958             :       // all the floating-point vector types.
     959         722 :       MemVT = In.VT;
     960        4586 :     } else if (In.ArgVT.isVector() &&
     961        1999 :                In.ArgVT.getVectorNumElements() == NumRegs) {
     962             :       // This arg has been split so that each element is stored in a separate
     963             :       // register.
     964        1989 :       MemVT = In.ArgVT.getScalarType();
     965         598 :     } else if (In.ArgVT.isExtended()) {
     966             :       // We have an extended type, like i65.
     967          26 :       MemVT = In.VT;
     968             :     } else {
                             // Remaining case: divide the argument's store size evenly between
                             // the registers it was split into.
     969         572 :       unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
     970             :       assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
     971         572 :       if (In.VT.isInteger()) {
     972         572 :         MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
     973           0 :       } else if (In.VT.isVector()) {
     974             :         assert(!In.VT.getScalarType().isFloatingPoint());
     975           0 :         unsigned NumElements = In.VT.getVectorNumElements();
     976             :         assert(MemoryBits % NumElements == 0);
     977             :         // This vector type has been split into another vector type with
     978             :         // a different elements size.
     979             :         EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
     980           0 :                                          MemoryBits / NumElements);
     981           0 :         MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
     982             :       } else {
     983           0 :         llvm_unreachable("cannot deduce memory type.");
     984             :       }
     985             :     }
     986             : 
     987             :     // Convert one element vectors to scalar.
     988       39689 :     if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
     989          50 :       MemVT = MemVT.getScalarType();
     990             : 
     991       39689 :     if (MemVT.isExtended()) {
     992             :       // This should really only happen if we have vec3 arguments
     993             :       assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
     994           0 :       MemVT = MemVT.getPow2VectorType(State.getContext());
     995             :     }
     996             : 
                           // MemVT must be a simple type by now; it is passed as the location
                           // type and In.VT as the value type.
     997             :     assert(MemVT.isSimple());
     998       39689 :     allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
     999             :                     State);
    1000             :   }
    1001       16735 : }
    1002             : 
    1003       16765 : SDValue AMDGPUTargetLowering::LowerReturn(
    1004             :   SDValue Chain, CallingConv::ID CallConv,
    1005             :   bool isVarArg,
    1006             :   const SmallVectorImpl<ISD::OutputArg> &Outs,
    1007             :   const SmallVectorImpl<SDValue> &OutVals,
    1008             :   const SDLoc &DL, SelectionDAG &DAG) const {
                         // A return is lowered to a bare ENDPGM node on the incoming chain.
                         // CallConv, isVarArg, Outs and OutVals are all ignored; the assert that
                         // would reject return values is disabled below.
    1009             :   // FIXME: Fails for r600 tests
    1010             :   //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
    1011             :   // "wave terminate should not have return values");
    1012       16765 :   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
    1013             : }
    1014             : 
    1015             : //===---------------------------------------------------------------------===//
    1016             : // Target specific lowering
    1017             : //===---------------------------------------------------------------------===//
    1018             : 
    1019             : /// Selects the correct CCAssignFn for a given CallingConvention value.
    1020        2595 : CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
    1021             :                                                     bool IsVarArg) {
                         // Thin forwarder to AMDGPUCallLowering::CCAssignFnForCall().
    1022        2595 :   return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
    1023             : }
    1024             : 
    1025        4017 : CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
    1026             :                                                       bool IsVarArg) {
                         // Thin forwarder to AMDGPUCallLowering::CCAssignFnForReturn().
    1027        4017 :   return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
    1028             : }
    1029             : 
    1030          27 : SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
    1031             :                                                   SelectionDAG &DAG,
    1032             :                                                   MachineFrameInfo &MFI,
    1033             :                                                   int ClobberedFI) const {
                         // Builds a TokenFactor that joins \p Chain with result #1 of every load
                         // hanging off the entry node whose frame object overlaps the byte range
                         // of frame index \p ClobberedFI. Only loads whose base pointer is a frame
                         // index with a negative index are considered.
                         // NOTE(review): negative frame indices presumably denote fixed (incoming
                         // argument) stack objects -- confirm against MachineFrameInfo.
    1034             :   SmallVector<SDValue, 8> ArgChains;
                         // Inclusive byte range [FirstByte, LastByte] occupied by the clobbered
                         // object.
    1035             :   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
    1036          27 :   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
    1037             : 
    1038             :   // Include the original chain at the beginning of the list. When this is
    1039             :   // used by target LowerCall hooks, this helps legalize find the
    1040             :   // CALLSEQ_BEGIN node.
    1041          27 :   ArgChains.push_back(Chain);
    1042             : 
    1043             :   // Add a chain value for each stack argument load that overlaps ClobberedFI.
    1044          27 :   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
    1045             :                             UE = DAG.getEntryNode().getNode()->use_end();
    1046         918 :        U != UE; ++U) {
    1047             :     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
    1048             :       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
    1049          72 :         if (FI->getIndex() < 0) {
                                   // Inclusive byte range of the object this load reads from.
    1050             :           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
    1051             :           int64_t InLastByte = InFirstByte;
    1052          72 :           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
    1053             : 
                                   // The two ranges overlap if either one's first byte lies within
                                   // the other.
    1054         132 :           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
    1055          60 :               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
    1056          12 :             ArgChains.push_back(SDValue(L, 1));
    1057             :         }
    1058             :       }
    1059             :     }
    1060             :   }
    1061             : 
    1062             :   // Build a tokenfactor for all the chains.
    1063          81 :   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
    1064             : }
    1065             : 
    1066          83 : SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
    1067             :                                                  SmallVectorImpl<SDValue> &InVals,
    1068             :                                                  StringRef Reason) const {
                         // Reports a call that cannot be lowered. Emits a DiagnosticInfoUnsupported
                         // whose message is \p Reason followed by the callee name, fills \p InVals
                         // with one UNDEF per expected call result (non-tail calls only), and
                         // returns the DAG entry node.
    1069          83 :   SDValue Callee = CLI.Callee;
    1070          83 :   SelectionDAG &DAG = CLI.DAG;
    1071             : 
    1072          83 :   const Function &Fn = DAG.getMachineFunction().getFunction();
    1073             : 
                         // Fallback name, used when the callee is neither an external symbol nor a
                         // global address.
    1074             :   StringRef FuncName("<unknown>");
    1075             : 
    1076             :   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    1077         128 :     FuncName = G->getSymbol();
    1078             :   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    1079          17 :     FuncName = G->getGlobal()->getName();
    1080             : 
    1081             :   DiagnosticInfoUnsupported NoCalls(
    1082         166 :     Fn, Reason + FuncName, CLI.DL.getDebugLoc());
    1083          83 :   DAG.getContext()->diagnose(NoCalls);
    1084             : 
                         // Give every value listed in CLI.Ins an UNDEF of the matching type so the
                         // vector handed back to the caller has one entry per expected result.
    1085          82 :   if (!CLI.IsTailCall) {
    1086         151 :     for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
    1087         146 :       InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
    1088             :   }
    1089             : 
    1090          82 :   return DAG.getEntryNode();
    1091             : }
    1092             : 
    1093          77 : SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
    1094             :                                         SmallVectorImpl<SDValue> &InVals) const {
                         // This implementation lowers no calls itself: every call is routed to
                         // lowerUnhandledCall with a fixed diagnostic prefix.
    1095          77 :   return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
    1096             : }
    1097             : 
    1098           3 : SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
    1099             :                                                       SelectionDAG &DAG) const {
                         // Dynamic allocas are not supported: emit an "unsupported dynamic alloca"
                         // diagnostic and replace the node with the pair {constant 0 of the
                         // node's value type, operand 0 of the node}.
                         // NOTE(review): operand 0 is presumably the incoming chain -- confirm.
    1100           3 :   const Function &Fn = DAG.getMachineFunction().getFunction();
    1101             : 
    1102             :   DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
    1103           9 :                                             SDLoc(Op).getDebugLoc());
    1104           3 :   DAG.getContext()->diagnose(NoDynamicAlloca);
    1105           9 :   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
    1106           9 :   return DAG.getMergeValues(Ops, SDLoc());
    1107             : }
    1108             : 
    1109       19751 : SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
    1110             :                                              SelectionDAG &DAG) const {
                         // Dispatches a node marked for custom lowering to the matching Lower*
                         // helper, keyed on Op's opcode. An opcode with no case here is a
                         // programming error: the node is printed to errs() and the function
                         // aborts via llvm_unreachable.
    1111       19751 :   switch (Op.getOpcode()) {
    1112           0 :   default:
    1113           0 :     Op->print(errs(), &DAG);
                           // The two adjacent literals are concatenated by the compiler, so the
                           // first one must end with a space to keep the words separated.
    1114           0 :     llvm_unreachable("Custom lowering code for this "
    1115             :                      "instruction is not implemented yet!");
    1116             :     break;
    1117          30 :   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
    1118        2486 :   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
    1119       15057 :   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
    1120         326 :   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
    1121         158 :   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
    1122          36 :   case ISD::FREM: return LowerFREM(Op, DAG);
    1123          31 :   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
    1124          75 :   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
    1125          14 :   case ISD::FRINT: return LowerFRINT(Op, DAG);
    1126          45 :   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
    1127          90 :   case ISD::FROUND: return LowerFROUND(Op, DAG);
    1128           0 :   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
                         // FLOG and FLOG10 share LowerFLOG and differ only in the constant passed
                         // as its third argument: 1/log2(e) for FLOG, ln(2)/ln(10) for FLOG10.
    1129          37 :   case ISD::FLOG:
    1130          37 :     return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F);
    1131          37 :   case ISD::FLOG10:
    1132          37 :     return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);
    1133          42 :   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
    1134          51 :   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
    1135         716 :   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
    1136          65 :   case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
    1137          37 :   case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
    1138         415 :   case ISD::CTTZ:
    1139             :   case ISD::CTTZ_ZERO_UNDEF:
    1140             :   case ISD::CTLZ:
    1141             :   case ISD::CTLZ_ZERO_UNDEF:
    1142         415 :     return LowerCTLZ_CTTZ(Op, DAG);
    1143           3 :   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
    1144             :   }
    1145             :   return Op;
    1146             : }
    1147             : 
    1148          53 : void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
    1149             :                                               SmallVectorImpl<SDValue> &Results,
    1150             :                                               SelectionDAG &DAG) const {
                         // Deliberately a no-op for every opcode: \p Results is never written.
                         // SIGN_EXTEND_INREG has its own case only to carry the explanation below.
    1151             :   switch (N->getOpcode()) {
    1152             :   case ISD::SIGN_EXTEND_INREG:
    1153             :     // Different parts of legalization seem to interpret which type of
    1154             :     // sign_extend_inreg is the one to check for custom lowering. The extended
    1155             :     // from type is what really matters, but some places check for custom
    1156             :     // lowering of the result type. This results in trying to use
    1157             :     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    1158             :     // nothing here and let the illegal result integer be handled normally.
    1159             :     return;
    1160             :   default:
    1161             :     return;
    1162             :   }
    1163             : }
    1164             : 
    1165             : static bool hasDefinedInitializer(const GlobalValue *GV) {
                         // Returns true only when \p GV is a GlobalVariable that has an initializer
                         // and that initializer is not an UndefValue. Anything that is not a
                         // GlobalVariable, or has no initializer, yields false.
    1166             :   const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
    1167         401 :   if (!GVar || !GVar->hasInitializer())
    1168             :     return false;
    1169             : 
    1170         395 :   return !isa<UndefValue>(GVar->getInitializer());
    1171             : }
    1172             : 
    1173         401 : SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
    1174             :                                                  SDValue Op,
    1175             :                                                  SelectionDAG &DAG) const {
                         // Lowers a global address node. A global in the LOCAL address space
                         // without a defined initializer is replaced by a constant holding the
                         // offset returned by MFI->allocateLDSGlobal. Every other case (a local
                         // global with a defined initializer, or any other address space) emits an
                         // "unsupported initializer" diagnostic and returns an empty SDValue.
    1176             : 
    1177         401 :   const DataLayout &DL = DAG.getDataLayout();
    1178             :   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
    1179         401 :   const GlobalValue *GV = G->getGlobal();
    1180             : 
    1181         401 :   if  (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
    1182             :     // XXX: What does the value of G->getOffset() mean?
    1183             :     assert(G->getOffset() == 0 &&
    1184             :          "Do not know what to do with an non-zero offset");
    1185             : 
    1186             :     // TODO: We could emit code to handle the initialization somewhere.
    1187         395 :     if (!hasDefinedInitializer(GV)) {
    1188         393 :       unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
    1189         786 :       return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
    1190             :     }
    1191             :   }
    1192             : 
                         // Fall-through: nothing above handled this global, so report it.
    1193           8 :   const Function &Fn = DAG.getMachineFunction().getFunction();
    1194             :   DiagnosticInfoUnsupported BadInit(
    1195          24 :       Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
    1196           8 :   DAG.getContext()->diagnose(BadInit);
    1197           8 :   return SDValue();
    1198             : }
    1199             : 
    1200        2486 : SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
    1201             :                                                   SelectionDAG &DAG) const {
                         // Rewrites a vector concatenation as one build-vector: the elements of
                         // each operand are extracted in operand order into Args, which then forms
                         // a vector of Op's value type.
    1202             :   SmallVector<SDValue, 8> Args;
    1203             : 
    1204       12438 :   for (const SDUse &U : Op->ops())
    1205        4976 :     DAG.ExtractVectorElements(U.get(), Args);
    1206             : 
    1207        7458 :   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
    1208             : }
    1209             : 
    1210       15057 : SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
    1211             :                                                      SelectionDAG &DAG) const {
                         // Rewrites a subvector extraction as a build-vector of individually
                         // extracted elements. Operand 1 must be a constant (enforced by cast<>)
                         // giving the first element index; the element count is that of the
                         // result type.
    1212             : 
    1213             :   SmallVector<SDValue, 8> Args;
    1214       30114 :   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    1215       15057 :   EVT VT = Op.getValueType();
    1216       30114 :   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
    1217             :                             VT.getVectorNumElements());
    1218             : 
    1219       45171 :   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
    1220             : }
    1221             : 
    1222             : /// \brief Generate Min/Max node
                       ///
                       /// Tries to turn a select between \p True and \p False, controlled by the
                       /// comparison (\p LHS \p CC \p RHS), into an AMDGPUISD::FMIN_LEGACY or
                       /// AMDGPUISD::FMAX_LEGACY node of type \p VT. Returns an empty SDValue when
                       /// no fold is performed: the select operands are not exactly the compare
                       /// operands, the condition code is not a less/greater comparison, or (for
                       /// the ordered and unspecified-order codes) the combine level is still
                       /// before AfterLegalizeDAG and the call does not come from the legalizer.
    1223         741 : SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
    1224             :                                                    SDValue LHS, SDValue RHS,
    1225             :                                                    SDValue True, SDValue False,
    1226             :                                                    SDValue CC,
    1227             :                                                    DAGCombinerInfo &DCI) const {
                         // The select must pick between the two compared values, in either order.
    1228             :   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
    1229         646 :     return SDValue();
    1230             : 
    1231          95 :   SelectionDAG &DAG = DCI.DAG;
    1232          95 :   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    1233          95 :   switch (CCOpcode) {
                         // Equality, ordered/unordered tests and constant conditions do not
                         // describe a min/max; break out to the empty return at the bottom.
    1234             :   case ISD::SETOEQ:
    1235             :   case ISD::SETONE:
    1236             :   case ISD::SETUNE:
    1237             :   case ISD::SETNE:
    1238             :   case ISD::SETUEQ:
    1239             :   case ISD::SETEQ:
    1240             :   case ISD::SETFALSE:
    1241             :   case ISD::SETFALSE2:
    1242             :   case ISD::SETTRUE:
    1243             :   case ISD::SETTRUE2:
    1244             :   case ISD::SETUO:
    1245             :   case ISD::SETO:
    1246             :     break;
                         // Unordered less-than: operand order is swapped relative to the ordered
                         // case below.
    1247             :   case ISD::SETULE:
    1248             :   case ISD::SETULT: {
    1249             :     if (LHS == True)
    1250          14 :       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    1251           0 :     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    1252             :   }
    1253           4 :   case ISD::SETOLE:
    1254             :   case ISD::SETOLT:
    1255             :   case ISD::SETLE:
    1256             :   case ISD::SETLT: {
    1257             :     // Ordered. Assume ordered for undefined.
    1258             : 
    1259             :     // Only do this after legalization to avoid interfering with other combines
    1260             :     // which might occur.
    1261           6 :     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
    1262           2 :         !DCI.isCalledByLegalizer())
    1263           2 :       return SDValue();
    1264             : 
    1265             :     // We need to permute the operands to get the correct NaN behavior. The
    1266             :     // selected operand is the second one based on the failing compare with NaN,
    1267             :     // so permute it based on the compare type the hardware uses.
    1268             :     if (LHS == True)
    1269           2 :       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    1270           0 :     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    1271             :   }
                         // Unordered greater-than: mirror image of the unordered less-than case.
    1272             :   case ISD::SETUGE:
    1273             :   case ISD::SETUGT: {
    1274             :     if (LHS == True)
    1275          10 :       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    1276           1 :     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    1277             :   }
                         // Ordered (or unspecified-order) greater-than: same post-legalization
                         // restriction as the ordered less-than case.
    1278          39 :   case ISD::SETGT:
    1279             :   case ISD::SETGE:
    1280             :   case ISD::SETOGE:
    1281             :   case ISD::SETOGT: {
    1282          72 :     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
    1283          33 :         !DCI.isCalledByLegalizer())
    1284          14 :       return SDValue();
    1285             : 
    1286             :     if (LHS == True)
    1287          16 :       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    1288           9 :     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    1289             :   }
    1290           0 :   case ISD::SETCC_INVALID:
    1291           0 :     llvm_unreachable("Invalid setcc condcode!");
    1292             :   }
    1293          27 :   return SDValue();
    1294             : }
    1295             : 
    1296             : std::pair<SDValue, SDValue>
    1297        2396 : AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
                         // Splits \p Op into two i32 values by bitcasting it to v2i32 and
                         // extracting both elements. The returned pair is (element 0, element 1),
                         // named Lo and Hi respectively.
    1298             :   SDLoc SL(Op);
    1299             : 
    1300        2396 :   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
    1301             : 
    1302        2396 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    1303        2396 :   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    1304             : 
    1305        2396 :   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
    1306        2396 :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
    1307             : 
    1308        2396 :   return std::make_pair(Lo, Hi);
    1309             : }
    1310             : 
    1311           0 : SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
                         // Returns element 0 of \p Op reinterpreted as v2i32, i.e. the same value
                         // split64BitValue returns as "Lo".
    1312             :   SDLoc SL(Op);
    1313             : 
    1314           0 :   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
    1315           0 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    1316           0 :   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
    1317             : }
    1318             : 
    1319         132 : SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
                         // Returns element 1 of \p Op reinterpreted as v2i32, i.e. the same value
                         // split64BitValue returns as "Hi".
    1320             :   SDLoc SL(Op);
    1321             : 
    1322         132 :   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
    1323         132 :   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    1324         264 :   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
    1325             : }
    1326             : 
    1327        2076 : SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
    1328             :                                               SelectionDAG &DAG) const {
                         // Replaces a vector load with two extending loads of the low and high
                         // halves. Returns the merged pair {CONCAT_VECTORS of the two loaded
                         // halves, TokenFactor of result #1 of both loads}. Two-element vectors
                         // are scalarized instead.
    1329             :   LoadSDNode *Load = cast<LoadSDNode>(Op);
    1330        4152 :   EVT VT = Op.getValueType();
    1331             : 
    1332             : 
    1333             :   // If this is a 2 element vector, we really want to scalarize and not create
    1334             :   // weird 1 element vectors.
    1335        2076 :   if (VT.getVectorNumElements() == 2)
    1336           0 :     return scalarizeVectorLoad(Load, DAG);
    1337             : 
    1338        2076 :   SDValue BasePtr = Load->getBasePtr();
    1339        2076 :   EVT MemVT = Load->getMemoryVT();
    1340             :   SDLoc SL(Op);
    1341             : 
    1342        2076 :   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
    1343             : 
    1344        2076 :   EVT LoVT, HiVT;
    1345        2076 :   EVT LoMemVT, HiMemVT;
    1346             :   SDValue Lo, Hi;
    1347             : 
                         // Split both the result type and the in-memory type into halves.
    1348        4152 :   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
    1349        4152 :   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
    1350        4152 :   std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
    1351             : 
                         // The high half starts Size bytes (the store size of the low memory type)
                         // past the base pointer, so its alignment is MinAlign of the base
                         // alignment and that offset.
    1352             :   unsigned Size = LoMemVT.getStoreSize();
    1353        2076 :   unsigned BaseAlign = Load->getAlignment();
    1354        4152 :   unsigned HiAlign = MinAlign(BaseAlign, Size);
    1355             : 
                         // Both half loads reuse the original extension type, chain and memory
                         // operand flags.
    1356             :   SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
    1357             :                                   Load->getChain(), BasePtr, SrcValue, LoMemVT,
    1358        4152 :                                   BaseAlign, Load->getMemOperand()->getFlags());
    1359        2076 :   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
    1360             :   SDValue HiLoad =
    1361             :       DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
    1362             :                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
    1363        6228 :                      HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
    1364             : 
    1365             :   SDValue Ops[] = {
    1366        2076 :     DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
    1367             :     DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
    1368        4152 :                 LoLoad.getValue(1), HiLoad.getValue(1))
    1369        4152 :   };
    1370             : 
    1371        2076 :   return DAG.getMergeValues(Ops, SL);
    1372             : }
    1373             : 
    1374        6902 : SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
    1375             :                                                SelectionDAG &DAG) const {
                         // Replaces a vector store with two truncating stores of the low and high
                         // halves of the stored value, both on the original chain, and returns a
                         // TokenFactor joining them. Two-element vectors are scalarized instead.
    1376             :   StoreSDNode *Store = cast<StoreSDNode>(Op);
    1377        6902 :   SDValue Val = Store->getValue();
    1378       13804 :   EVT VT = Val.getValueType();
    1379             : 
    1380             :   // If this is a 2 element vector, we really want to scalarize and not create
    1381             :   // weird 1 element vectors.
    1382        6902 :   if (VT.getVectorNumElements() == 2)
    1383           0 :     return scalarizeVectorStore(Store, DAG);
    1384             : 
    1385        6902 :   EVT MemVT = Store->getMemoryVT();
    1386        6902 :   SDValue Chain = Store->getChain();
    1387        6902 :   SDValue BasePtr = Store->getBasePtr();
    1388             :   SDLoc SL(Op);
    1389             : 
    1390        6902 :   EVT LoVT, HiVT;
    1391        6902 :   EVT LoMemVT, HiMemVT;
    1392             :   SDValue Lo, Hi;
    1393             : 
                         // Split the value type, the in-memory type and the value itself in half.
    1394       13804 :   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
    1395       13804 :   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
    1396       13804 :   std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
    1397             : 
                         // The high half is written immediately after the low half's store size.
    1398        6902 :   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
    1399             : 
    1400        6902 :   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
    1401             :   unsigned BaseAlign = Store->getAlignment();
    1402             :   unsigned Size = LoMemVT.getStoreSize();
                         // Alignment of the high half follows from the base alignment and its
                         // byte offset.
    1403       13804 :   unsigned HiAlign = MinAlign(BaseAlign, Size);
    1404             : 
    1405             :   SDValue LoStore =
    1406             :       DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
    1407        6902 :                         Store->getMemOperand()->getFlags());
    1408             :   SDValue HiStore =
    1409             :       DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
    1410        6902 :                         HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
    1411             : 
    1412        6902 :   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
    1413             : }
    1414             : 
    1415             : // This is a shortcut for integer division because we have fast i32<->f32
    1416             : // conversions, and fast f32 reciprocal instructions. The fractional part of a
    1417             : // float is enough to accurately represent up to a 24-bit signed integer.
                       //
                       // \p Op supplies the dividend (operand 0) and divisor (operand 1); \p Sign
                       // selects signed or unsigned conversions and result truncation. Returns the
                       // merged pair {quotient, remainder}, or an empty SDValue when either
                       // operand is not known to have at least 9 sign bits.
    1418         410 : SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
    1419             :                                             bool Sign) const {
    1420             :   SDLoc DL(Op);
    1421         410 :   EVT VT = Op.getValueType();
    1422         410 :   SDValue LHS = Op.getOperand(0);
    1423         410 :   SDValue RHS = Op.getOperand(1);
    1424             :   MVT IntVT = MVT::i32;
    1425             :   MVT FltVT = MVT::f32;
    1426             : 
                         // Give up unless both operands have at least 9 known sign bits (for a
                         // 32-bit value that leaves at most 24 significant bits).
    1427         410 :   unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
    1428         410 :   if (LHSSignBits < 9)
    1429         308 :     return SDValue();
    1430             : 
    1431         102 :   unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
    1432         102 :   if (RHSSignBits < 9)
    1433          13 :     return SDValue();
    1434             : 
                         // DivBits is the width the results are truncated to at the end; signed
                         // division keeps one extra bit.
    1435          89 :   unsigned BitSize = VT.getSizeInBits();
    1436          89 :   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
    1437          89 :   unsigned DivBits = BitSize - SignBits;
    1438          89 :   if (Sign)
    1439          48 :     ++DivBits;
    1440             : 
    1441          89 :   ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
    1442          89 :   ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
    1443             : 
                         // jq is the correction added to the truncated quotient: 1 when unsigned,
                         // otherwise computed below from the xor of the operands.
    1444          89 :   SDValue jq = DAG.getConstant(1, DL, IntVT);
    1445             : 
    1446          89 :   if (Sign) {
    1447             :     // char|short jq = ia ^ ib;
    1448          48 :     jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
    1449             : 
    1450             :     // jq = jq >> (bitsize - 2)
    1451          48 :     jq = DAG.getNode(ISD::SRA, DL, VT, jq,
    1452          96 :                      DAG.getConstant(BitSize - 2, DL, VT));
    1453             : 
    1454             :     // jq = jq | 0x1
    1455          48 :     jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
    1456             :   }
    1457             : 
    1458             :   // int ia = (int)LHS;
    1459          89 :   SDValue ia = LHS;
    1460             : 
    1461             :   // int ib = (int)RHS;
    1462          89 :   SDValue ib = RHS;
    1463             : 
    1464             :   // float fa = (float)ia;
    1465          89 :   SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
    1466             : 
    1467             :   // float fb = (float)ib;
    1468          89 :   SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
    1469             : 
                         // float fq = fa * rcp(fb);
    1470             :   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
    1471         178 :                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
    1472             : 
    1473             :   // fq = trunc(fq);
    1474          89 :   fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
    1475             : 
    1476             :   // float fqneg = -fq;
    1477          89 :   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
    1478             : 
    1479             :   // float fr = mad(fqneg, fb, fa);
                         // AMDGPUISD::FMAD_FTZ is used when the subtarget reports FP32 denormals,
                         // plain ISD::FMAD otherwise.
    1480          89 :   unsigned OpCode = Subtarget->hasFP32Denormals() ?
    1481             :                     (unsigned)AMDGPUISD::FMAD_FTZ :
    1482             :                     (unsigned)ISD::FMAD;
    1483          89 :   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
    1484             : 
    1485             :   // int iq = (int)fq;
    1486          89 :   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
    1487             : 
    1488             :   // fr = fabs(fr);
    1489          89 :   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
    1490             : 
    1491             :   // fb = fabs(fb);
    1492          89 :   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
    1493             : 
    1494         178 :   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
    1495             : 
    1496             :   // int cv = fr >= fb;
    1497          89 :   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
    1498             : 
    1499             :   // jq = (cv ? jq : 0);
    1500          89 :   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
    1501             : 
    1502             :   // dst = iq + jq;
    1503          89 :   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
    1504             : 
    1505             :   // Rem needs compensation, it's easier to recompute it
    1506          89 :   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
    1507          89 :   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
    1508             : 
    1509             :   // Truncate to number of bits this divide really is.
                         // Signed results are sign-extended in-register from DivBits; unsigned
                         // results are masked to the low DivBits bits.
    1510          89 :   if (Sign) {
    1511             :     SDValue InRegSize
    1512          48 :       = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
    1513          48 :     Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
    1514          48 :     Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
    1515             :   } else {
    1516          41 :     SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
    1517          41 :     Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
    1518          41 :     Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
    1519             :   }
    1520             : 
    1521         178 :   return DAG.getMergeValues({ Div, Rem }, DL);
    1522             : }
    1523             : 
    1524          72 : void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
    1525             :                                       SelectionDAG &DAG,
    1526             :                                       SmallVectorImpl<SDValue> &Results) const {
    1527             :   SDLoc DL(Op);
    1528          72 :   EVT VT = Op.getValueType();
    1529             : 
    1530             :   assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
    1531             : 
    1532          72 :   EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
    1533             : 
    1534          72 :   SDValue One = DAG.getConstant(1, DL, HalfVT);
    1535          72 :   SDValue Zero = DAG.getConstant(0, DL, HalfVT);
    1536             : 
    1537             :   //HiLo split
    1538          72 :   SDValue LHS = Op.getOperand(0);
    1539          72 :   SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
    1540          72 :   SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
    1541             : 
    1542          72 :   SDValue RHS = Op.getOperand(1);
    1543          72 :   SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
    1544          72 :   SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
    1545             : 
    1546         160 :   if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
    1547         144 :       DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
    1548             : 
    1549             :     SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
    1550          16 :                               LHS_Lo, RHS_Lo);
    1551             : 
    1552          32 :     SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
    1553          32 :     SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
    1554             : 
    1555          32 :     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
    1556          16 :     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
    1557             :     return;
    1558             :   }
    1559             : 
    1560          56 :   if (isTypeLegal(MVT::i64)) {
    1561             :     // Compute denominator reciprocal.
    1562          38 :     unsigned FMAD = Subtarget->hasFP32Denormals() ?
    1563             :                     (unsigned)AMDGPUISD::FMAD_FTZ :
    1564             :                     (unsigned)ISD::FMAD;
    1565             : 
    1566          38 :     SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
    1567          38 :     SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
    1568             :     SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
    1569          38 :       DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
    1570         114 :       Cvt_Lo);
    1571          38 :     SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
    1572             :     SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
    1573         114 :       DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
    1574             :     SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
    1575         114 :       DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
    1576          38 :     SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
    1577             :     SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
    1578          38 :       DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
    1579         114 :       Mul1);
    1580          38 :     SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
    1581          38 :     SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
    1582             :     SDValue Rcp64 = DAG.getBitcast(VT,
    1583          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
    1584             : 
    1585          38 :     SDValue Zero64 = DAG.getConstant(0, DL, VT);
    1586          38 :     SDValue One64  = DAG.getConstant(1, DL, VT);
    1587          38 :     SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
    1588          38 :     SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
    1589             : 
    1590          38 :     SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
    1591          38 :     SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
    1592          38 :     SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
    1593             :     SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
    1594          38 :                                     Zero);
    1595             :     SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
    1596          38 :                                     One);
    1597             : 
    1598             :     SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
    1599          38 :                                   Mulhi1_Lo, Zero1);
    1600             :     SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
    1601          38 :                                   Mulhi1_Hi, Add1_Lo.getValue(1));
    1602          38 :     SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
    1603             :     SDValue Add1 = DAG.getBitcast(VT,
    1604          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
    1605             : 
    1606          38 :     SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
    1607          38 :     SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
    1608             :     SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
    1609          38 :                                     Zero);
    1610             :     SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
    1611          38 :                                     One);
    1612             : 
    1613             :     SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
    1614          38 :                                   Mulhi2_Lo, Zero1);
    1615             :     SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
    1616          38 :                                    Mulhi2_Hi, Add1_Lo.getValue(1));
    1617             :     SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
    1618          38 :                                   Zero, Add2_Lo.getValue(1));
    1619             :     SDValue Add2 = DAG.getBitcast(VT,
    1620          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
    1621          38 :     SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
    1622             : 
    1623          38 :     SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
    1624             : 
    1625          38 :     SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
    1626          38 :     SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
    1627             :     SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
    1628          38 :                                   Mul3_Lo, Zero1);
    1629             :     SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
    1630          38 :                                   Mul3_Hi, Sub1_Lo.getValue(1));
    1631          38 :     SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
    1632             :     SDValue Sub1 = DAG.getBitcast(VT,
    1633          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
    1634             : 
    1635          38 :     SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
    1636             :     SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
    1637          38 :                                  ISD::SETUGE);
    1638             :     SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
    1639          38 :                                  ISD::SETUGE);
    1640          38 :     SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
    1641             : 
    1642             :     // TODO: Here and below portions of the code can be enclosed into if/endif.
    1643             :     // Currently control flow is unconditional and we have 4 selects after
    1644             :     // potential endif to substitute PHIs.
    1645             : 
    1646             :     // if C3 != 0 ...
    1647             :     SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
    1648          38 :                                   RHS_Lo, Zero1);
    1649             :     SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
    1650          38 :                                   RHS_Hi, Sub1_Lo.getValue(1));
    1651             :     SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
    1652          38 :                                   Zero, Sub2_Lo.getValue(1));
    1653             :     SDValue Sub2 = DAG.getBitcast(VT,
    1654          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
    1655             : 
    1656          38 :     SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
    1657             : 
    1658             :     SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
    1659          38 :                                  ISD::SETUGE);
    1660             :     SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
    1661          38 :                                  ISD::SETUGE);
    1662          38 :     SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
    1663             : 
    1664             :     // if (C6 != 0)
    1665          38 :     SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
    1666             : 
    1667             :     SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
    1668          38 :                                   RHS_Lo, Zero1);
    1669             :     SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
    1670          38 :                                   RHS_Hi, Sub2_Lo.getValue(1));
    1671             :     SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
    1672          38 :                                   Zero, Sub3_Lo.getValue(1));
    1673             :     SDValue Sub3 = DAG.getBitcast(VT,
    1674          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
    1675             : 
    1676             :     // endif C6
    1677             :     // endif C3
    1678             : 
    1679          38 :     SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
    1680          38 :     SDValue Div  = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
    1681             : 
    1682          38 :     SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
    1683          38 :     SDValue Rem  = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
    1684             : 
    1685          38 :     Results.push_back(Div);
    1686          38 :     Results.push_back(Rem);
    1687             : 
    1688             :     return;
    1689             :   }
    1690             : 
    1691             :   // r600 expansion.
    1692             :   // Get Speculative values
    1693          18 :   SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
    1694          18 :   SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
    1695             : 
    1696          18 :   SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
    1697          36 :   SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
    1698          18 :   REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
    1699             : 
    1700          18 :   SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
    1701          18 :   SDValue DIV_Lo = Zero;
    1702             : 
    1703          18 :   const unsigned halfBitWidth = HalfVT.getSizeInBits();
    1704             : 
    1705        1170 :   for (unsigned i = 0; i < halfBitWidth; ++i) {
    1706         576 :     const unsigned bitPos = halfBitWidth - i - 1;
    1707         576 :     SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
    1708             :     // Get value of high bit
    1709         576 :     SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
    1710         576 :     HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
    1711         576 :     HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
    1712             : 
    1713             :     // Shift
    1714         576 :     REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
    1715             :     // Add LHS high bit
    1716         576 :     REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
    1717             : 
    1718         576 :     SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
    1719         576 :     SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
    1720             : 
    1721         576 :     DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
    1722             : 
    1723             :     // Update REM
    1724         576 :     SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
    1725         576 :     REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
    1726             :   }
    1727             : 
    1728          36 :   SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
    1729          18 :   DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
    1730          18 :   Results.push_back(DIV);
    1731          18 :   Results.push_back(REM);
    1732             : }
    1733             : 
                       // Lower an unsigned divide-with-remainder node. The returned value merges
                       // two results: the quotient first, the remainder second.
                       //
                       //  * i64 is delegated to LowerUDIVREM64.
                       //  * i32 first tries LowerDIVREM24 (called with `false` as its last
                       //    argument; LowerSDIVREM passes `true`) and uses that result if one is
                       //    produced.
                       //  * Otherwise the expansion below is emitted: the approximate reciprocal
                       //    from AMDGPUISD::URECIP is refined with an error estimate, a candidate
                       //    quotient/remainder pair is computed from it, and both are then
                       //    corrected by one step up or down.
                       //
                       // NOTE(review): the "2^32 / Den" comment below suggests the generic path is
                       // meant for 32-bit values, but nothing here checks VT before using it.
    1734         326 : SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
    1735             :                                            SelectionDAG &DAG) const {
    1736             :   SDLoc DL(Op);
    1737         326 :   EVT VT = Op.getValueType();
    1738             : 
    1739             :   if (VT == MVT::i64) {
    1740             :     SmallVector<SDValue, 2> Results;
    1741          50 :     LowerUDIVREM64(Op, DAG, Results);
    1742          50 :     return DAG.getMergeValues(Results, DL);
    1743             :   }
    1744             : 
    1745             :   if (VT == MVT::i32) {
    1746         276 :     if (SDValue Res = LowerDIVREM24(Op, DAG, false))
    1747          41 :       return Res;
    1748             :   }
    1749             : 
    1750         235 :   SDValue Num = Op.getOperand(0);
    1751         235 :   SDValue Den = Op.getOperand(1);
    1752             : 
    1753             :   // RCP =  URECIP(Den) = 2^32 / Den + e
    1754             :   // e is rounding error.
    1755         235 :   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
    1756             : 
    1757             :   // RCP_LO = mul(RCP, Den)
    1758         235 :   SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
    1759             : 
    1760             :   // RCP_HI = mulhu (RCP, Den)
    1761         235 :   SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
    1762             : 
    1763             :   // NEG_RCP_LO = -RCP_LO
    1764             :   SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
    1765         235 :                                                      RCP_LO);
    1766             : 
    1767             :   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
    1768             :   SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
    1769             :                                            NEG_RCP_LO, RCP_LO,
    1770         235 :                                            ISD::SETEQ);
    1771             :   // Calculate the rounding error from the URECIP instruction
    1772             :   // E = mulhu(ABS_RCP_LO, RCP)
    1773         235 :   SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
    1774             : 
    1775             :   // RCP_A_E = RCP + E
    1776         235 :   SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
    1777             : 
    1778             :   // RCP_S_E = RCP - E
    1779         235 :   SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
    1780             : 
    1781             :   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
    1782             :   SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
    1783             :                                      RCP_A_E, RCP_S_E,
    1784         235 :                                      ISD::SETEQ);
    1785             :   // Quotient = mulhu(Tmp0, Num)
    1786         235 :   SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
    1787             : 
    1788             :   // Num_S_Remainder = Quotient * Den
    1789         235 :   SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
    1790             : 
    1791             :   // Remainder = Num - Num_S_Remainder
                         // (this wraps around when Num < Num_S_Remainder; Remainder_GE_Zero below
                         // detects that case)
    1792         235 :   SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
    1793             : 
    1794             :   // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
    1795             :   SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
    1796             :                                                  DAG.getConstant(-1, DL, VT),
    1797             :                                                  DAG.getConstant(0, DL, VT),
    1798         235 :                                                  ISD::SETUGE);
    1799             :   // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
    1800             :   SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
    1801             :                                                   Num_S_Remainder,
    1802             :                                                   DAG.getConstant(-1, DL, VT),
    1803             :                                                   DAG.getConstant(0, DL, VT),
    1804         235 :                                                   ISD::SETUGE);
    1805             :   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
                         // Tmp1 is all-ones only when the subtraction did not wrap and the
                         // remainder is still at least Den, i.e. the quotient is one too small.
    1806             :   SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
    1807         235 :                                                Remainder_GE_Zero);
    1808             : 
    1809             :   // Calculate Division result:
    1810             : 
    1811             :   // Quotient_A_One = Quotient + 1
    1812             :   SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
    1813         235 :                                        DAG.getConstant(1, DL, VT));
    1814             : 
    1815             :   // Quotient_S_One = Quotient - 1
    1816             :   SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
    1817         235 :                                        DAG.getConstant(1, DL, VT));
    1818             : 
    1819             :   // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
    1820             :   SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
    1821         235 :                                      Quotient, Quotient_A_One, ISD::SETEQ);
    1822             : 
    1823             :   // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
    1824         235 :   Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
    1825         235 :                             Quotient_S_One, Div, ISD::SETEQ);
    1826             : 
    1827             :   // Calculate Rem result:
    1828             : 
    1829             :   // Remainder_S_Den = Remainder - Den
    1830         235 :   SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
    1831             : 
    1832             :   // Remainder_A_Den = Remainder + Den
    1833         235 :   SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
    1834             : 
    1835             :   // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
    1836             :   SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
    1837         235 :                                     Remainder, Remainder_S_Den, ISD::SETEQ);
    1838             : 
    1839             :   // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
    1840         235 :   Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
    1841         235 :                             Remainder_A_Den, Rem, ISD::SETEQ);
                         // Result order: quotient, then remainder.
    1842             :   SDValue Ops[2] = {
    1843             :     Div,
    1844             :     Rem
    1845         235 :   };
    1846         235 :   return DAG.getMergeValues(Ops, DL);
    1847             : }
    1848             : 
                       // Lower a signed divide-with-remainder node. The returned value merges two
                       // results: the quotient first, the remainder second.
                       //
                       //  * i32 first tries LowerDIVREM24 (called with `true` as its last
                       //    argument) and uses that result if one is produced.
                       //  * i64 with more than 32 known sign bits on both operands is done as an
                       //    SDIVREM on the low halves, with both results sign-extended back to VT.
                       //  * Otherwise the operands are made non-negative, divided with UDIVREM,
                       //    and the signs are re-applied to the two results.
    1849         170 : SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
    1850             :                                            SelectionDAG &DAG) const {
    1851             :   SDLoc DL(Op);
    1852         170 :   EVT VT = Op.getValueType();
    1853             : 
    1854         170 :   SDValue LHS = Op.getOperand(0);
    1855         170 :   SDValue RHS = Op.getOperand(1);
    1856             : 
    1857         170 :   SDValue Zero = DAG.getConstant(0, DL, VT);
    1858         170 :   SDValue NegOne = DAG.getConstant(-1, DL, VT);
    1859             : 
    1860             :   if (VT == MVT::i32) {
    1861         134 :     if (SDValue Res = LowerDIVREM24(Op, DAG, true))
    1862          48 :       return Res;
    1863             :   }
    1864             : 
                         // More than 32 sign bits in a 64-bit value means the upper half is just
                         // a copy of the low half's sign bit, so dividing the low halves suffices.
    1865          36 :   if (VT == MVT::i64 &&
    1866          48 :       DAG.ComputeNumSignBits(LHS) > 32 &&
    1867          12 :       DAG.ComputeNumSignBits(RHS) > 32) {
    1868          12 :     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
    1869             : 
    1870             :     //HiLo split
    1871          12 :     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
    1872          12 :     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
    1873             :     SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
    1874          12 :                                  LHS_Lo, RHS_Lo);
    1875             :     SDValue Res[2] = {
    1876          12 :       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
    1877          12 :       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
    1878          24 :     };
    1879          12 :     return DAG.getMergeValues(Res, DL);
    1880             :   }
    1881             : 
                         // Sign masks: all-ones (-1) for a negative operand, 0 otherwise.
    1882         110 :   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
    1883         110 :   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
                         // The quotient is negative exactly when the operand signs differ.
    1884         110 :   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
    1885         110 :   SDValue RSign = LHSign; // Remainder sign is the same as LHS
    1886             : 
                         // Absolute value via the mask: (x + sign) ^ sign. With sign == 0 this is
                         // x; with sign == -1 it is ~(x - 1) == -x.
    1887         110 :   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
    1888         110 :   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
    1889             : 
    1890         110 :   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
    1891         110 :   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
    1892             : 
                         // Div refers to result 0 (quotient) of the UDIVREM node; Rem to result 1.
    1893         110 :   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
    1894         110 :   SDValue Rem = Div.getValue(1);
    1895             : 
                         // Conditional negate via the mask: (x ^ sign) - sign.
    1896         110 :   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
    1897         110 :   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
    1898             : 
    1899         110 :   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
    1900         110 :   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
    1901             : 
    1902             :   SDValue Res[2] = {
    1903             :     Div,
    1904             :     Rem
    1905         110 :   };
    1906         110 :   return DAG.getMergeValues(Res, DL);
    1907             : }
    1908             : 
    1909             : // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
                       //
                       // Expands FREM into FDIV, FTRUNC, FMUL and FSUB nodes of the same value type
                       // as Op; operand 0 is x and operand 1 is y.
    1910          36 : SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
    1911             :   SDLoc SL(Op);
    1912          36 :   EVT VT = Op.getValueType();
    1913          36 :   SDValue X = Op.getOperand(0);
    1914          36 :   SDValue Y = Op.getOperand(1);
    1915             : 
    1916             :   // TODO: Should this propagate fast-math-flags?
    1917             : 
    1918          36 :   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
                         // Despite its name, Floor holds the truncated (FTRUNC) quotient.
    1919          36 :   SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
    1920          36 :   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
    1921             : 
    1922          72 :   return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
    1923             : }
    1924             : 
                       // Expand FCEIL in terms of FTRUNC: truncate, then add 1.0 when the source
                       // is positive and was not already integral. Every node is built as
                       // MVT::f64.
    1925          31 : SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
    1926             :   SDLoc SL(Op);
    1927          31 :   SDValue Src = Op.getOperand(0);
    1928             : 
    1929             :   // result = trunc(src)
    1930             :   // if (src > 0.0 && src != result)
    1931             :   //   result += 1.0
    1932             : 
    1933          31 :   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
    1934             : 
    1935          31 :   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
    1936          31 :   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
    1937             : 
    1938             :   EVT SetCCVT =
    1939          93 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
    1940             : 
                         // Despite its name, Lt0 is the "src > 0.0" test (ordered greater-than).
    1941          31 :   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
    1942          31 :   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
    1943          31 :   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
    1944             : 
                         // Select the addend (1.0 or 0.0) rather than selecting between results.
    1945          31 :   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
    1946             :   // TODO: Should this propagate fast-math-flags?
    1947          62 :   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
    1948             : }
    1949             : 
                       // Build an i32 value holding the unbiased exponent of an f64, given Hi. The
                       // callers in this file pass the upper 32-bit element of the f64 bitcast to
                       // v2i32. An ExpBits-wide (11) field is extracted from Hi with BFE_U32 using
                       // FractBits - 32 (20) as the other constant -- presumably the bit offset,
                       // confirm against the BFE_U32 operand order -- and 1023 is then subtracted
                       // to remove the exponent bias.
    1950          91 : static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
    1951             :                                   SelectionDAG &DAG) {
    1952             :   const unsigned FractBits = 52;
    1953             :   const unsigned ExpBits = 11;
    1954             : 
    1955             :   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
    1956             :                                 Hi,
    1957             :                                 DAG.getConstant(FractBits - 32, SL, MVT::i32),
    1958         273 :                                 DAG.getConstant(ExpBits, SL, MVT::i32));
    1959             :   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
    1960         182 :                             DAG.getConstant(1023, SL, MVT::i32));
    1961             : 
    1962          91 :   return Exp;
    1963             : }
    1964             : 
                       // Expand f64 FTRUNC using integer operations on the bit pattern: the
                       // fraction bits that lie below the binary point (as determined by the
                       // exponent) are cleared. Exponents below 0 yield a zero carrying the
                       // source's sign bit, and exponents above 51 return the source unchanged.
    1965          75 : SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
    1966             :   SDLoc SL(Op);
    1967          75 :   SDValue Src = Op.getOperand(0);
    1968             : 
    1969             :   assert(Op.getValueType() == MVT::f64);
    1970             : 
    1971          75 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    1972          75 :   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    1973             : 
    1974          75 :   SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
    1975             : 
    1976             :   // Extract the upper half, since this is where we will find the sign and
    1977             :   // exponent.
    1978          75 :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
    1979             : 
    1980          75 :   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
    1981             : 
    1982             :   const unsigned FractBits = 52;
    1983             : 
    1984             :   // Extract the sign bit.
    1985          75 :   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
    1986          75 :   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
    1987             : 
    1988             :   // Extend back to 64-bits.
    1989         150 :   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
    1990          75 :   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
    1991             : 
    1992          75 :   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
    1993             :   const SDValue FractMask
    1994          75 :     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
    1995             : 
                         // FractMask >> Exp covers the fraction bits below the binary point;
                         // clearing them from the bit pattern performs the truncation. FractMask
                         // has its sign bit clear, so the SRA shifts in zeros. The results for
                         // Exp < 0 and Exp > 51 are overridden by the selects below.
    1996          75 :   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
    1997          75 :   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
    1998          75 :   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
    1999             : 
    2000             :   EVT SetCCVT =
    2001         225 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
    2002             : 
    2003          75 :   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
    2004             : 
    2005          75 :   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
    2006          75 :   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
    2007             : 
                         // Exp < 0: keep only the sign bit. Exp > 51: keep the source bits as is.
    2008          75 :   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
    2009          75 :   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
    2010             : 
    2011         150 :   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
    2012             : }
    2013             : 
                       // Expand f64 FRINT by adding and then subtracting 2^52 carrying the sign of
                       // the source. When |Src| is greater than 0x1.fffffffffffffp+51 the source
                       // itself is selected instead of the add/subtract result.
    2014          14 : SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
    2015             :   SDLoc SL(Op);
    2016          14 :   SDValue Src = Op.getOperand(0);
    2017             : 
    2018             :   assert(Op.getValueType() == MVT::f64);
    2019             : 
                         // C1 = 2^52, given the sign of Src.
    2020          14 :   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
    2021          14 :   SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
    2022          14 :   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
    2023             : 
    2024             :   // TODO: Should this propagate fast-math-flags?
    2025             : 
                         // Tmp2 = (Src + CopySign) - CopySign
    2026          14 :   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
    2027          14 :   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
    2028             : 
    2029          14 :   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
    2030             : 
                         // C2 is the magnitude threshold above which Src is returned unchanged.
    2031          14 :   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
    2032          14 :   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
    2033             : 
    2034             :   EVT SetCCVT =
    2035          42 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
    2036          14 :   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
    2037             : 
    2038          28 :   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
    2039             : }
    2040             : 
                       // Lower FNEARBYINT by re-emitting it as an ISD::FRINT node with the same
                       // value type and the same single operand.
    2041          45 : SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
    2042             :   // FNEARBYINT and FRINT are the same, except in their handling of FP
    2043             :   // exceptions. Those aren't really meaningful for us, and OpenCL only has
    2044             :   // rint, so just treat them as equivalent.
    2045          90 :   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
    2046             : }
    2047             : 
    2048             : // XXX - May require not supporting f32 denormals?
    2049             : 
    2050             : // Don't handle v2f16. The extra instructions to scalarize and repack around the
    2051             : // compare and vselect end up producing worse code than scalarizing the whole
    2052             : // operation.
                       //
                       // Expands FROUND for the value type of Op as:
                       //   T = trunc(X)
                       //   result = T + (|X - T| >= 0.5 ? copysign(1.0, X) : 0.0)
    2053          74 : SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
    2054             :   SDLoc SL(Op);
    2055          74 :   SDValue X = Op.getOperand(0);
    2056          74 :   EVT VT = Op.getValueType();
    2057             : 
    2058          74 :   SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
    2059             : 
    2060             :   // TODO: Should this propagate fast-math-flags?
    2061             : 
                         // Diff is the fractional part that the truncation discarded.
    2062          74 :   SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
    2063             : 
    2064          74 :   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
    2065             : 
    2066          74 :   const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
    2067          74 :   const SDValue One = DAG.getConstantFP(1.0, SL, VT);
    2068          74 :   const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
    2069             : 
                         // 1.0 with the sign of X, so the adjustment moves away from zero.
    2070          74 :   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
    2071             : 
    2072             :   EVT SetCCVT =
    2073         148 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
    2074             : 
    2075          74 :   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
    2076             : 
    2077          74 :   SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
    2078             : 
    2079         148 :   return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
    2080             : }
    2081             : 
                       // Lowers an f64 FROUND using integer operations on the bit pattern of the
                       // operand. Three cases are merged by the selects at the end:
                       //   Exp > 51 : the operand is returned unchanged.
                       //   Exp < 0  : the result is copysign(Exp == -1 ? 1.0 : 0.0, X).
                       //   otherwise: a shifted "half" bit is added when fractional bits are set,
                       //              and the fractional bits are then cleared.
                       // Exp comes from extractF64Exponent, which is defined outside this section;
                       // presumably it is the unbiased exponent of X -- TODO confirm.
    2082          16 : SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
    2083             :   SDLoc SL(Op);
    2084          16 :   SDValue X = Op.getOperand(0);
    2085             : 
                         // Raw 64-bit pattern of X.
    2086          16 :   SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
    2087             : 
    2088          16 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    2089          16 :   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    2090          16 :   const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
    2091          16 :   const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
    2092             :   EVT SetCCVT =
    2093          48 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
    2094             : 
    2095          16 :   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    2096             : 
                         // Element 1 of the v2i32 view is used as the high word of X.
    2097          16 :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
    2098             : 
    2099          16 :   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
    2100             : 
                         // 52 low bits set: the width of the f64 fraction field.
    2101             :   const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
    2102          16 :                                        MVT::i64);
    2103             : 
                         // M: the fraction mask shifted right by Exp; used below to test for and
                         // clear the bits treated as fractional.
                         // D: bit 51 (the top fraction bit) shifted right by Exp; used as the
                         // rounding increment.
    2104          16 :   SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
    2105             :   SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
    2106             :                           DAG.getConstant(INT64_C(0x0008000000000000), SL,
    2107             :                                           MVT::i64),
    2108          32 :                           Exp);
    2109             : 
                         // Tmp1 is true when any bit selected by M is set in X.
    2110          16 :   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
    2111             :   SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
    2112             :                               DAG.getConstant(0, SL, MVT::i64), Tmp0,
    2113          16 :                               ISD::SETNE);
    2114             : 
                         // Add D only when such bits are present, then drop the bits under M.
    2115             :   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
    2116          32 :                              D, DAG.getConstant(0, SL, MVT::i64));
    2117          16 :   SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
    2118             : 
    2119          32 :   K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
    2120          16 :   K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
    2121             : 
    2122          16 :   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
    2123          16 :   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
    2124          16 :   SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
    2125             : 
                         // Magnitude used when Exp < 0: 1.0 if Exp == -1, otherwise 0.0.
    2126             :   SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
    2127             :                             ExpEqNegOne,
    2128             :                             DAG.getConstantFP(1.0, SL, MVT::f64),
    2129          48 :                             DAG.getConstantFP(0.0, SL, MVT::f64));
    2130             : 
    2131          16 :   SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
    2132             : 
                         // The ExpGt51 select is applied last, so it overrides the other cases.
    2133          16 :   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
    2134          16 :   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
    2135             : 
    2136          32 :   return K;
    2137             : }
    2138             : 
                       // Dispatches FROUND lowering on the result type: f32 and f16 go to
                       // LowerFROUND32_16, f64 goes to LowerFROUND64. Any other type reaches
                       // llvm_unreachable.
    2139          90 : SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
    2140          90 :   EVT VT = Op.getValueType();
    2141             : 
    2142             :   if (VT == MVT::f32 || VT == MVT::f16)
    2143          74 :     return LowerFROUND32_16(Op, DAG);
    2144             : 
    2145             :   if (VT == MVT::f64)
    2146          16 :     return LowerFROUND64(Op, DAG);
    2147             : 
    2148           0 :   llvm_unreachable("unhandled type");
    2149             : }
    2150             : 
                       // Lowers FFLOOR as FTRUNC plus a conditional -1.0 correction. Every node
                       // built here is hard-coded to MVT::f64.
    2151           0 : SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
    2152             :   SDLoc SL(Op);
    2153           0 :   SDValue Src = Op.getOperand(0);
    2154             : 
    2155             :   // result = trunc(src);
    2156             :   // if (src < 0.0 && src != result)
    2157             :   //   result += -1.0.
    2158             : 
    2159           0 :   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
    2160             : 
    2161           0 :   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
    2162           0 :   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
    2163             : 
    2164             :   EVT SetCCVT =
    2165           0 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
    2166             : 
                         // Both compares are ordered, so a NaN source makes the condition false
                         // and the correction term 0.0.
    2167           0 :   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
    2168           0 :   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
    2169           0 :   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
    2170             : 
    2171           0 :   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
    2172             :   // TODO: Should this propagate fast-math-flags?
    2173           0 :   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
    2174             : }
    2175             : 
                       // Lowers a logarithm as FLOG2(operand) * Log2BaseInverted, both in the
                       // result type of Op. Log2BaseInverted is supplied by the caller as a
                       // double and materialised as an FP constant of that type.
    2176          74 : SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
    2177             :                                         double Log2BaseInverted) const {
    2178          74 :   EVT VT = Op.getValueType();
    2179             : 
    2180             :   SDLoc SL(Op);
    2181          74 :   SDValue Operand = Op.getOperand(0);
    2182          74 :   SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
    2183          74 :   SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
    2184             : 
    2185         148 :   return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
    2186             : }
    2187             : 
                       // Returns true when Opc is ISD::CTLZ or ISD::CTLZ_ZERO_UNDEF.
    2188             : static bool isCtlzOpc(unsigned Opc) {
    2189        3348 :   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
    2190             : }
    2191             : 
                       // Returns true when Opc is ISD::CTTZ or ISD::CTTZ_ZERO_UNDEF.
    2192             : static bool isCttzOpc(unsigned Opc) {
    2193        4201 :   return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
    2194             : }
    2195             : 
                       // Lowers CTLZ / CTTZ and their *_ZERO_UNDEF variants.
                       //  - A zero-undef operation on an i32 source becomes a single FFBH_U32
                       //    (ctlz) or FFBL_B32 (cttz) node.
                       //  - Every other case is treated as a 64-bit source: it is split into two
                       //    i32 halves, each half is counted with the zero-undef opcode, and the
                       //    two counts are combined with a select. The result is zero-extended
                       //    to i64.
                       // NOTE(review): the second path bitcasts Src to v2i32, so it presumably is
                       // only reached with an i64 source -- confirm against the operation actions.
    2196         415 : SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
    2197             :   SDLoc SL(Op);
    2198         415 :   SDValue Src = Op.getOperand(0);
    2199         415 :   bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
    2200             :                    Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
    2201             : 
                         // ISDOpc is the generic zero-undef opcode applied to each 32-bit half;
                         // NewOpc is the target node used for the direct i32 case.
    2202             :   unsigned ISDOpc, NewOpc;
    2203         415 :   if (isCtlzOpc(Op.getOpcode())) {
    2204             :     ISDOpc = ISD::CTLZ_ZERO_UNDEF;
    2205             :     NewOpc = AMDGPUISD::FFBH_U32;
    2206          73 :   } else if (isCttzOpc(Op.getOpcode())) {
    2207             :     ISDOpc = ISD::CTTZ_ZERO_UNDEF;
    2208             :     NewOpc = AMDGPUISD::FFBL_B32;
    2209             :   } else
    2210           0 :     llvm_unreachable("Unexpected OPCode!!!");
    2211             : 
    2212             : 
    2213         415 :   if (ZeroUndef && Src.getValueType() == MVT::i32)
    2214         329 :     return DAG.getNode(NewOpc, SL, MVT::i32, Src);
    2215             : 
    2216          86 :   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
    2217             : 
    2218          86 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    2219          86 :   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    2220             : 
    2221          86 :   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
    2222          86 :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
    2223             : 
    2224             :   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
    2225         258 :                                    *DAG.getContext(), MVT::i32);
    2226             : 
                         // The half that is scanned first: Hi for ctlz, Lo for cttz. When it is
                         // zero the count continues into the other half.
    2227         172 :   SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
    2228          86 :   SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
    2229             : 
    2230          86 :   SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
    2231          86 :   SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
    2232             : 
    2233          86 :   const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
    2234          86 :   SDValue Add, NewOpr;
    2235          86 :   if (isCtlzOpc(Op.getOpcode())) {
    2236          82 :     Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
    2237             :     // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
    2238          82 :     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
    2239             :   } else {
    2240           4 :     Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
    2241             :     // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
    2242           4 :     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
    2243             :   }
    2244             : 
    2245          86 :   if (!ZeroUndef) {
    2246             :     // Test if the full 64-bit input is zero.
    2247             : 
    2248             :     // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
    2249             :     // which we probably don't want.
    2250           8 :     SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
    2251           8 :     SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
    2252           8 :     SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
    2253             : 
    2254             :     // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
    2255             :     // with the same cycles, otherwise it is slower.
    2256             :     // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
    2257             :     // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
    2258             : 
                           // NOTE(review): this local is named Bits32 but holds the constant 64,
                           // and it shadows the outer Bits32 (32) declared above.
    2259           8 :     const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
    2260             : 
    2261             :     // The instruction returns -1 for 0 input, but the defined intrinsic
    2262             :     // behavior is to return the number of bits.
    2263           8 :     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
    2264           8 :                          SrcIsZero, Bits32, NewOpr);
    2265             :   }
    2266             : 
    2267          86 :   return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
    2268             : }
    2269             : 
                       // Lowers an i64 -> f32 integer-to-float conversion by assembling the f32
                       // bit pattern with integer operations, following the cul2f / cl2f
                       // pseudo-code below. Signed selects the signed variant: the magnitude is
                       // converted and the result is negated when the input was negative.
    2270          67 : SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
    2271             :                                                bool Signed) const {
    2272             :   // Unsigned
    2273             :   // cul2f(ulong u)
    2274             :   //{
    2275             :   //  uint lz = clz(u);
    2276             :   //  uint e = (u != 0) ? 127U + 63U - lz : 0;
    2277             :   //  u = (u << lz) & 0x7fffffffffffffffUL;
    2278             :   //  ulong t = u & 0xffffffffffUL;
    2279             :   //  uint v = (e << 23) | (uint)(u >> 40);
    2280             :   //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
    2281             :   //  return as_float(v + r);
    2282             :   //}
    2283             :   // Signed
    2284             :   // cl2f(long l)
    2285             :   //{
    2286             :   //  long s = l >> 63;
    2287             :   //  float r = cul2f((l + s) ^ s);
    2288             :   //  return s ? -r : r;
    2289             :   //}
    2290             : 
    2291             :   SDLoc SL(Op);
    2292          67 :   SDValue Src = Op.getOperand(0);
    2293          67 :   SDValue L = Src;
    2294             : 
    2295          67 :   SDValue S;
    2296          67 :   if (Signed) {
                           // S is the sign word (arithmetic shift by 63); (L + S) ^ S is the
                           // absolute-value step from cl2f above.
    2297          32 :     const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
    2298          32 :     S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
    2299             : 
    2300          32 :     SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
    2301          32 :     L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
    2302             :   }
    2303             : 
                         // NOTE(review): SetCCVT is derived from MVT::f32, but every setcc that
                         // uses it below compares i64 operands -- confirm this is intended.
    2304             :   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
    2305         201 :                                    *DAG.getContext(), MVT::f32);
    2306             : 
    2307             : 
    2308          67 :   SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
    2309          67 :   SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
                         // NOTE(review): the zero-undef variant is used although L may be zero.
                         // E is guarded by the L != 0 select below, but U is still computed from
                         // LZ -- confirm a zero input cannot produce a wrong result.
    2310          67 :   SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
    2311          67 :   LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
    2312             : 
                         // e = (u != 0) ? 127 + 63 - lz : 0
    2313          67 :   SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
    2314             :   SDValue E = DAG.getSelect(SL, MVT::i32,
    2315             :     DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
    2316             :     DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
    2317         134 :     ZeroI32);
    2318             : 
                         // u = (u << lz) & 0x7fff...f : normalise, then clear bit 63.
    2319             :   SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
    2320             :     DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
    2321         201 :     DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
    2322             : 
                         // t = low 40 bits of u: the bits shifted out of v, used for rounding.
    2323             :   SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
    2324         134 :                           DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
    2325             : 
                         // Despite the name, UShl is a logical right shift of U by 40.
    2326             :   SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
    2327         134 :                              U, DAG.getConstant(40, SL, MVT::i64));
    2328             : 
                         // v = (e << 23) | (uint)(u >> 40)
    2329             :   SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
    2330             :     DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
    2331         268 :     DAG.getNode(ISD::TRUNCATE, SL, MVT::i32,  UShl));
    2332             : 
                         // Rounding increment r: 1 when t is above the halfway value
                         // 0x8000000000, (v & 1) when t equals it, otherwise 0.
    2333          67 :   SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
    2334          67 :   SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
    2335          67 :   SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
    2336             : 
    2337          67 :   SDValue One = DAG.getConstant(1, SL, MVT::i32);
    2338             : 
    2339          67 :   SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
    2340             : 
    2341             :   SDValue R = DAG.getSelect(SL, MVT::i32,
    2342             :     RCmp,
    2343             :     One,
    2344         134 :     DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
    2345          67 :   R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
    2346          67 :   R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
    2347             : 
    2348          67 :   if (!Signed)
    2349          35 :     return R;
    2350             : 
                         // Signed case: pick -R when S (converted to SetCCVT) is set, else R.
    2351          32 :   SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
    2352          64 :   return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
    2353             : }
    2354             : 
                       // Lowers a 64-bit integer -> f64 conversion from two 32-bit conversions:
                       //   result = ldexp(cvt(Hi), 32) + uint_to_fp(Lo)
                       // The high half is converted as signed or unsigned according to Signed;
                       // the low half is always converted as unsigned.
    2355          10 : SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
    2356             :                                                bool Signed) const {
    2357             :   SDLoc SL(Op);
    2358          10 :   SDValue Src = Op.getOperand(0);
    2359             : 
    2360          10 :   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
    2361             : 
    2362             :   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
    2363          20 :                            DAG.getConstant(0, SL, MVT::i32));
    2364             :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
    2365          20 :                            DAG.getConstant(1, SL, MVT::i32));
    2366             : 
    2367             :   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
    2368          10 :                               SL, MVT::f64, Hi);
    2369             : 
    2370          10 :   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
    2371             : 
                         // LDEXP with a second operand of 32 applied to the converted high half.
    2372             :   SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
    2373          20 :                               DAG.getConstant(32, SL, MVT::i32));
    2374             :   // TODO: Should this propagate fast-math-flags?
    2375          20 :   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
    2376             : }
    2377             : 
                       // Lowers UINT_TO_FP from an i64 source (asserted). Destination handling:
                       //   f16, when the subtarget has 16-bit instructions: convert to f32 with
                       //        the same opcode, then FP_ROUND to f16.
                       //   f32: LowerINT_TO_FP32 with Signed = false.
                       //   otherwise: asserted to be f64; LowerINT_TO_FP64 with Signed = false.
    2378          51 : SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
    2379             :                                                SelectionDAG &DAG) const {
    2380             :   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
    2381             :          "operation should be legal");
    2382             : 
    2383             :   // TODO: Factor out code common with LowerSINT_TO_FP.
    2384             : 
    2385             :   EVT DestVT = Op.getValueType();
    2386          51 :   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
    2387             :     SDLoc DL(Op);
    2388           8 :     SDValue Src = Op.getOperand(0);
    2389             : 
    2390           8 :     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
                           // NOTE(review): a fresh SDLoc(Op) is built here although DL above is
                           // constructed from the same Op.
    2391          16 :     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
    2392             :     SDValue FPRound =
    2393           8 :         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
    2394             : 
    2395           8 :     return FPRound;
    2396             :   }
    2397             : 
    2398             :   if (DestVT == MVT::f32)
    2399          35 :     return LowerINT_TO_FP32(Op, DAG, false);
    2400             : 
    2401             :   assert(DestVT == MVT::f64);
    2402           8 :   return LowerINT_TO_FP64(Op, DAG, false);
    2403             : }
    2404             : 
                       // Lowers SINT_TO_FP from an i64 source (asserted). Destination handling:
                       //   f16, when the subtarget has 16-bit instructions: convert to f32 with
                       //        the same opcode, then FP_ROUND to f16.
                       //   f32: LowerINT_TO_FP32 with Signed = true.
                       //   otherwise: asserted to be f64; LowerINT_TO_FP64 with Signed = true.
    2405          42 : SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
    2406             :                                               SelectionDAG &DAG) const {
    2407             :   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
    2408             :          "operation should be legal");
    2409             : 
    2410             :   // TODO: Factor out code common with LowerUINT_TO_FP.
    2411             : 
    2412             :   EVT DestVT = Op.getValueType();
    2413          42 :   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
    2414             :     SDLoc DL(Op);
    2415           8 :     SDValue Src = Op.getOperand(0);
    2416             : 
    2417           8 :     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
                           // NOTE(review): a fresh SDLoc(Op) is built here although DL above is
                           // constructed from the same Op.
    2418          16 :     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
    2419             :     SDValue FPRound =
    2420           8 :         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
    2421             : 
    2422           8 :     return FPRound;
    2423             :   }
    2424             : 
    2425             :   if (DestVT == MVT::f32)
    2426          32 :     return LowerINT_TO_FP32(Op, DAG, true);
    2427             : 
    2428             :   assert(DestVT == MVT::f64);
    2429           2 :   return LowerINT_TO_FP64(Op, DAG, true);
    2430             : }
    2431             : 
                       // Lowers an f64 -> i64 conversion by producing the two 32-bit halves
                       // separately:
                       //   Trunc    = ftrunc(Src)
                       //   FloorMul = floor(Trunc * K0)        -- converted to the high word
                       //   Fma      = FloorMul * K1 + Trunc    -- converted to the low word
                       // The high word is converted as signed or unsigned according to Signed;
                       // the low word is always converted as unsigned. The halves are packed as
                       // a v2i32 {Lo, Hi} and bitcast to i64.
    2432          16 : SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
    2433             :                                                bool Signed) const {
    2434             :   SDLoc SL(Op);
    2435             : 
    2436          16 :   SDValue Src = Op.getOperand(0);
    2437             : 
    2438          16 :   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
    2439             : 
                         // As IEEE-754 doubles, these bit patterns are K0 = 2^-32 and K1 = -(2^32).
    2440             :   SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
    2441          16 :                                  MVT::f64);
    2442             :   SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
    2443          16 :                                  MVT::f64);
    2444             :   // TODO: Should this propagate fast-math-flags?
    2445          16 :   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
    2446             : 
    2447          16 :   SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
    2448             : 
    2449             : 
                         // Remainder of Trunc after removing FloorMul * 2^32.
    2450          16 :   SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
    2451             : 
    2452             :   SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
    2453          16 :                            MVT::i32, FloorMul);
    2454          16 :   SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
    2455             : 
    2456          32 :   SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
    2457             : 
    2458          32 :   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
    2459             : }
    2460             : 
                       // Lowers FP_TO_FP16.
                       //  - f32 source: emits the AMDGPUISD::FP_TO_FP16 target node.
                       //  - other source with UnsafeFPMath set: returns an empty SDValue so the
                       //    generic expansion is used.
                       //  - otherwise the source is asserted to be f64, and the half-precision
                       //    bit pattern is assembled with i32 integer operations, then
                       //    zero-extended or truncated to the result type of Op.
    2461         716 : SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
    2462             :   SDLoc DL(Op);
    2463         716 :   SDValue N0 = Op.getOperand(0);
    2464             : 
    2465             :   // Convert to target node to get known bits
    2466             :   if (N0.getValueType() == MVT::f32)
    2467         667 :     return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
    2468             : 
    2469          49 :   if (getTargetMachine().Options.UnsafeFPMath) {
    2470             :     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
    2471          10 :     return SDValue();
    2472             :   }
    2473             : 
    2474             :   assert(N0.getSimpleValueType() == MVT::f64);
    2475             : 
    2476             :   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
    2477             :   const unsigned ExpMask = 0x7ff;
    2478             :   const unsigned ExpBiasf64 = 1023;
    2479             :   const unsigned ExpBiasf16 = 15;
    2480          39 :   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
    2481          39 :   SDValue One = DAG.getConstant(1, DL, MVT::i32);
                         // Split the f64 bit pattern: UH is the upper 32 bits; U is then
                         // narrowed to the lower 32 bits.
    2482          39 :   SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
    2483             :   SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
    2484          78 :                            DAG.getConstant(32, DL, MVT::i64));
    2485          39 :   UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
    2486          39 :   U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
                         // NOTE(review): the shift amount below is typed MVT::i64 although the
                         // shifted value is i32; every other shift here uses an i32 amount.
    2487             :   SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
    2488          78 :                           DAG.getConstant(20, DL, MVT::i64));
    2489          39 :   E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
    2490          78 :                   DAG.getConstant(ExpMask, DL, MVT::i32));
    2491             :   // Subtract the fp64 exponent bias (1023) to get the real exponent and
    2492             :   // add the f16 bias (15) to get the biased exponent for the f16 format.
                         // The biases are `unsigned`, so -ExpBiasf64 + ExpBiasf16 is evaluated
                         // with unsigned wraparound before being passed as the constant.
    2493          39 :   E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
    2494          78 :                   DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
    2495             : 
                         // M = (UH >> 8) & 0xffe: bits 9..19 of UH moved to bits 1..11, with
                         // bit 0 left clear for the flag OR'ed in below.
    2496             :   SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
    2497          78 :                           DAG.getConstant(8, DL, MVT::i32));
    2498          39 :   M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
    2499          78 :                   DAG.getConstant(0xffe, DL, MVT::i32));
    2500             : 
                         // MaskedSig collects the bits not kept in M: the low 9 bits of UH OR'ed
                         // with all of U.
    2501             :   SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
    2502          78 :                                   DAG.getConstant(0x1ff, DL, MVT::i32));
    2503          39 :   MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
    2504             : 
                         // Lo40Set is 1 when any of those bits is set, else 0. Despite the name
                         // it covers 9 + 32 = 41 bits.
    2505          39 :   SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
    2506          39 :   M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
    2507             : 
    2508             :   // (M != 0 ? 0x0200 : 0) | 0x7c00;
    2509             :   SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
    2510             :       DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
    2511         117 :                       Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
    2512             : 
    2513             :   // N = M | (E << 12);
    2514             :   SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
    2515             :       DAG.getNode(ISD::SHL, DL, MVT::i32, E,
    2516         117 :                   DAG.getConstant(12, DL, MVT::i32)));
    2517             : 
    2518             :   // B = clamp(1-E, 0, 13);
    2519             :   SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
    2520          39 :                                   One, E);
    2521          39 :   SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
    2522          39 :   B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
    2523          78 :                   DAG.getConstant(13, DL, MVT::i32));
    2524             : 
                         // D is the alternative to N used when E < 1: (M | 0x1000) shifted right
                         // by B, with bit 0 OR'ed in when shifting back left does not reproduce
                         // the value (i.e. non-zero bits were shifted out).
    2525             :   SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
    2526          78 :                                    DAG.getConstant(0x1000, DL, MVT::i32));
    2527             : 
    2528          39 :   SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
    2529          39 :   SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
    2530          39 :   SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
    2531          39 :   D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
    2532             : 
                         // V = (E < 1) ? D : N. The low three bits drive the rounding: after
                         // dropping two bits, V is incremented when those three bits equal 3 or
                         // are greater than 5.
    2533          39 :   SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
    2534             :   SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
    2535          78 :                               DAG.getConstant(0x7, DL, MVT::i32));
    2536          39 :   V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
    2537          78 :                   DAG.getConstant(2, DL, MVT::i32));
    2538             :   SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
    2539          39 :                                One, Zero, ISD::SETEQ);
    2540             :   SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
    2541          39 :                                One, Zero, ISD::SETGT);
    2542          39 :   V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
    2543          39 :   V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
    2544             : 
                         // E > 30 replaces the result with 0x7c00. E == 1039 (= 0x7ff - 1023 + 15,
                         // an all-ones f64 exponent field) replaces it with I computed above.
    2545          39 :   V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
    2546         117 :                       DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
    2547          39 :   V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
    2548          78 :                       I, V, ISD::SETEQ);
    2549             : 
    2550             :   // Extract the sign bit.
                         // Bit 31 of UH lands on bit 15 after the shift and is isolated by 0x8000.
    2551             :   SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
    2552          78 :                             DAG.getConstant(16, DL, MVT::i32));
    2553          39 :   Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
    2554          78 :                      DAG.getConstant(0x8000, DL, MVT::i32));
    2555             : 
    2556          39 :   V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
    2557          39 :   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
    2558             : }
    2559             : 
                       // Lowers FP_TO_SINT.
                       //  - f16 source on a subtarget with 16-bit instructions: FP_EXTEND to f32,
                       //    then re-emit the same opcode on the f32 value.
                       //  - f64 source with i64 result: LowerFP64_TO_INT with Signed = true.
                       //  - anything else: returns an empty SDValue.
    2560          65 : SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
    2561             :                                               SelectionDAG &DAG) const {
    2562          65 :   SDValue Src = Op.getOperand(0);
    2563             : 
    2564             :   // TODO: Factor out code common with LowerFP_TO_UINT.
    2565             : 
    2566             :   EVT SrcVT = Src.getValueType();
    2567          65 :   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
    2568             :     SDLoc DL(Op);
    2569             : 
    2570           3 :     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
                           // NOTE(review): the local is named FpToInt32, but the node is created
                           // with result type MVT::i64 and without consulting Op.getValueType()
                           // -- presumably this path is only reached for i64 results; confirm.
    2571             :     SDValue FpToInt32 =
    2572           3 :         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
    2573             : 
    2574           3 :     return FpToInt32;
    2575             :   }
    2576             : 
    2577             :   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
    2578           2 :     return LowerFP64_TO_INT(Op, DAG, true);
    2579             : 
    2580          60 :   return SDValue();
    2581             : }
    2582             : 
                       // Lowers FP_TO_UINT.
                       //  - f16 source on a subtarget with 16-bit instructions: FP_EXTEND to f32,
                       //    then re-emit the same opcode on the f32 value.
                       //  - f64 source with i64 result: LowerFP64_TO_INT with Signed = false.
                       //  - anything else: returns an empty SDValue.
    2583          37 : SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
    2584             :                                               SelectionDAG &DAG) const {
    2585          37 :   SDValue Src = Op.getOperand(0);
    2586             : 
    2587             :   // TODO: Factor out code common with LowerFP_TO_SINT.
    2588             : 
    2589             :   EVT SrcVT = Src.getValueType();
    2590          37 :   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
    2591             :     SDLoc DL(Op);
    2592             : 
    2593           3 :     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
                           // NOTE(review): the local is named FpToInt32, but the node is created
                           // with result type MVT::i64 and without consulting Op.getValueType()
                           // -- presumably this path is only reached for i64 results; confirm.
    2594             :     SDValue FpToInt32 =
    2595           3 :         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
    2596             : 
    2597           3 :     return FpToInt32;
    2598             :   }
    2599             : 
    2600             :   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
    2601          14 :     return LowerFP64_TO_INT(Op, DAG, false);
    2602             : 
    2603          20 :   return SDValue();
    2604             : }
    2605             : 
                       // Lowers a vector SIGN_EXTEND_INREG (vector type is asserted) by
                       // scalarizing it: every element of the source is extracted, a scalar
                       // SIGN_EXTEND_INREG is applied to it using the scalar type of the node's
                       // VT operand, and the results are reassembled with a build-vector.
    2606          30 : SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
    2607             :                                                      SelectionDAG &DAG) const {
                         // Operand 1 is a VTSDNode carrying the type to sign-extend from.
    2608          30 :   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
    2609          30 :   MVT VT = Op.getSimpleValueType();
    2610             :   MVT ScalarVT = VT.getScalarType();
    2611             : 
    2612             :   assert(VT.isVector());
    2613             : 
    2614          30 :   SDValue Src = Op.getOperand(0);
    2615             :   SDLoc DL(Op);
    2616             : 
    2617             :   // TODO: Don't scalarize on Evergreen?
    2618          30 :   unsigned NElts = VT.getVectorNumElements();
    2619             :   SmallVector<SDValue, 8> Args;
    2620          30 :   DAG.ExtractVectorElements(Src, Args, 0, NElts);
    2621             : 
    2622          30 :   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
                         // Replace each extracted element in place with its sign-extended form.
    2623         214 :   for (unsigned I = 0; I < NElts; ++I)
    2624         276 :     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
    2625             : 
    2626          60 :   return DAG.getBuildVector(VT, DL, Args);
    2627             : }
    2628             : 
    2629             : //===----------------------------------------------------------------------===//
    2630             : // Custom DAG optimizations
    2631             : //===----------------------------------------------------------------------===//
    2632             : 
    2633             : static bool isU24(SDValue Op, SelectionDAG &DAG) {
    2634        8603 :   return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
    2635             : }
    2636             : 
    2637        4266 : static bool isI24(SDValue Op, SelectionDAG &DAG) {
    2638        8532 :   EVT VT = Op.getValueType();
    2639        8532 :   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
    2640             :                                      // as unsigned 24-bit values.
    2641        8532 :     AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
    2642             : }
    2643             : 
    2644        3059 : static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
    2645             :                         TargetLowering::DAGCombinerInfo &DCI) {
    2646             : 
    2647        3059 :   SelectionDAG &DAG = DCI.DAG;
    2648        6118 :   SDValue Op = Node24->getOperand(OpIdx);
    2649        3059 :   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    2650        3059 :   EVT VT = Op.getValueType();
    2651             : 
    2652        3059 :   APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
    2653             :   APInt KnownZero, KnownOne;
    2654             :   TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
    2655        3059 :   if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO))
    2656             :     return true;
    2657             : 
    2658        2887 :   return false;
    2659             : }
    2660             : 
    2661             : template <typename IntTy>
    2662          48 : static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
    2663             :                                uint32_t Width, const SDLoc &DL) {
    2664          48 :   if (Width + Offset < 32) {
    2665          20 :     uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
    2666          20 :     IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
    2667          20 :     return DAG.getConstant(Result, DL, MVT::i32);
    2668             :   }
    2669             : 
    2670          28 :   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
    2671             : }
    2672             : 
    2673       86402 : static bool hasVolatileUser(SDNode *Val) {
    2674      262967 :   for (SDNode *U : Val->uses()) {
    2675       46923 :     if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
    2676       46923 :       if (M->isVolatile())
    2677             :         return true;
    2678             :     }
    2679             :   }
    2680             : 
    2681             :   return false;
    2682             : }
    2683             : 
    2684      125643 : bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
    2685             :   // i32 vectors are the canonical memory type.
    2686      325238 :   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
    2687             :     return false;
    2688             : 
    2689       12397 :   if (!VT.isByteSized())
    2690             :     return false;
    2691             : 
    2692             :   unsigned Size = VT.getStoreSize();
    2693             : 
    2694       20216 :   if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
    2695             :     return false;
    2696             : 
    2697        4977 :   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
    2698             :     return false;
    2699             : 
    2700        4368 :   return true;
    2701             : }
    2702             : 
    2703             : // Replace load of an illegal type with a store of a bitcast to a friendlier
    2704             : // type.
    2705      136368 : SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
    2706             :                                                  DAGCombinerInfo &DCI) const {
    2707      136368 :   if (!DCI.isBeforeLegalize())
    2708       34107 :     return SDValue();
    2709             : 
    2710             :   LoadSDNode *LN = cast<LoadSDNode>(N);
    2711      188663 :   if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
    2712       19638 :     return SDValue();
    2713             : 
    2714             :   SDLoc SL(N);
    2715       82623 :   SelectionDAG &DAG = DCI.DAG;
    2716       82623 :   EVT VT = LN->getMemoryVT();
    2717             : 
    2718             :   unsigned Size = VT.getStoreSize();
    2719       82623 :   unsigned Align = LN->getAlignment();
    2720       82623 :   if (Align < Size && isTypeLegal(VT)) {
    2721             :     bool IsFast;
    2722             :     unsigned AS = LN->getAddressSpace();
    2723             : 
    2724             :     // Expand unaligned loads earlier than legalization. Due to visitation order
    2725             :     // problems during legalization, the emitted instructions to pack and unpack
    2726             :     // the bytes again are not eliminated in the case of an unaligned copy.
    2727        4381 :     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
    2728         315 :       if (VT.isVector())
    2729          56 :         return scalarizeVectorLoad(LN, DAG);
    2730             : 
    2731         259 :       SDValue Ops[2];
    2732         518 :       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
    2733         518 :       return DAG.getMergeValues(Ops, SDLoc(N));
    2734             :     }
    2735             : 
    2736        4066 :     if (!IsFast)
    2737          20 :       return SDValue();
    2738             :   }
    2739             : 
    2740       82288 :   if (!shouldCombineMemoryType(VT))
    2741       80298 :     return SDValue();
    2742             : 
    2743        1990 :   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    2744             : 
    2745             :   SDValue NewLoad
    2746             :     = DAG.getLoad(NewVT, SL, LN->getChain(),
    2747        3980 :                   LN->getBasePtr(), LN->getMemOperand());
    2748             : 
    2749        1990 :   SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
    2750        1990 :   DCI.CombineTo(N, BC, NewLoad.getValue(1));
    2751        1990 :   return SDValue(N, 0);
    2752             : }
    2753             : 
    2754             : // Replace store of an illegal type with a store of a bitcast to a friendlier
    2755             : // type.
    2756       90296 : SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
    2757             :                                                   DAGCombinerInfo &DCI) const {
    2758       90296 :   if (!DCI.isBeforeLegalize())
    2759       34254 :     return SDValue();
    2760             : 
    2761             :   StoreSDNode *SN = cast<StoreSDNode>(N);
    2762       56042 :   if (SN->isVolatile() || !ISD::isNormalStore(SN))
    2763       12379 :     return SDValue();
    2764             : 
    2765       43663 :   EVT VT = SN->getMemoryVT();
    2766             :   unsigned Size = VT.getStoreSize();
    2767             : 
    2768             :   SDLoc SL(N);
    2769       43663 :   SelectionDAG &DAG = DCI.DAG;
    2770       43663 :   unsigned Align = SN->getAlignment();
    2771       43663 :   if (Align < Size && isTypeLegal(VT)) {
    2772             :     bool IsFast;
    2773             :     unsigned AS = SN->getAddressSpace();
    2774             : 
    2775             :     // Expand unaligned stores earlier than legalization. Due to visitation
    2776             :     // order problems during legalization, the emitted instructions to pack and
    2777             :     // unpack the bytes again are not eliminated in the case of an unaligned
    2778             :     // copy.
    2779        4019 :     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
    2780         308 :       if (VT.isVector())
    2781         378 :         return scalarizeVectorStore(SN, DAG);
    2782             : 
    2783         238 :       return expandUnalignedStore(SN, DAG);
    2784             :     }
    2785             : 
    2786        3711 :     if (!IsFast)
    2787           0 :       return SDValue();
    2788             :   }
    2789             : 
    2790       43355 :   if (!shouldCombineMemoryType(VT))
    2791       40977 :     return SDValue();
    2792             : 
    2793        2378 :   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    2794        2378 :   SDValue Val = SN->getValue();
    2795             : 
    2796             :   //DCI.AddToWorklist(Val.getNode());
    2797             : 
    2798             :   bool OtherUses = !Val.hasOneUse();
    2799        2378 :   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
    2800        2378 :   if (OtherUses) {
    2801          31 :     SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
    2802          31 :     DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
    2803             :   }
    2804             : 
    2805             :   return DAG.getStore(SN->getChain(), SL, CastVal,
    2806        4756 :                       SN->getBasePtr(), SN->getMemOperand());
    2807             : }
    2808             : 
    2809         324 : SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
    2810             :                                                   DAGCombinerInfo &DCI) const {
    2811         324 :   ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
    2812             :   if (!CSrc)
    2813         300 :     return SDValue();
    2814             : 
    2815          24 :   const APFloat &F = CSrc->getValueAPF();
    2816          24 :   APFloat Zero = APFloat::getZero(F.getSemantics());
    2817          24 :   APFloat::cmpResult Cmp0 = F.compare(Zero);
    2818          24 :   if (Cmp0 == APFloat::cmpLessThan ||
    2819          12 :       (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
    2820          27 :     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
    2821             :   }
    2822             : 
    2823          15 :   APFloat One(F.getSemantics(), "1.0");
    2824          15 :   APFloat::cmpResult Cmp1 = F.compare(One);
    2825          15 :   if (Cmp1 == APFloat::cmpGreaterThan)
    2826           9 :     return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
    2827             : 
    2828          12 :   return SDValue(CSrc, 0);
    2829             : }
    2830             : 
    2831             : // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
    2832             : // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
    2833             : // issues.
    2834       10364 : SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
    2835             :                                                         DAGCombinerInfo &DCI) const {
    2836       10364 :   SelectionDAG &DAG = DCI.DAG;
    2837       10364 :   SDValue N0 = N->getOperand(0);
    2838             : 
    2839             :   // (vt2 (assertzext (truncate vt0:x), vt1)) ->
    2840             :   //     (vt2 (truncate (assertzext vt0:x, vt1)))
    2841       10364 :   if (N0.getOpcode() == ISD::TRUNCATE) {
    2842          21 :     SDValue N1 = N->getOperand(1);
    2843          21 :     EVT ExtVT = cast<VTSDNode>(N1)->getVT();
    2844             :     SDLoc SL(N);
    2845             : 
    2846          21 :     SDValue Src = N0.getOperand(0);
    2847          21 :     EVT SrcVT = Src.getValueType();
    2848          21 :     if (SrcVT.bitsGE(ExtVT)) {
    2849          42 :       SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
    2850          42 :       return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
    2851             :     }
    2852             :   }
    2853             : 
    2854       10343 :   return SDValue();
    2855             : }
    2856             : /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
    2857             : /// binary operation \p Opc to it with the corresponding constant operands.
    2858        1302 : SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
    2859             :   DAGCombinerInfo &DCI, const SDLoc &SL,
    2860             :   unsigned Opc, SDValue LHS,
    2861             :   uint32_t ValLo, uint32_t ValHi) const {
    2862        1302 :   SelectionDAG &DAG = DCI.DAG;
    2863             :   SDValue Lo, Hi;
    2864        2604 :   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
    2865             : 
    2866        1302 :   SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
    2867        1302 :   SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
    2868             : 
    2869        1302 :   SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
    2870        1302 :   SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
    2871             : 
    2872             :   // Re-visit the ands. It's possible we eliminated one of them and it could
    2873             :   // simplify the vector.
    2874        1302 :   DCI.AddToWorklist(Lo.getNode());
    2875        1302 :   DCI.AddToWorklist(Hi.getNode());
    2876             : 
    2877        2604 :   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
    2878        1302 :   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    2879             : }
    2880             : 
    2881       18146 : SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
    2882             :                                                 DAGCombinerInfo &DCI) const {
    2883       36292 :   EVT VT = N->getValueType(0);
    2884             : 
    2885       18146 :   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
    2886             :   if (!RHS)
    2887        3571 :     return SDValue();
    2888             : 
    2889       14575 :   SDValue LHS = N->getOperand(0);
    2890       29150 :   unsigned RHSVal = RHS->getZExtValue();
    2891       14575 :   if (!RHSVal)
    2892           0 :     return LHS;
    2893             : 
    2894             :   SDLoc SL(N);
    2895       14575 :   SelectionDAG &DAG = DCI.DAG;
    2896             : 
    2897       29150 :   switch (LHS->getOpcode()) {
    2898             :   default:
    2899             :     break;
    2900        4348 :   case ISD::ZERO_EXTEND:
    2901             :   case ISD::SIGN_EXTEND:
    2902             :   case ISD::ANY_EXTEND: {
    2903        4348 :     SDValue X = LHS->getOperand(0);
    2904             : 
    2905         531 :     if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
    2906             :         isTypeLegal(MVT::v2i16)) {
    2907             :       // Prefer build_vector as the canonical form if packed types are legal.
    2908             :       // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
    2909             :       SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
    2910          26 :        { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
    2911          13 :       return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
    2912             :     }
    2913             : 
    2914             :     // shl (ext x) => zext (shl x), if shift does not overflow int
    2915             :     if (VT != MVT::i64)
    2916             :       break;
    2917             :     KnownBits Known;
    2918        3753 :     DAG.computeKnownBits(X, Known);
    2919             :     unsigned LZ = Known.countMinLeadingZeros();
    2920        3753 :     if (LZ < RHSVal)
    2921             :       break;
    2922        2540 :     EVT XVT = X.getValueType();
    2923        2540 :     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
    2924        2540 :     return DAG.getZExtOrTrunc(Shl, SL, VT);
    2925             :   }
    2926             :   }
    2927             : 
    2928             :   if (VT != MVT::i64)
    2929       10040 :     return SDValue();
    2930             : 
    2931             :   // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
    2932             : 
    2933             :   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    2934             :   // common case, splitting this into a move and a 32-bit shift is faster and
    2935             :   // the same code size.
    2936        1982 :   if (RHSVal < 32)
    2937         986 :     return SDValue();
    2938             : 
    2939         996 :   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
    2940             : 
    2941         996 :   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
    2942         996 :   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
    2943             : 
    2944         996 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    2945             : 
    2946        1992 :   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
    2947         996 :   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    2948             : }
    2949             : 
    2950        5588 : SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
    2951             :                                                 DAGCombinerInfo &DCI) const {
    2952        5588 :   if (N->getValueType(0) != MVT::i64)
    2953        5125 :     return SDValue();
    2954             : 
    2955         463 :   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
    2956             :   if (!RHS)
    2957          20 :     return SDValue();
    2958             : 
    2959         443 :   SelectionDAG &DAG = DCI.DAG;
    2960             :   SDLoc SL(N);
    2961         886 :   unsigned RHSVal = RHS->getZExtValue();
    2962             : 
    2963             :   // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
    2964         443 :   if (RHSVal == 32) {
    2965          12 :     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
    2966             :     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
    2967          12 :                                    DAG.getConstant(31, SL, MVT::i32));
    2968             : 
    2969          12 :     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
    2970           6 :     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
    2971             :   }
    2972             : 
    2973             :   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
    2974         437 :   if (RHSVal == 63) {
    2975         252 :     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
    2976             :     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
    2977         252 :                                    DAG.getConstant(31, SL, MVT::i32));
    2978         252 :     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
    2979         126 :     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
    2980             :   }
    2981             : 
    2982         311 :   return SDValue();
    2983             : }
    2984             : 
    2985       54607 : SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
    2986             :                                                 DAGCombinerInfo &DCI) const {
    2987       54607 :   if (N->getValueType(0) != MVT::i64)
    2988       45244 :     return SDValue();
    2989             : 
    2990        9363 :   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
    2991             :   if (!RHS)
    2992         372 :     return SDValue();
    2993             : 
    2994       17982 :   unsigned ShiftAmt = RHS->getZExtValue();
    2995        8991 :   if (ShiftAmt < 32)
    2996         155 :     return SDValue();
    2997             : 
    2998             :   // srl i64:x, C for C >= 32
    2999             :   // =>
    3000             :   //   build_pair (srl hi_32(x), C - 32), 0
    3001             : 
    3002        8836 :   SelectionDAG &DAG = DCI.DAG;
    3003             :   SDLoc SL(N);
    3004             : 
    3005        8836 :   SDValue One = DAG.getConstant(1, SL, MVT::i32);
    3006        8836 :   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    3007             : 
    3008       17672 :   SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
    3009             :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
    3010        8836 :                            VecOp, One);
    3011             : 
    3012        8836 :   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
    3013        8836 :   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
    3014             : 
    3015       17672 :   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
    3016             : 
    3017        8836 :   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
    3018             : }
    3019             : 
    3020             : // We need to specifically handle i64 mul here to avoid unnecessary conversion
    3021             : // instructions. If we only match on the legalized i64 mul expansion,
    3022             : // SimplifyDemandedBits will be unable to remove them because there will be
    3023             : // multiple uses due to the separate mul + mulh[su].
    3024         401 : static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
    3025             :                         SDValue N0, SDValue N1, unsigned Size, bool Signed) {
    3026         401 :   if (Size <= 32) {
    3027         367 :     unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
    3028         367 :     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
    3029             :   }
    3030             : 
    3031             :   // Because we want to eliminate extension instructions before the
    3032             :   // operation, we need to create a single user here (i.e. not the separate
    3033             :   // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
    3034             : 
    3035          34 :   unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
    3036             : 
    3037             :   SDValue Mul = DAG.getNode(MulOpc, SL,
    3038          34 :                             DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
    3039             : 
    3040             :   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
    3041          34 :                      Mul.getValue(0), Mul.getValue(1));
    3042             : }
    3043             : 
    3044        5007 : SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
    3045             :                                                 DAGCombinerInfo &DCI) const {
    3046       10014 :   EVT VT = N->getValueType(0);
    3047             : 
    3048        5007 :   unsigned Size = VT.getSizeInBits();
    3049        5007 :   if (VT.isVector() || Size > 64)
    3050         130 :     return SDValue();
    3051             : 
    3052             :   // There are i16 integer mul/mad.
    3053        9524 :   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
    3054          51 :     return SDValue();
    3055             : 
    3056        4826 :   SelectionDAG &DAG = DCI.DAG;
    3057             :   SDLoc DL(N);
    3058             : 
    3059        4826 :   SDValue N0 = N->getOperand(0);
    3060        4826 :   SDValue N1 = N->getOperand(1);
    3061        4826 :   SDValue Mul;
    3062             : 
    3063       14925 :   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
    3064         372 :     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
    3065         372 :     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
    3066         372 :     Mul = getMul24(DAG, DL, N0, N1, Size, false);
    3067        8533 :   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
    3068          29 :     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
    3069          29 :     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
    3070          29 :     Mul = getMul24(DAG, DL, N0, N1, Size, true);
    3071             :   } else {
    3072        4425 :     return SDValue();
    3073             :   }
    3074             : 
    3075             :   // We need to use sext even for MUL_U24, because MUL_U24 is used
    3076             :   // for signed multiply of 8 and 16-bit types.
    3077         401 :   return DAG.getSExtOrTrunc(Mul, DL, VT);
    3078             : }
    3079             : 
    3080         100 : SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
    3081             :                                                   DAGCombinerInfo &DCI) const {
    3082         200 :   EVT VT = N->getValueType(0);
    3083             : 
    3084         172 :   if (!Subtarget->hasMulI24() || VT.isVector())
    3085          28 :     return SDValue();
    3086             : 
    3087          72 :   SelectionDAG &DAG = DCI.DAG;
    3088             :   SDLoc DL(N);
    3089             : 
    3090          72 :   SDValue N0 = N->getOperand(0);
    3091          72 :   SDValue N1 = N->getOperand(1);
    3092             : 
    3093          72 :   if (!isI24(N0, DAG) || !isI24(N1, DAG))
    3094          72 :     return SDValue();
    3095             : 
    3096           0 :   N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
    3097           0 :   N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
    3098             : 
    3099           0 :   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
    3100           0 :   DCI.AddToWorklist(Mulhi.getNode());
    3101           0 :   return DAG.getSExtOrTrunc(Mulhi, DL, VT);
    3102             : }
    3103             : 
    3104        3330 : SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
    3105             :                                                   DAGCombinerInfo &DCI) const {
    3106        6660 :   EVT VT = N->getValueType(0);
    3107             : 
    3108        9990 :   if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
    3109           0 :     return SDValue();
    3110             : 
    3111        3330 :   SelectionDAG &DAG = DCI.DAG;
    3112             :   SDLoc DL(N);
    3113             : 
    3114        3330 :   SDValue N0 = N->getOperand(0);
    3115        3330 :   SDValue N1 = N->getOperand(1);
    3116             : 
    3117        3330 :   if (!isU24(N0, DAG) || !isU24(N1, DAG))
    3118        3330 :     return SDValue();
    3119             : 
    3120           0 :   N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
    3121           0 :   N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
    3122             : 
    3123           0 :   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
    3124           0 :   DCI.AddToWorklist(Mulhi.getNode());
    3125           0 :   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
    3126             : }
    3127             : 
    3128         122 : SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
    3129             :   SDNode *N, DAGCombinerInfo &DCI) const {
    3130         122 :   SelectionDAG &DAG = DCI.DAG;
    3131             : 
    3132             :   // Simplify demanded bits before splitting into multiple users.
    3133         122 :   if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
    3134          88 :     return SDValue();
    3135             : 
    3136          34 :   SDValue N0 = N->getOperand(0);
    3137          34 :   SDValue N1 = N->getOperand(1);
    3138             : 
    3139          34 :   bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
    3140             : 
    3141          34 :   unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
    3142          34 :   unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
    3143             : 
    3144             :   SDLoc SL(N);
    3145             : 
    3146          34 :   SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
    3147          34 :   SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
    3148          68 :   return DAG.getMergeValues({ MulLo, MulHi }, SL);
    3149             : }
    3150             : 
    3151          25 : static bool isNegativeOne(SDValue Val) {
    3152             :   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
    3153          25 :     return C->isAllOnesValue();
    3154             :   return false;
    3155             : }
    3156             : 
    3157          21 : SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
    3158             :                                           SDValue Op,
    3159             :                                           const SDLoc &DL,
    3160             :                                           unsigned Opc) const {
    3161          21 :   EVT VT = Op.getValueType();
    3162          21 :   EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
    3163           3 :   if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
    3164             :                               LegalVT != MVT::i16))
    3165           0 :     return SDValue();
    3166             : 
    3167             :   if (VT != MVT::i32)
    3168          11 :     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
    3169             : 
    3170          21 :   SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
    3171             :   if (VT != MVT::i32)
    3172          11 :     FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
    3173             : 
    3174          21 :   return FFBX;
    3175             : }
    3176             : 
    3177             : // The native instructions return -1 on 0 input. Optimize out a select that
    3178             : // produces -1 on 0.
    3179             : //
    3180             : // TODO: If zero is not undef, we could also do this if the output is compared
    3181             : // against the bitwidth.
    3182             : //
    3183             : // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
    3184        8008 : SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
    3185             :                                                  SDValue LHS, SDValue RHS,
    3186             :                                                  DAGCombinerInfo &DCI) const {
    3187             :   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    3188       11204 :   if (!CmpRhs || !CmpRhs->isNullValue())
    3189        3880 :     return SDValue();
    3190             : 
    3191        4128 :   SelectionDAG &DAG = DCI.DAG;
    3192        4128 :   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    3193        4128 :   SDValue CmpLHS = Cond.getOperand(0);
    3194             : 
    3195        4128 :   unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
    3196             :                                            AMDGPUISD::FFBH_U32;
    3197             : 
    3198             :   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
    3199             :   // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
    3200        1896 :   if (CCOpcode == ISD::SETEQ &&
    3201        1894 :       (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
    3202        4130 :       RHS.getOperand(0) == CmpLHS &&
    3203           2 :       isNegativeOne(LHS)) {
    3204           2 :     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
    3205             :   }
    3206             : 
    3207             :   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
    3208             :   // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
    3209         857 :   if (CCOpcode == ISD::SETNE &&
    3210         834 :       (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
    3211        4149 :       LHS.getOperand(0) == CmpLHS &&
    3212          23 :       isNegativeOne(RHS)) {
    3213          19 :     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
    3214             :   }
    3215             : 
    3216        4107 :   return SDValue();
    3217             : }
    3218             : 
    3219          24 : static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
    3220             :                                          unsigned Op,
    3221             :                                          const SDLoc &SL,
    3222             :                                          SDValue Cond,
    3223             :                                          SDValue N1,
    3224             :                                          SDValue N2) {
    3225          24 :   SelectionDAG &DAG = DCI.DAG;
    3226          48 :   EVT VT = N1.getValueType();
    3227             : 
    3228             :   SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
    3229          48 :                                   N1.getOperand(0), N2.getOperand(0));
    3230          24 :   DCI.AddToWorklist(NewSelect.getNode());
    3231          24 :   return DAG.getNode(Op, SL, VT, NewSelect);
    3232             : }
    3233             : 
    3234             : // Pull a free FP operation out of a select so it may fold into uses.
    3235             : //
    3236             : // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
    3237             : // select c, (fneg x), k -> fneg (select c, x, (fneg k))
    3238             : //
    3239             : // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
    3240             : // select c, (fabs x), +k -> fabs (select c, x, k)
    3241        9547 : static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
    3242             :                                     SDValue N) {
    3243        9547 :   SelectionDAG &DAG = DCI.DAG;
    3244        9547 :   SDValue Cond = N.getOperand(0);
    3245        9547 :   SDValue LHS = N.getOperand(1);
    3246        9547 :   SDValue RHS = N.getOperand(2);
    3247             : 
    3248        9547 :   EVT VT = N.getValueType();
    3249        9595 :   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
    3250         104 :       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
    3251             :     return distributeOpThroughSelect(DCI, LHS.getOpcode(),
    3252          48 :                                      SDLoc(N), Cond, LHS, RHS);
    3253             :   }
    3254             : 
    3255             :   bool Inv = false;
    3256       19020 :   if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
    3257             :     std::swap(LHS, RHS);
    3258             :     Inv = true;
    3259             :   }
    3260             : 
    3261             :   // TODO: Support vector constants.
    3262             :   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    3263        9523 :   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
    3264             :     SDLoc SL(N);
    3265             :     // If one side is an fneg/fabs and the other is a constant, we can push the
    3266             :     // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
    3267          78 :     SDValue NewLHS = LHS.getOperand(0);
    3268          78 :     SDValue NewRHS = RHS;
    3269             : 
    3270             :     // Careful: if the neg can be folded up, don't try to pull it back down.
    3271             :     bool ShouldFoldNeg = true;
    3272             : 
    3273          78 :     if (NewLHS.hasOneUse()) {
    3274             :       unsigned Opc = NewLHS.getOpcode();
    3275          68 :       if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
    3276             :         ShouldFoldNeg = false;
    3277          68 :       if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
    3278             :         ShouldFoldNeg = false;
    3279             :     }
    3280             : 
    3281          68 :     if (ShouldFoldNeg) {
    3282          68 :       if (LHS.getOpcode() == ISD::FNEG)
    3283          34 :         NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    3284          68 :       else if (CRHS->isNegative())
    3285          26 :         return SDValue();
    3286             : 
    3287          42 :       if (Inv)
    3288             :         std::swap(NewLHS, NewRHS);
    3289             : 
    3290             :       SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
    3291          42 :                                       Cond, NewLHS, NewRHS);
    3292          42 :       DCI.AddToWorklist(NewSelect.getNode());
    3293          42 :       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
    3294             :     }
    3295             :   }
    3296             : 
    3297        9455 :   return SDValue();
    3298             : }
    3299             : 
    3300             : 
    3301        9547 : SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
    3302             :                                                    DAGCombinerInfo &DCI) const {
    3303        9547 :   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
    3304          66 :     return Folded;
    3305             : 
    3306        9481 :   SDValue Cond = N->getOperand(0);
    3307        9481 :   if (Cond.getOpcode() != ISD::SETCC)
    3308         858 :     return SDValue();
    3309             : 
    3310       17246 :   EVT VT = N->getValueType(0);
    3311        8623 :   SDValue LHS = Cond.getOperand(0);
    3312        8623 :   SDValue RHS = Cond.getOperand(1);
    3313        8623 :   SDValue CC = Cond.getOperand(2);
    3314             : 
    3315        8623 :   SDValue True = N->getOperand(1);
    3316        8623 :   SDValue False = N->getOperand(2);
    3317             : 
    3318        8623 :   if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
    3319        3996 :     SelectionDAG &DAG = DCI.DAG;
    3320        6493 :     if ((DAG.isConstantValueOfAnyType(True) ||
    3321        5495 :          DAG.isConstantValueOfAnyType(True)) &&
    3322        1752 :         (!DAG.isConstantValueOfAnyType(False) &&
    3323         253 :          !DAG.isConstantValueOfAnyType(False))) {
    3324             :       // Swap cmp + select pair to move constant to false input.
    3325             :       // This will allow using VOPC cndmasks more often.
    3326             :       // select (setcc x, y), k, x -> select (setcc y, x) x, x
    3327             : 
    3328             :       SDLoc SL(N);
    3329         253 :       ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
    3330         506 :                                             LHS.getValueType().isInteger());
    3331             : 
    3332         253 :       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
    3333         253 :       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
    3334             :     }
    3335             : 
    3336        1248 :     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
    3337             :       SDValue MinMax
    3338         724 :         = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
    3339             :       // Revisit this node so we can catch min3/max3/med3 patterns.
    3340             :       //DCI.AddToWorklist(MinMax.getNode());
    3341         362 :       return MinMax;
    3342             :     }
    3343             :   }
    3344             : 
    3345             :   // There's no reason to not do this if the condition has other uses.
    3346       16016 :   return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
    3347             : }
    3348             : 
    3349          63 : static bool isConstantFPZero(SDValue N) {
    3350          63 :   if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
    3351         113 :     return C->isZero() && !C->isNegative();
    3352             :   return false;
    3353             : }
    3354             : 
    3355             : static unsigned inverseMinMax(unsigned Opc) {
    3356          46 :   switch (Opc) {
    3357             :   case ISD::FMAXNUM:
    3358             :     return ISD::FMINNUM;
    3359          21 :   case ISD::FMINNUM:
    3360             :     return ISD::FMAXNUM;
    3361           2 :   case AMDGPUISD::FMAX_LEGACY:
    3362             :     return AMDGPUISD::FMIN_LEGACY;
    3363           3 :   case AMDGPUISD::FMIN_LEGACY:
    3364             :     return  AMDGPUISD::FMAX_LEGACY;
    3365           0 :   default:
    3366           0 :     llvm_unreachable("invalid min/max opcode");
    3367             :   }
    3368             : }
    3369             : 
    3370        2993 : SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
    3371             :                                                  DAGCombinerInfo &DCI) const {
    3372        2993 :   SelectionDAG &DAG = DCI.DAG;
    3373        2993 :   SDValue N0 = N->getOperand(0);
    3374        5986 :   EVT VT = N->getValueType(0);
    3375             : 
    3376             :   unsigned Opc = N0.getOpcode();
    3377             : 
    3378             :   // If the input has multiple uses and we can either fold the negate down, or
    3379             :   // the other uses cannot, give up. This both prevents unprofitable
    3380             :   // transformations and infinite loops: we won't repeatedly try to fold around
    3381             :   // a negate that has no 'good' form.
    3382        2993 :   if (N0.hasOneUse()) {
    3383             :     // This may be able to fold into the source, but at a code size cost. Don't
    3384             :     // fold if the fold into the user is free.
    3385        2277 :     if (allUsesHaveSourceMods(N, 0))
    3386         792 :       return SDValue();
    3387             :   } else {
    3388         921 :     if (fnegFoldsIntoOp(Opc) &&
    3389         248 :         (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
    3390         184 :       return SDValue();
    3391             :   }
    3392             : 
    3393             :   SDLoc SL(N);
    3394        2017 :   switch (Opc) {
    3395         100 :   case ISD::FADD: {
    3396          81 :     if (!mayIgnoreSignedZero(N0))
    3397          86 :       return SDValue();
    3398             : 
    3399             :     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
    3400          14 :     SDValue LHS = N0.getOperand(0);
    3401          14 :     SDValue RHS = N0.getOperand(1);
    3402             : 
    3403          14 :     if (LHS.getOpcode() != ISD::FNEG)
    3404          10 :       LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    3405             :     else
    3406           4 :       LHS = LHS.getOperand(0);
    3407             : 
    3408          14 :     if (RHS.getOpcode() != ISD::FNEG)
    3409          12 :       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    3410             :     else
    3411           2 :       RHS = RHS.getOperand(0);
    3412             : 
    3413          14 :     SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
    3414          14 :     if (!N0.hasOneUse())
    3415           2 :       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    3416          14 :     return Res;
    3417             :   }
    3418          47 :   case ISD::FMUL:
    3419             :   case AMDGPUISD::FMUL_LEGACY: {
    3420             :     // (fneg (fmul x, y)) -> (fmul x, (fneg y))
    3421             :     // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
    3422          47 :     SDValue LHS = N0.getOperand(0);
    3423          47 :     SDValue RHS = N0.getOperand(1);
    3424             : 
    3425          47 :     if (LHS.getOpcode() == ISD::FNEG)
    3426           8 :       LHS = LHS.getOperand(0);
    3427          39 :     else if (RHS.getOpcode() == ISD::FNEG)
    3428           2 :       RHS = RHS.getOperand(0);
    3429             :     else
    3430          37 :       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    3431             : 
    3432          47 :     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
    3433          47 :     if (!N0.hasOneUse())
    3434           5 :       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    3435          47 :     return Res;
    3436             :   }
    3437          47 :   case ISD::FMA:
    3438             :   case ISD::FMAD: {
    3439           0 :     if (!mayIgnoreSignedZero(N0))
    3440          35 :       return SDValue();
    3441             : 
    3442             :     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
    3443          12 :     SDValue LHS = N0.getOperand(0);
    3444          12 :     SDValue MHS = N0.getOperand(1);
    3445          12 :     SDValue RHS = N0.getOperand(2);
    3446             : 
    3447          12 :     if (LHS.getOpcode() == ISD::FNEG)
    3448           5 :       LHS = LHS.getOperand(0);
    3449           7 :     else if (MHS.getOpcode() == ISD::FNEG)
    3450           1 :       MHS = MHS.getOperand(0);
    3451             :     else
    3452           6 :       MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
    3453             : 
    3454          12 :     if (RHS.getOpcode() != ISD::FNEG)
    3455          10 :       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    3456             :     else
    3457           2 :       RHS = RHS.getOperand(0);
    3458             : 
    3459          12 :     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
    3460          12 :     if (!N0.hasOneUse())
    3461           1 :       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    3462          12 :     return Res;
    3463             :   }
    3464          63 :   case ISD::FMAXNUM:
    3465             :   case ISD::FMINNUM:
    3466             :   case AMDGPUISD::FMAX_LEGACY:
    3467             :   case AMDGPUISD::FMIN_LEGACY: {
    3468             :     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
    3469             :     // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
    3470             :     // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
    3471             :     // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
    3472             : 
    3473          63 :     SDValue LHS = N0.getOperand(0);
    3474          63 :     SDValue RHS = N0.getOperand(1);
    3475             : 
    3476             :     // 0 doesn't have a negated inline immediate.
    3477             :     // TODO: Shouldn't fold 1/2pi either, and should be generalized to other
    3478             :     // operations.
    3479          63 :     if (isConstantFPZero(RHS))
    3480          17 :       return SDValue();
    3481             : 
    3482          46 :     SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    3483          46 :     SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    3484             :     unsigned Opposite = inverseMinMax(Opc);
    3485             : 
    3486          46 :     SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
    3487          46 :     if (!N0.hasOneUse())
    3488           4 :       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    3489          46 :     return Res;
    3490             :   }
    3491          42 :   case ISD::FP_EXTEND:
    3492             :   case ISD::FTRUNC:
    3493             :   case ISD::FRINT:
    3494             :   case ISD::FNEARBYINT: // XXX - Should fround be handled?
    3495             :   case ISD::FSIN:
    3496             :   case AMDGPUISD::RCP:
    3497             :   case AMDGPUISD::RCP_LEGACY:
    3498             :   case AMDGPUISD::SIN_HW: {
    3499          42 :     SDValue CvtSrc = N0.getOperand(0);
    3500          42 :     if (CvtSrc.getOpcode() == ISD::FNEG) {
    3501             :       // (fneg (fp_extend (fneg x))) -> (fp_extend x)
    3502             :       // (fneg (rcp (fneg x))) -> (rcp x)
    3503           6 :       return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
    3504             :     }
    3505             : 
    3506          36 :     if (!N0.hasOneUse())
    3507          14 :       return SDValue();
    3508             : 
    3509             :     // (fneg (fp_extend x)) -> (fp_extend (fneg x))
    3510             :     // (fneg (rcp x)) -> (rcp (fneg x))
    3511          22 :     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    3512          22 :     return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
    3513             :   }
    3514           9 :   case ISD::FP_ROUND: {
    3515           9 :     SDValue CvtSrc = N0.getOperand(0);
    3516             : 
    3517           9 :     if (CvtSrc.getOpcode() == ISD::FNEG) {
    3518             :       // (fneg (fp_round (fneg x))) -> (fp_round x)
    3519             :       return DAG.getNode(ISD::FP_ROUND, SL, VT,
    3520           0 :                          CvtSrc.getOperand(0), N0.getOperand(1));
    3521             :     }
    3522             : 
    3523           9 :     if (!N0.hasOneUse())
    3524           4 :       return SDValue();
    3525             : 
    3526             :     // (fneg (fp_round x)) -> (fp_round (fneg x))
    3527           5 :     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    3528           5 :     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
    3529             :   }
    3530             :   case ISD::FP16_TO_FP: {
    3531             :     // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
    3532             :     // f16, but legalization of f16 fneg ends up pulling it out of the source.
    3533             :     // Put the fneg back as a legal source operation that can be matched later.
    3534             :     SDLoc SL(N);
    3535             : 
    3536          39 :     SDValue Src = N0.getOperand(0);
    3537          39 :     EVT SrcVT = Src.getValueType();
    3538             : 
    3539             :     // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
    3540             :     SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
    3541          39 :                                   DAG.getConstant(0x8000, SL, SrcVT));
    3542          78 :     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
    3543             :   }
    3544        1670 :   default:
    3545        1670 :     return SDValue();
    3546             :   }
    3547             : }
    3548             : 
    3549        1789 : SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
    3550             :                                                  DAGCombinerInfo &DCI) const {
    3551        1789 :   SelectionDAG &DAG = DCI.DAG;
    3552        1789 :   SDValue N0 = N->getOperand(0);
    3553             : 
    3554        1789 :   if (!N0.hasOneUse())
    3555         262 :     return SDValue();
    3556             : 
    3557        1527 :   switch (N0.getOpcode()) {
    3558             :   case ISD::FP16_TO_FP: {
    3559             :     assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
    3560             :     SDLoc SL(N);
    3561          34 :     SDValue Src = N0.getOperand(0);
    3562          34 :     EVT SrcVT = Src.getValueType();
    3563             : 
    3564             :     // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
    3565             :     SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
    3566          34 :                                   DAG.getConstant(0x7fff, SL, SrcVT));
    3567          68 :     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
    3568             :   }
    3569        1493 :   default:
    3570        1493 :     return SDValue();
    3571             :   }
    3572             : }
    3573             : 
    3574      609940 : SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
    3575             :                                                 DAGCombinerInfo &DCI) const {
    3576      609940 :   SelectionDAG &DAG = DCI.DAG;
    3577             :   SDLoc DL(N);
    3578             : 
    3579     1219880 :   switch(N->getOpcode()) {
    3580             :   default:
    3581             :     break;
    3582       87519 :   case ISD::BITCAST: {
    3583      175038 :     EVT DestVT = N->getValueType(0);
    3584             : 
    3585             :     // Push casts through vector builds. This helps avoid emitting a large
    3586             :     // number of copies when materializing floating point vector constants.
    3587             :     //
    3588             :     // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
    3589             :     //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
    3590       87519 :     if (DestVT.isVector()) {
    3591       24291 :       SDValue Src = N->getOperand(0);
    3592       24291 :       if (Src.getOpcode() == ISD::BUILD_VECTOR) {
    3593        2729 :         EVT SrcVT = Src.getValueType();
    3594        2729 :         unsigned NElts = DestVT.getVectorNumElements();
    3595             : 
    3596        2729 :         if (SrcVT.getVectorNumElements() == NElts) {
    3597        1773 :           EVT DestEltVT = DestVT.getVectorElementType();
    3598             : 
    3599             :           SmallVector<SDValue, 8> CastedElts;
    3600             :           SDLoc SL(N);
    3601       11429 :           for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
    3602        9656 :             SDValue Elt = Src.getOperand(I);
    3603        9656 :             CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
    3604             :           }
    3605             : 
    3606        1773 :           return DAG.getBuildVector(DestVT, SL, CastedElts);
    3607             :         }
    3608             :       }
    3609             :     }
    3610             : 
    3611      123948 :     if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
    3612             :       break;
    3613             : 
    3614             :     // Fold bitcasts of constants.
    3615             :     //
    3616             :     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
    3617             :     // TODO: Generalize and move to DAGCombiner
    3618       56744 :     SDValue Src = N->getOperand(0);
    3619             :     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
    3620             :       assert(Src.getValueType() == MVT::i64);
    3621             :       SDLoc SL(N);
    3622         332 :       uint64_t CVal = C->getZExtValue();
    3623             :       return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
    3624             :                          DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
    3625         664 :                          DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
    3626             :     }
    3627             : 
    3628             :     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
    3629          32 :       const APInt &Val = C->getValueAPF().bitcastToAPInt();
    3630             :       SDLoc SL(N);
    3631             :       uint64_t CVal = Val.getZExtValue();
    3632             :       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
    3633             :                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
    3634          48 :                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
    3635             : 
    3636          16 :       return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
    3637             :     }
    3638             : 
    3639             :     break;
    3640             :   }
    3641       30693 :   case ISD::SHL: {
    3642       30693 :     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    3643             :       break;
    3644             : 
    3645       18146 :     return performShlCombine(N, DCI);
    3646             :   }
    3647       83471 :   case ISD::SRL: {
    3648       83471 :     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    3649             :       break;
    3650             : 
    3651       54607 :     return performSrlCombine(N, DCI);
    3652             :   }
    3653       12192 :   case ISD::SRA: {
    3654       12192 :     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    3655             :       break;
    3656             : 
    3657        5588 :     return performSraCombine(N, DCI);
    3658             :   }
    3659        5007 :   case ISD::MUL:
    3660        5007 :     return performMulCombine(N, DCI);
    3661         100 :   case ISD::MULHS:
    3662         100 :     return performMulhsCombine(N, DCI);
    3663        3330 :   case ISD::MULHU:
    3664        3330 :     return performMulhuCombine(N, DCI);
    3665        1453 :   case AMDGPUISD::MUL_I24:
    3666             :   case AMDGPUISD::MUL_U24:
    3667             :   case AMDGPUISD::MULHI_I24:
    3668             :   case AMDGPUISD::MULHI_U24: {
    3669             :     // If the first call to simplify is successful, then N may end up being
    3670             :     // deleted, so we shouldn't call simplifyI24 again.
    3671        1453 :     simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
    3672        1453 :     return SDValue();
    3673             :   }
    3674         122 :   case AMDGPUISD::MUL_LOHI_I24:
    3675             :   case AMDGPUISD::MUL_LOHI_U24:
    3676         122 :     return performMulLoHi24Combine(N, DCI);
    3677        9547 :   case ISD::SELECT:
    3678        9547 :     return performSelectCombine(N, DCI);
    3679        2993 :   case ISD::FNEG:
    3680        2993 :     return performFNegCombine(N, DCI);
    3681        1789 :   case ISD::FABS:
    3682        1789 :     return performFAbsCombine(N, DCI);
    3683         416 :   case AMDGPUISD::BFE_I32:
    3684             :   case AMDGPUISD::BFE_U32: {
    3685             :     assert(!N->getValueType(0).isVector() &&
    3686             :            "Vector handling of BFE not implemented");
    3687         416 :     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    3688             :     if (!Width)
    3689             :       break;
    3690             : 
    3691         808 :     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    3692         404 :     if (WidthVal == 0)
    3693          22 :       return DAG.getConstant(0, DL, MVT::i32);
    3694             : 
    3695             :     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    3696             :     if (!Offset)
    3697             :       break;
    3698             : 
    3699         378 :     SDValue BitsFrom = N->getOperand(0);
    3700         756 :     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
    3701             : 
    3702             :     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
    3703             : 
    3704         378 :     if (OffsetVal == 0) {
    3705             :       // This is already sign / zero extended, so try to fold away extra BFEs.
    3706          58 :       unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
    3707             : 
    3708          58 :       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
    3709          58 :       if (OpSignBits >= SignBits)
    3710          34 :         return BitsFrom;
    3711             : 
    3712          24 :       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
    3713          24 :       if (Signed) {
    3714             :         // This is a sign_extend_inreg. Replace it to take advantage of existing
    3715             :         // DAG Combines. If not eliminated, we will match back to BFE during
    3716             :         // selection.
    3717             : 
    3718             :         // TODO: The sext_inreg of extended types ends, although we can could
    3719             :         // handle them in a single BFE.
    3720             :         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
    3721          36 :                            DAG.getValueType(SmallVT));
    3722             :       }
    3723             : 
    3724           6 :       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
    3725             :     }
    3726             : 
    3727             :     if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
    3728          48 :       if (Signed) {
    3729             :         return constantFoldBFE<int32_t>(DAG,
    3730          24 :                                         CVal->getSExtValue(),
    3731             :                                         OffsetVal,
    3732             :                                         WidthVal,
    3733          24 :                                         DL);
    3734             :       }
    3735             : 
    3736             :       return constantFoldBFE<uint32_t>(DAG,
    3737          24 :                                        CVal->getZExtValue(),
    3738             :                                        OffsetVal,
    3739             :                                        WidthVal,
    3740          24 :                                        DL);
    3741             :     }
    3742             : 
    3743         356 :     if ((OffsetVal + WidthVal) >= 32 &&
    3744         151 :         !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
    3745          82 :       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
    3746             :       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
    3747          82 :                          BitsFrom, ShiftVal);
    3748             :     }
    3749             : 
    3750         190 :     if (BitsFrom.hasOneUse()) {
    3751             :       APInt Demanded = APInt::getBitsSet(32,
    3752             :                                          OffsetVal,
    3753             :                                          OffsetVal + WidthVal);
    3754             : 
    3755          48 :       KnownBits Known;
    3756          48 :       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
    3757          96 :                                             !DCI.isBeforeLegalizeOps());
    3758          48 :       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    3759          88 :       if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
    3760          40 :           TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
    3761          14 :         DCI.CommitTargetLoweringOpt(TLO);
    3762             :       }
    3763             :     }
    3764             : 
    3765             :     break;
    3766             :   }
    3767      136368 :   case ISD::LOAD:
    3768      136368 :     return performLoadCombine(N, DCI);
    3769       90296 :   case ISD::STORE:
    3770       90296 :     return performStoreCombine(N, DCI);
    3771         324 :   case AMDGPUISD::CLAMP:
    3772         324 :     return performClampCombine(N, DCI);
    3773         454 :   case AMDGPUISD::RCP: {
    3774         454 :     if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
    3775             :       // XXX - Should this flush denormals?
    3776          17 :       const APFloat &Val = CFP->getValueAPF();
    3777          17 :       APFloat One(Val.getSemantics(), "1.0");
    3778          68 :       return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
    3779             :     }
    3780             : 
    3781             :     break;
    3782             :   }
    3783       10364 :   case ISD::AssertZext:
    3784             :   case ISD::AssertSext:
    3785       10364 :     return performAssertSZExtCombine(N, DCI);
    3786             :   }
    3787      267558 :   return SDValue();
    3788             : }
    3789             : 
    3790             : //===----------------------------------------------------------------------===//
    3791             : // Helper functions
    3792             : //===----------------------------------------------------------------------===//
    3793             : 
    3794        4746 : SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
    3795             :                                                    const TargetRegisterClass *RC,
    3796             :                                                    unsigned Reg, EVT VT,
    3797             :                                                    const SDLoc &SL,
    3798             :                                                    bool RawReg) const {
    3799        4746 :   MachineFunction &MF = DAG.getMachineFunction();
    3800        4746 :   MachineRegisterInfo &MRI = MF.getRegInfo();
    3801             :   unsigned VReg;
    3802             : 
    3803        4746 :   if (!MRI.isLiveIn(Reg)) {
    3804        1399 :     VReg = MRI.createVirtualRegister(RC);
    3805             :     MRI.addLiveIn(Reg, VReg);
    3806             :   } else {
    3807        3347 :     VReg = MRI.getLiveInVirtReg(Reg);
    3808             :   }
    3809             : 
    3810        4746 :   if (RawReg)
    3811         236 :     return DAG.getRegister(VReg, VT);
    3812             : 
    3813        4510 :   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
    3814             : }
    3815             : 
    3816           8 : SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
    3817             :                                                   EVT VT,
    3818             :                                                   const SDLoc &SL,
    3819             :                                                   int64_t Offset) const {
    3820           8 :   MachineFunction &MF = DAG.getMachineFunction();
    3821           8 :   MachineFrameInfo &MFI = MF.getFrameInfo();
    3822             : 
    3823           8 :   int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
    3824           8 :   auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
    3825           8 :   SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
    3826             : 
    3827             :   return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
    3828             :                      MachineMemOperand::MODereferenceable |
    3829           8 :                      MachineMemOperand::MOInvariant);
    3830             : }
    3831             : 
    3832          10 : SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
    3833             :                                                    const SDLoc &SL,
    3834             :                                                    SDValue Chain,
    3835             :                                                    SDValue StackPtr,
    3836             :                                                    SDValue ArgVal,
    3837             :                                                    int64_t Offset) const {
    3838          10 :   MachineFunction &MF = DAG.getMachineFunction();
    3839          10 :   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
    3840             : 
    3841          10 :   SDValue Ptr = DAG.getObjectPtrOffset(SL, StackPtr, Offset);
    3842             :   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
    3843          10 :                                MachineMemOperand::MODereferenceable);
    3844          10 :   return Store;
    3845             : }
    3846             : 
    3847        3132 : SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
    3848             :                                              const TargetRegisterClass *RC,
    3849             :                                              EVT VT, const SDLoc &SL,
    3850             :                                              const ArgDescriptor &Arg) const {
    3851             :   assert(Arg && "Attempting to load missing argument");
    3852             : 
    3853        3132 :   if (Arg.isRegister())
    3854        3124 :     return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
    3855           8 :   return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
    3856             : }
    3857             : 
    3858          36 : uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
    3859             :     const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
    3860          36 :   unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
    3861          36 :   uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
    3862          36 :   switch (Param) {
    3863          36 :   case GRID_DIM:
    3864          36 :     return ArgOffset;
    3865           0 :   case GRID_OFFSET:
    3866           0 :     return ArgOffset + 4;
    3867             :   }
    3868           0 :   llvm_unreachable("unexpected implicit parameter type");
    3869             : }
    3870             : 
    3871             : #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
    3872             : 
    3873           0 : const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
    3874           0 :   switch ((AMDGPUISD::NodeType)Opcode) {
    3875             :   case AMDGPUISD::FIRST_NUMBER: break;
    3876             :   // AMDIL DAG nodes
    3877             :   NODE_NAME_CASE(UMUL);
    3878           0 :   NODE_NAME_CASE(BRANCH_COND);
    3879             : 
    3880             :   // AMDGPU DAG nodes
    3881           0 :   NODE_NAME_CASE(IF)
    3882           0 :   NODE_NAME_CASE(ELSE)
    3883           0 :   NODE_NAME_CASE(LOOP)
    3884           0 :   NODE_NAME_CASE(CALL)
    3885           0 :   NODE_NAME_CASE(TC_RETURN)
    3886           0 :   NODE_NAME_CASE(TRAP)
    3887           0 :   NODE_NAME_CASE(RET_FLAG)
    3888           0 :   NODE_NAME_CASE(RETURN_TO_EPILOG)
    3889           0 :   NODE_NAME_CASE(ENDPGM)
    3890           0 :   NODE_NAME_CASE(DWORDADDR)
    3891           0 :   NODE_NAME_CASE(FRACT)
    3892           0 :   NODE_NAME_CASE(SETCC)
    3893           0 :   NODE_NAME_CASE(SETREG)
    3894           0 :   NODE_NAME_CASE(FMA_W_CHAIN)
    3895           0 :   NODE_NAME_CASE(FMUL_W_CHAIN)
    3896           0 :   NODE_NAME_CASE(CLAMP)
    3897           0 :   NODE_NAME_CASE(COS_HW)
    3898           0 :   NODE_NAME_CASE(SIN_HW)
    3899           0 :   NODE_NAME_CASE(FMAX_LEGACY)
    3900           0 :   NODE_NAME_CASE(FMIN_LEGACY)
    3901           0 :   NODE_NAME_CASE(FMAX3)
    3902           0 :   NODE_NAME_CASE(SMAX3)
    3903           0 :   NODE_NAME_CASE(UMAX3)
    3904           0 :   NODE_NAME_CASE(FMIN3)
    3905           0 :   NODE_NAME_CASE(SMIN3)
    3906           0 :   NODE_NAME_CASE(UMIN3)
    3907           0 :   NODE_NAME_CASE(FMED3)
    3908           0 :   NODE_NAME_CASE(SMED3)
    3909           0 :   NODE_NAME_CASE(UMED3)
    3910           0 :   NODE_NAME_CASE(URECIP)
    3911           0 :   NODE_NAME_CASE(DIV_SCALE)
    3912           0 :   NODE_NAME_CASE(DIV_FMAS)
    3913           0 :   NODE_NAME_CASE(DIV_FIXUP)
    3914           0 :   NODE_NAME_CASE(FMAD_FTZ)
    3915           0 :   NODE_NAME_CASE(TRIG_PREOP)
    3916           0 :   NODE_NAME_CASE(RCP)
    3917           0 :   NODE_NAME_CASE(RSQ)
    3918           0 :   NODE_NAME_CASE(RCP_LEGACY)
    3919           0 :   NODE_NAME_CASE(RSQ_LEGACY)
    3920           0 :   NODE_NAME_CASE(FMUL_LEGACY)
    3921           0 :   NODE_NAME_CASE(RSQ_CLAMP)
    3922           0 :   NODE_NAME_CASE(LDEXP)
    3923           0 :   NODE_NAME_CASE(FP_CLASS)
    3924           0 :   NODE_NAME_CASE(DOT4)
    3925           0 :   NODE_NAME_CASE(CARRY)
    3926           0 :   NODE_NAME_CASE(BORROW)
    3927           0 :   NODE_NAME_CASE(BFE_U32)
    3928           0 :   NODE_NAME_CASE(BFE_I32)
    3929           0 :   NODE_NAME_CASE(BFI)
    3930           0 :   NODE_NAME_CASE(BFM)
    3931           0 :   NODE_NAME_CASE(FFBH_U32)
    3932           0 :   NODE_NAME_CASE(FFBH_I32)
    3933           0 :   NODE_NAME_CASE(FFBL_B32)
    3934           0 :   NODE_NAME_CASE(MUL_U24)
    3935           0 :   NODE_NAME_CASE(MUL_I24)
    3936           0 :   NODE_NAME_CASE(MULHI_U24)
    3937           0 :   NODE_NAME_CASE(MULHI_I24)
    3938           0 :   NODE_NAME_CASE(MUL_LOHI_U24)
    3939           0 :   NODE_NAME_CASE(MUL_LOHI_I24)
    3940           0 :   NODE_NAME_CASE(MAD_U24)
    3941           0 :   NODE_NAME_CASE(MAD_I24)
    3942           0 :   NODE_NAME_CASE(MAD_I64_I32)
    3943           0 :   NODE_NAME_CASE(MAD_U64_U32)
    3944           0 :   NODE_NAME_CASE(TEXTURE_FETCH)
    3945           0 :   NODE_NAME_CASE(EXPORT)
    3946           0 :   NODE_NAME_CASE(EXPORT_DONE)
    3947           0 :   NODE_NAME_CASE(R600_EXPORT)
    3948           0 :   NODE_NAME_CASE(CONST_ADDRESS)
    3949           0 :   NODE_NAME_CASE(REGISTER_LOAD)
    3950           0 :   NODE_NAME_CASE(REGISTER_STORE)
    3951           0 :   NODE_NAME_CASE(SAMPLE)
    3952           0 :   NODE_NAME_CASE(SAMPLEB)
    3953           0 :   NODE_NAME_CASE(SAMPLED)
    3954           0 :   NODE_NAME_CASE(SAMPLEL)
    3955           0 :   NODE_NAME_CASE(CVT_F32_UBYTE0)
    3956           0 :   NODE_NAME_CASE(CVT_F32_UBYTE1)
    3957           0 :   NODE_NAME_CASE(CVT_F32_UBYTE2)
    3958           0 :   NODE_NAME_CASE(CVT_F32_UBYTE3)
    3959           0 :   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
    3960           0 :   NODE_NAME_CASE(CVT_PKNORM_I16_F32)
    3961           0 :   NODE_NAME_CASE(CVT_PKNORM_U16_F32)
    3962           0 :   NODE_NAME_CASE(CVT_PK_I16_I32)
    3963           0 :   NODE_NAME_CASE(CVT_PK_U16_U32)
    3964           0 :   NODE_NAME_CASE(FP_TO_FP16)
    3965           0 :   NODE_NAME_CASE(FP16_ZEXT)
    3966           0 :   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
    3967           0 :   NODE_NAME_CASE(CONST_DATA_PTR)
    3968           0 :   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
    3969           0 :   NODE_NAME_CASE(KILL)
    3970           0 :   NODE_NAME_CASE(DUMMY_CHAIN)
    3971             :   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
    3972           0 :   NODE_NAME_CASE(INIT_EXEC)
    3973           0 :   NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
    3974           0 :   NODE_NAME_CASE(SENDMSG)
    3975           0 :   NODE_NAME_CASE(SENDMSGHALT)
    3976           0 :   NODE_NAME_CASE(INTERP_MOV)
    3977           0 :   NODE_NAME_CASE(INTERP_P1)
    3978           0 :   NODE_NAME_CASE(INTERP_P2)
    3979           0 :   NODE_NAME_CASE(STORE_MSKOR)
    3980           0 :   NODE_NAME_CASE(LOAD_CONSTANT)
    3981           0 :   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
    3982           0 :   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
    3983           0 :   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
    3984           0 :   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
    3985           0 :   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
    3986           0 :   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
    3987           0 :   NODE_NAME_CASE(ATOMIC_INC)
    3988           0 :   NODE_NAME_CASE(ATOMIC_DEC)
    3989           0 :   NODE_NAME_CASE(ATOMIC_LOAD_FADD)
    3990           0 :   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
    3991           0 :   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
    3992           0 :   NODE_NAME_CASE(BUFFER_LOAD)
    3993           0 :   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
    3994           0 :   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
    3995           0 :   NODE_NAME_CASE(BUFFER_STORE)
    3996           0 :   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
    3997           0 :   NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
    3998           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
    3999           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
    4000           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
    4001           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
    4002           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
    4003           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
    4004           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
    4005           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_AND)
    4006           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_OR)
    4007           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
    4008           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
    4009           0 :   NODE_NAME_CASE(IMAGE_LOAD)
    4010           0 :   NODE_NAME_CASE(IMAGE_LOAD_MIP)
    4011           0 :   NODE_NAME_CASE(IMAGE_STORE)
    4012           0 :   NODE_NAME_CASE(IMAGE_STORE_MIP)
    4013             :   // Basic sample.
    4014           0 :   NODE_NAME_CASE(IMAGE_SAMPLE)
    4015           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_CL)
    4016           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_D)
    4017           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_D_CL)
    4018           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_L)
    4019           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_B)
    4020           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_B_CL)
    4021           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_LZ)
    4022           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_CD)
    4023           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_CD_CL)
    4024             :   // Sample with comparison.
    4025           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C)
    4026           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_CL)
    4027           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_D)
    4028           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_D_CL)
    4029           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_L)
    4030           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_B)
    4031           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_B_CL)
    4032           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_LZ)
    4033           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_CD)
    4034           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_CD_CL)
    4035             :   // Sample with offsets.
    4036           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_O)
    4037           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_CL_O)
    4038           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_D_O)
    4039           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_D_CL_O)
    4040           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_L_O)
    4041           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_B_O)
    4042           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_B_CL_O)
    4043           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_LZ_O)
    4044           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_CD_O)
    4045           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_CD_CL_O)
    4046             :   // Sample with comparison and offsets.
    4047           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_O)
    4048           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_CL_O)
    4049           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_D_O)
    4050           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_D_CL_O)
    4051           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_L_O)
    4052           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_B_O)
    4053           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_B_CL_O)
    4054           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_LZ_O)
    4055           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_CD_O)
    4056           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_CD_CL_O)
    4057             :   // Basic gather4.
    4058           0 :   NODE_NAME_CASE(IMAGE_GATHER4)
    4059           0 :   NODE_NAME_CASE(IMAGE_GATHER4_CL)
    4060           0 :   NODE_NAME_CASE(IMAGE_GATHER4_L)
    4061           0 :   NODE_NAME_CASE(IMAGE_GATHER4_B)
    4062           0 :   NODE_NAME_CASE(IMAGE_GATHER4_B_CL)
    4063           0 :   NODE_NAME_CASE(IMAGE_GATHER4_LZ)
    4064             :   // Gather4 with comparison.
    4065           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C)
    4066           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_CL)
    4067           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_L)
    4068           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_B)
    4069           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_B_CL)
    4070           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_LZ)
    4071             :   // Gather4 with offsets.
    4072           0 :   NODE_NAME_CASE(IMAGE_GATHER4_O)
    4073           0 :   NODE_NAME_CASE(IMAGE_GATHER4_CL_O)
    4074           0 :   NODE_NAME_CASE(IMAGE_GATHER4_L_O)
    4075           0 :   NODE_NAME_CASE(IMAGE_GATHER4_B_O)
    4076           0 :   NODE_NAME_CASE(IMAGE_GATHER4_B_CL_O)
    4077           0 :   NODE_NAME_CASE(IMAGE_GATHER4_LZ_O)
    4078             :   // Gather4 with comparison and offsets.
    4079           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_O)
    4080           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_CL_O)
    4081           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_L_O)
    4082           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_B_O)
    4083           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_B_CL_O)
    4084           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_LZ_O)
    4085             : 
    4086             :   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
    4087             :   }
    4088           0 :   return nullptr;
    4089             : }
    4090             : 
    4091           8 : SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
    4092             :                                               SelectionDAG &DAG, int Enabled,
    4093             :                                               int &RefinementSteps,
    4094             :                                               bool &UseOneConstNR,
    4095             :                                               bool Reciprocal) const {
    4096           8 :   EVT VT = Operand.getValueType();
    4097             : 
    4098             :   if (VT == MVT::f32) {
    4099           5 :     RefinementSteps = 0;
    4100          10 :     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
    4101             :   }
    4102             : 
    4103             :   // TODO: There is also f64 rsq instruction, but the documentation is less
    4104             :   // clear on its precision.
    4105             : 
    4106           3 :   return SDValue();
    4107             : }
    4108             : 
    4109          12 : SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
    4110             :                                                SelectionDAG &DAG, int Enabled,
    4111             :                                                int &RefinementSteps) const {
    4112          12 :   EVT VT = Operand.getValueType();
    4113             : 
    4114             :   if (VT == MVT::f32) {
    4115             :     // Reciprocal, < 1 ulp error.
    4116             :     //
    4117             :     // This reciprocal approximation converges to < 0.5 ulp error with one
    4118             :     // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
    4119             : 
    4120           7 :     RefinementSteps = 0;
    4121          14 :     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
    4122             :   }
    4123             : 
    4124             :   // TODO: There is also f64 rcp instruction, but the documentation is less
    4125             :   // clear on its precision.
    4126             : 
    4127           5 :   return SDValue();
    4128             : }
    4129             : 
    4130      130842 : void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
    4131             :     const SDValue Op, KnownBits &Known,
    4132             :     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
    4133             : 
    4134             :   Known.resetAll(); // Don't know anything.
    4135             : 
    4136             :   unsigned Opc = Op.getOpcode();
    4137             : 
    4138      130842 :   switch (Opc) {
    4139             :   default:
    4140             :     break;
    4141        8428 :   case AMDGPUISD::CARRY:
    4142             :   case AMDGPUISD::BORROW: {
    4143       16856 :     Known.Zero = APInt::getHighBitsSet(32, 31);
    4144        8428 :     break;
    4145             :   }
    4146             : 
    4147         527 :   case AMDGPUISD::BFE_I32:
    4148             :   case AMDGPUISD::BFE_U32: {
    4149             :     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    4150             :     if (!CWidth)
    4151             :       return;
    4152             : 
    4153        1054 :     uint32_t Width = CWidth->getZExtValue() & 0x1f;
    4154             : 
    4155         527 :     if (Opc == AMDGPUISD::BFE_U32)
    4156         990 :       Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
    4157             : 
    4158             :     break;
    4159             :   }
    4160             :   case AMDGPUISD::FP_TO_FP16:
    4161             :   case AMDGPUISD::FP16_ZEXT: {
    4162             :     unsigned BitWidth = Known.getBitWidth();
    4163             : 
    4164             :     // High bits are zero.
    4165        2788 :     Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
    4166        1394 :     break;
    4167             :   }
    4168             :   case AMDGPUISD::MUL_U24:
    4169             :   case AMDGPUISD::MUL_I24: {
    4170        8825 :     KnownBits LHSKnown, RHSKnown;
    4171       17650 :     DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
    4172        8825 :     DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
    4173             : 
    4174        8825 :     unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
    4175        8825 :                       RHSKnown.countMinTrailingZeros();
    4176       26475 :     Known.Zero.setLowBits(std::min(TrailZ, 32u));
    4177             : 
    4178       17650 :     unsigned LHSValBits = 32 - std::max(LHSKnown.countMinSignBits(), 8u);
    4179       17650 :     unsigned RHSValBits = 32 - std::max(RHSKnown.countMinSignBits(), 8u);
    4180       17650 :     unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
    4181        8825 :     if (MaxValBits >= 32)
    4182             :       break;
    4183             :     bool Negative = false;
    4184        5019 :     if (Opc == AMDGPUISD::MUL_I24) {
    4185          10 :       bool LHSNegative = !!(LHSKnown.One  & (1 << 23));
    4186          10 :       bool LHSPositive = !!(LHSKnown.Zero & (1 << 23));
    4187          10 :       bool RHSNegative = !!(RHSKnown.One  & (1 << 23));
    4188          10 :       bool RHSPositive = !!(RHSKnown.Zero & (1 << 23));
    4189           5 :       if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive))
    4190             :         break;
    4191           5 :       Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative);
    4192             :     }
    4193             :     if (Negative)
    4194           0 :       Known.One.setHighBits(32 - MaxValBits);
    4195             :     else
    4196        5019 :       Known.Zero.setHighBits(32 - MaxValBits);
    4197             :     break;
    4198             :   }
    4199       68956 :   case ISD::INTRINSIC_WO_CHAIN: {
    4200      137912 :     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    4201       68956 :     switch (IID) {
    4202        5554 :     case Intrinsic::amdgcn_mbcnt_lo:
    4203             :     case Intrinsic::amdgcn_mbcnt_hi: {
    4204             :       // These return at most the wavefront size - 1.
    4205       11108 :       unsigned Size = Op.getValueType().getSizeInBits();
    4206       11108 :       Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2());
    4207             :       break;
    4208             :     }
    4209             :     default:
    4210             :       break;
    4211             :     }
    4212             :   }
    4213             :   }
    4214             : }
    4215             : 
    4216        1657 : unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
    4217             :     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    4218             :     unsigned Depth) const {
    4219        1657 :   switch (Op.getOpcode()) {
    4220           2 :   case AMDGPUISD::BFE_I32: {
    4221             :     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    4222             :     if (!Width)
    4223             :       return 1;
    4224             : 
    4225           4 :     unsigned SignBits = 32 - Width->getZExtValue() + 1;
    4226           2 :     if (!isNullConstant(Op.getOperand(1)))
    4227             :       return SignBits;
    4228             : 
    4229             :     // TODO: Could probably figure something out with non-0 offsets.
    4230           0 :     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    4231           0 :     return std::max(SignBits, Op0SignBits);
    4232             :   }
    4233             : 
    4234           0 :   case AMDGPUISD::BFE_U32: {
    4235             :     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    4236           0 :     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
    4237             :   }
    4238             : 
    4239             :   case AMDGPUISD::CARRY:
    4240             :   case AMDGPUISD::BORROW:
    4241             :     return 31;
    4242           1 :   case AMDGPUISD::FP_TO_FP16:
    4243             :   case AMDGPUISD::FP16_ZEXT:
    4244           1 :     return 16;
    4245         956 :   default:
    4246         956 :     return 1;
    4247             :   }
    4248             : }

Generated by: LCOV version 1.13